In [None]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Data Preprocessing

In [None]:
# Training rows; the first CSV column is used as the DataFrame index.
# Later cells read the `claim` and `kfold` columns from this frame.
train = pd.read_csv(
    '../input/tps-sep-train-kfold/train_10_folds.csv',
    index_col=0,
)

# Test rows, indexed the same way (first CSV column becomes the index).
test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col=0,
)

# Submission template; its `claim` column is overwritten with predictions at the end.
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)

In [None]:
# Add a per-row count of missing cells as an extra column on both frames.
# The count is taken over every column present at this point.
for frame in (train, test):
    frame['n_missing'] = frame.isna().sum(axis=1)

In [None]:
# Show (rows, columns) for the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

# Model inputs: every training column except the target and the fold label.
excluded_columns = ("claim", "kfold")
useful_features = [name for name in train.columns if name not in excluded_columns]

In [None]:
# Replace missing feature values with the column mean.
# The means are learned once from all training rows (every fold) and the
# same fitted imputer is then applied to both the training and test frames.
my_imputer = SimpleImputer(strategy='mean')
my_imputer.fit(train[useful_features])

train[useful_features] = my_imputer.transform(train[useful_features])
test[useful_features] = my_imputer.transform(test[useful_features])

In [None]:
# Preview of the split for a single fold: rows labelled `fold` are held out,
# all remaining rows are used for fitting.
fold = 0
held_out = train.kfold == fold
xtrain = train[~held_out].reset_index(drop=True)
xvalid = train[held_out].reset_index(drop=True)

# After the reset these are simply the positions 0..len(xvalid)-1.
valid_ids = xvalid.index.values.tolist()
len(valid_ids)

# Model Learning

In [None]:
# 10-fold cross-validated XGBoost training.
#
# For each fold the rows whose `kfold` equals the fold number are held out,
# a StandardScaler is fitted on the remaining rows, and a model is trained
# with early stopping monitored on the held-out rows. Collected per fold:
#   * final_predictions        - list of test-set prediction arrays (one per fold)
#   * final_valid_predictions  - {train index label: out-of-fold prediction}
#   * scores                   - validation ROC AUC per fold

# Hyper-parameters are identical for every fold (only the seed changes),
# so the dict is built once instead of on every iteration.
parameter = {
    "verbosity": 0,
    "objective": "binary:logistic",
    "tree_method": 'gpu_hist',
    "booster": 'gbtree',
    'learning_rate': 0.053412516326389936,
    'max_depth': 3,
    'gamma': 0.21936641952157981,
    'subsample': 0.9978683971251602,
    'colsample_bytree': 0.8718594096500578,
    'n_estimators': 3000,
    'reg_alpha': 0.01631769681569393,
    'min_child_weight': 7
}

final_predictions = []
final_valid_predictions = {}
scores = []

for fold in range(10):
    valid_mask = train.kfold == fold
    train_part = train[~valid_mask]
    valid_part = train[valid_mask]

    # Take the ids from the ORIGINAL index, before any reset. Reading them
    # after reset_index(drop=True) gives 0..n-1 in every fold, so each fold
    # would overwrite the previous fold's entries in final_valid_predictions.
    # NOTE(review): assumes the first CSV column (loaded as the index) is the
    # row id - confirm against the fold file.
    valid_ids = valid_part.index.values.tolist()

    ytrain = train_part.claim.reset_index(drop=True)
    yvalid = valid_part.claim.reset_index(drop=True)

    # Fit the scaler on the training part only, then apply it everywhere.
    # New frames are built from the scaled arrays rather than assigning into
    # a slice of `train`, which avoids pandas' SettingWithCopyWarning.
    my_scaler = StandardScaler()
    xtrain = pd.DataFrame(
        my_scaler.fit_transform(train_part[useful_features]),
        columns=useful_features,
    )
    xvalid = pd.DataFrame(
        my_scaler.transform(valid_part[useful_features]),
        columns=useful_features,
    )
    # Restrict the test frame to exactly the columns the model was trained on.
    xtest = pd.DataFrame(
        my_scaler.transform(test[useful_features]),
        columns=useful_features,
        index=test.index,
    )

    # With the "binary:logistic" objective the regressor's predict() output
    # is used directly as the score passed to roc_auc_score below.
    # NOTE(review): eval_metric / early_stopping_rounds are passed to fit()
    # as before; newer XGBoost releases expect them on the constructor -
    # confirm against the installed version.
    model = XGBRegressor(**parameter, random_state=fold)
    model.fit(xtrain, ytrain, verbose=False,
              eval_set=[(xtrain, ytrain), (xvalid, yvalid)],
              eval_metric="auc", early_stopping_rounds=200)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    temp = roc_auc_score(yvalid, preds_valid)
    scores.append(temp)
    print(fold, temp)

In [None]:
# Mean and standard deviation of the per-fold validation AUC values.
mean_auc = np.mean(scores)
std_auc = np.std(scores)
print(mean_auc, std_auc)

In [None]:
# Turn the {id: prediction} mapping into a two-column table, write it to
# disk, and show the first rows.
oof_table = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
oof_table = oof_table.reset_index()
oof_table.columns = ["id", "pred_xgb"]

final_valid_predictions = oof_table
final_valid_predictions.to_csv("train_pred_xgb.csv", index=False)
final_valid_predictions.head()

# Output

In [None]:
# One column per fold; the final test prediction is the row-wise mean across folds.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)

In [None]:
# Overwrite the template's `claim` column with the averaged predictions,
# write the submission file, and show the first rows.
sample_submission["claim"] = preds
sample_submission.to_csv("test_pred_xgb.csv", index=False)
sample_submission.head()