In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [2]:
# Load the training table; the 'id' column becomes the row index rather than
# a regular column.
train_df = pd.read_csv("train.csv", index_col="id")

In [3]:
# Number of cross-validation folds used by the cells below.
fold_numbers = 10

# Model inputs: every column except the target ('claim') and the fold marker
# ('fold'). Excluding 'fold' explicitly keeps this cell safe to re-run after
# the marker column has been added.
feature_cols = [
    col
    for col in train_df.columns
    if col != 'claim' and col != 'fold'
]

# Placeholder fold id for every row; the fold-assignment step overwrites it.
train_df['fold'] = -1

In [4]:
# Assign every row of train_df to exactly one of `fold_numbers` validation
# folds (shuffled, fixed seed so the split is reproducible).
KF = KFold(n_splits = fold_numbers, shuffle = True, random_state = 42)

# KFold.split yields *positional* row numbers, while train_df is indexed by the
# 'id' column. Writing them through label-based `.loc` is only correct when the
# ids happen to be exactly 0..n-1 in order, so the fold ids are collected in a
# positional array and assigned as a whole column instead.
fold_ids = np.full(len(train_df), -1, dtype = np.int64)
for fold, (train_indices, valid_indices) in enumerate(KF.split(X = train_df)):
    fold_ids[valid_indices] = fold
train_df['fold'] = fold_ids

In [5]:
# Experiment: raw features, missing values replaced by the per-column mean,
# scored with a depth-2 XGBoost classifier on each cross-validation fold.
test_preds = []  # only filled by the disabled test-set inference below
for fold in range(fold_numbers):
    # Rows whose fold id equals `fold` are held out; the rest are trained on.
    in_valid = train_df.fold == fold
    xtrain = train_df.loc[~in_valid]
    xval = train_df.loc[in_valid]

    # Disabled: copy of the test set for inference.
    # xtest = test_df.copy()

    # Targets are taken before the frames are narrowed to the feature columns.
    ytrain, yval = xtrain.claim, xval.claim

    # The imputer learns its column means from the training part only and then
    # applies those same means to the validation part.
    my_imputer = SimpleImputer(strategy = 'mean')
    xtrain = pd.DataFrame(my_imputer.fit_transform(xtrain[feature_cols]),
                          columns = feature_cols)
    xval = pd.DataFrame(my_imputer.transform(xval[feature_cols]),
                        columns = feature_cols)

    # Disabled: impute the test set with the same fitted imputer.
    # xtest = pd.DataFrame(my_imputer.transform(xtest))
    # xtest.columns = feature_cols

    # The fold number doubles as the random seed, so each fold's model is
    # seeded differently but reproducibly.
    xgb_params = dict(
        n_estimators = 10000,
        max_depth = 2,
        objective = 'binary:logistic',
        random_state = fold,
        use_label_encoder = False,
        gpu_id = 0,
        tree_method = 'gpu_hist',
        predictor = 'gpu_predictor',
    )
    model = XGBClassifier(**xgb_params)

    # Validation AUC is monitored during training (logged every 1000 rounds)
    # and used for early stopping with a patience of 300 rounds.
    model.fit(
        xtrain,
        ytrain,
        eval_set = [(xval, yval)],
        eval_metric = 'auc',
        early_stopping_rounds = 300,
        verbose = 1000,
    )

    # Column 1 of predict_proba is the probability of the positive class.
    predval = model.predict_proba(xval)[:, 1]
    print(fold, roc_auc_score(yval, predval))

    # Disabled: collect this fold's test-set predictions.
    # test_pred = model.predict_proba(xtest)[:,1]
    # test_preds.append(test_pred)

[0]	validation_0-auc:0.51654
[1000]	validation_0-auc:0.79452
[1738]	validation_0-auc:0.79507
0 0.7951951144929449
[0]	validation_0-auc:0.51494
[1000]	validation_0-auc:0.79460
[1819]	validation_0-auc:0.79537
1 0.7955525455523259
[0]	validation_0-auc:0.51849
[1000]	validation_0-auc:0.79821
[1939]	validation_0-auc:0.79864
2 0.7988539992394094
[0]	validation_0-auc:0.51551
[1000]	validation_0-auc:0.79455
[2000]	validation_0-auc:0.79506
[2184]	validation_0-auc:0.79501
3 0.7951199890549824
[0]	validation_0-auc:0.51821
[1000]	validation_0-auc:0.79476
[1659]	validation_0-auc:0.79516
4 0.7952593680728184
[0]	validation_0-auc:0.51474
[1000]	validation_0-auc:0.79537
[1724]	validation_0-auc:0.79595
5 0.796085845584372
[0]	validation_0-auc:0.51600
[1000]	validation_0-auc:0.79667
[1977]	validation_0-auc:0.79705
6 0.7973454898775243
[0]	validation_0-auc:0.51689
[1000]	validation_0-auc:0.79691
[1529]	validation_0-auc:0.79743
7 0.7974522230557721
[0]	validation_0-auc:0.51834
[1000]	validation_0-auc:0.79

In [7]:
# Experiment: raw features, missing values replaced by a constant, scored with
# a depth-4 XGBoost regressor trained on the logistic objective, once per
# cross-validation fold.
test_preds = []  # only filled by the disabled test-set inference below
for fold in range(fold_numbers):
    # Rows whose fold id equals `fold` are held out; the rest are trained on.
    xtrain = train_df.loc[train_df.fold != fold]
    xval = train_df.loc[train_df.fold == fold]

    # Disabled: copy of the test set for inference.
    # xtest = test_df.copy()

    # Targets are taken before the frames are narrowed to the feature columns.
    ytrain = xtrain.claim
    yval = xval.claim

    xtrain = xtrain[feature_cols]
    xval = xval[feature_cols]

    # No fill_value is given, so scikit-learn's default constant is used
    # (0 for numeric columns).
    my_imputer = SimpleImputer(strategy= 'constant')

    xtrain = pd.DataFrame(my_imputer.fit_transform(xtrain))
    xtrain.columns = feature_cols
    xval = pd.DataFrame(my_imputer.transform(xval))
    xval.columns = feature_cols

    # Disabled: impute the test set with the same fitted imputer.
    # xtest = pd.DataFrame(my_imputer.transform(xtest))
    # xtest.columns = feature_cols

    # `use_label_encoder` is an XGBClassifier-only argument; on XGBRegressor it
    # would be forwarded as an unknown booster parameter, so it is not passed.
    # The fold number doubles as the random seed.
    model = XGBRegressor(n_estimators = 10000,
                         max_depth = 4,
                         objective='binary:logistic',
                         random_state = fold, gpu_id = 0,
                         tree_method = 'gpu_hist',
                         predictor = 'gpu_predictor')

    # Validation AUC is monitored during training (logged every 1000 rounds)
    # and used for early stopping with a patience of 300 rounds.
    model.fit(xtrain, ytrain, early_stopping_rounds= 300, eval_set=[(xval, yval)],
              eval_metric= 'auc', verbose = 1000)

    # XGBRegressor has no predict_proba; with the 'binary:logistic' objective
    # predict() is expected to return the positive-class probability directly.
    predval = model.predict(xval)
    print(fold, roc_auc_score(yval, predval))

    # Disabled: collect this fold's test-set predictions (predict, not
    # predict_proba, because the model is a regressor).
    # test_pred = model.predict(xtest)
    # test_preds.append(test_pred)

[0]	validation_0-auc:0.52909
[616]	validation_0-auc:0.79427
0 0.7950738552329855
[0]	validation_0-auc:0.52924
[626]	validation_0-auc:0.79459
1 0.7960060552223516
[0]	validation_0-auc:0.53053
[644]	validation_0-auc:0.79687
2 0.7985259042471426
[0]	validation_0-auc:0.53135
[656]	validation_0-auc:0.79422
3 0.7960751342789762
[0]	validation_0-auc:0.53087
[644]	validation_0-auc:0.79370
4 0.795567504384604
[0]	validation_0-auc:0.53008
[579]	validation_0-auc:0.79454
5 0.796391575676837
[0]	validation_0-auc:0.53189
[617]	validation_0-auc:0.79554
6 0.7974307248579687
[0]	validation_0-auc:0.53160
[652]	validation_0-auc:0.79497
7 0.7969820746089176
[0]	validation_0-auc:0.53131
[629]	validation_0-auc:0.79449
8 0.7962364272734344
[0]	validation_0-auc:0.52933
[646]	validation_0-auc:0.79643
9 0.7977064506334085
