In [None]:
# Import Libraries for reading data and computation
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

#Import Libraries for train test split
from sklearn.model_selection import train_test_split

#Import Library to handle missing values
from sklearn.impute import SimpleImputer

# Import XGBoost module
from xgboost import XGBClassifier

# Confusion matrix to evaluate performance
from sklearn.metrics import confusion_matrix, accuracy_score

# AUC score to evaluate performance
from sklearn.metrics import roc_auc_score

# Feature scaling
from sklearn import preprocessing

In [None]:
# Load the training frame (it carries the `kfold` column consumed by the
# cross-validation loop) and the competition test frame, both indexed by `id`.
X_full = pd.read_csv(
    '../input/tps-5skfolds/train_5skfolds.csv',
    index_col='id',
)
X_test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)

In [None]:
# Feature engineering.
# ref: https://www.kaggle.com/akihironomura/tps-xgboost-kfold
# The raw feature columns are the ones whose name starts with the letter "f".
features = [
    column_name
    for column_name in X_full.columns.values
    if column_name[0] == "f"
]

In [None]:
def _add_row_statistics(frame, columns):
    """Append per-row summary statistics of ``columns`` to ``frame`` in place.

    All statistics are computed from a snapshot of ``frame[columns]`` taken
    before any new column is written, so the new columns never feed into
    each other.

    Columns are appended in this order:
    ``n_missing, abs_sum, sem, std, avg, max, min``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; it is mutated in place.
    columns : list of str
        Names of the raw feature columns to summarise row-wise.
    """
    raw = frame[columns]
    frame['n_missing'] = raw.isna().sum(axis=1)
    frame['abs_sum'] = raw.abs().sum(axis=1)
    frame['sem'] = raw.sem(axis=1)  # unbiased standard error of the mean over the row
    frame['std'] = raw.std(axis=1)
    frame['avg'] = raw.mean(axis=1)
    frame['max'] = raw.max(axis=1)
    frame['min'] = raw.min(axis=1)


# Run the exact same transformation on train and test so the engineered
# features always agree between the two frames (in particular, 'max' must be
# the row-wise maximum in both).
_add_row_statistics(X_full, features)
_add_row_statistics(X_test, features)

In [None]:
# New features: "n_missing","abs_sum","sem","std","avg","max","min"

In [None]:
# Model inputs: every column of the training frame except the identifier,
# the target and the fold label.
useful_features = [
    column_name
    for column_name in X_full.columns
    if column_name not in ("id", "claim", "kfold")
]

In [None]:
# Keep only the model input columns, in the same order as the training features.
X_test = X_test.loc[:, useful_features]

In [None]:
# Training and validation process for each of the folds.
# Ref: https://www.kaggle.com/abhishek/competition-part-2-feature-engineering
#
# For every fold: fit the imputer, scaler and classifier on the remaining
# folds, score the held-out fold with ROC AUC, and predict the test set.
final_predictions = []  # one array of test-set probabilities per fold
auc_scores = []         # validation AUC per fold
for fold in range(5):
    # Rows labelled with the current fold are held out for validation.
    X_train = X_full[X_full.kfold != fold].reset_index(drop=True)
    X_valid = X_full[X_full.kfold == fold].reset_index(drop=True)
    test = X_test.copy()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]

    # Median imputation, fitted on the training rows only and then applied
    # unchanged to the validation rows and the test set.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X_train_im = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid_im = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    final_X_test = pd.DataFrame(imputer.transform(test), columns=test.columns)

    # Standard scaling, again fitted on the training rows only.
    scaler = preprocessing.StandardScaler()
    X_train_sc = scaler.fit_transform(X_train_im)
    X_valid_sc = scaler.transform(X_valid_im)

    # Scale the *imputed* test frame so the test set goes through exactly the
    # same preprocessing as train/validation (scaling the raw frame would let
    # NaNs through to the model).
    final_X_test_sc = scaler.transform(final_X_test)

    # Training with XGBoost.
    # NOTE(review): `silent` appears to have been superseded by `verbosity`
    # in newer XGBoost releases - confirm against the installed version.
    classifier = XGBClassifier(random_state  = 1, silent = False, scale_pos_weight = 1, learning_rate=0.06,
                           colsample_bytree = 0.7,subsample = 0.8, objective = 'binary:logistic',
                           eval_metric = 'error',n_estimators= 1000, reg_alpha = 3.2, reg_lambda = 0.15, 
                           max_depth=6, gamma=1,tree_method = 'gpu_hist' )
    classifier.fit(X_train_sc, y_train)

    # Score the held-out fold using the predicted probability of the positive class.
    y_pred_prob = classifier.predict_proba(X_valid_sc)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_prob)
    print("fold = %g , auc = %g" %(fold,auc_score))

    # Keep this fold's test-set probabilities for averaging later.
    preds_test_proba = classifier.predict_proba(final_X_test_sc)[:,1]
    final_predictions.append(preds_test_proba)
    auc_scores.append(auc_score)

print("Mean auc score: %g, Standard deviation: %g" %(np.mean(auc_scores),np.std(auc_scores)))
    

In [None]:
# Average the per-fold test probabilities: stack them as columns
# (one column per fold) and take the mean across each row.
preds = np.column_stack(final_predictions).mean(axis=1)

In [None]:
# Fill the sample submission with the averaged predictions and write it out.
sample_submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
# Bracket assignment always writes a real column; attribute-style assignment
# would only set a plain Python attribute if `claim` were not already a column,
# and the CSV would then be written without the predictions.
sample_submission["claim"] = preds
sample_submission.to_csv("submission.csv", index=False)