In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Input files: training rows carrying a `kfold` column, the competition test
# set, and the submission template.
TRAIN_FOLDS_CSV = "../input/tpssep21-folds/train_folds.csv"
TEST_CSV = "../input/tabular-playground-series-sep-2021/test.csv"
SAMPLE_SOLUTION_CSV = "../input/tabular-playground-series-sep-2021/sample_solution.csv"

df = pd.read_csv(TRAIN_FOLDS_CSV)
df_test = pd.read_csv(TEST_CSV)
sample_solution = pd.read_csv(SAMPLE_SOLUTION_CSV)


In [None]:
# Show how many training rows fall into each fold.
df["kfold"].value_counts()

In [None]:
# Every column except the row id, the target (`claim`) and the fold label is
# used as a model input.
useful_features = [c for c in df.columns if c not in ('id', 'claim', 'kfold')]
X_test = df_test[useful_features]

final_predictions = []  # per fold: predicted P(claim == 1) for every test row
scores = []             # per fold: ROC AUC on the held-out validation rows
for fold in range(5):
    # Rows labelled with the current fold are held out; the rest are trained on.
    X_train = df[df.kfold != fold].reset_index(drop=True)
    X_valid = df[df.kfold == fold].reset_index(drop=True)

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]

    # Impute missing values (present in both the training and the test data).
    # The imputer is fitted on the training rows only and then applied to the
    # validation and test rows, so no statistics leak from held-out data.
    # The imputer returns a bare array, so the column names are restored when
    # wrapping the result back into a DataFrame.
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                                   columns=X_train.columns)
    imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                                   columns=X_valid.columns)
    imputed_X_test = pd.DataFrame(my_imputer.transform(X_test),
                                  columns=X_test.columns)

    # A different seed per fold; training and prediction run on GPU 0.
    model_xgboost = XGBClassifier(random_state=fold,
                                  tree_method='gpu_hist',
                                  gpu_id=0, predictor='gpu_predictor',
                                  use_label_encoder=False, eval_metric='error')
    model_xgboost.fit(imputed_X_train, y_train)

    # ROC AUC is a ranking metric, so it must be fed continuous scores.
    # `predict` returns thresholded class labels, which collapses the ROC curve
    # to a single operating point; use the positive-class probability instead.
    # NOTE(review): column 1 is taken as P(claim == 1), which assumes `claim`
    # is a binary 0/1 target - confirm against the data.
    predictions_valid = model_xgboost.predict_proba(imputed_X_valid)[:, 1]
    test_predictions = model_xgboost.predict_proba(imputed_X_test)[:, 1]
    final_predictions.append(test_predictions)

    score = roc_auc_score(y_valid, predictions_valid)
    scores.append(score)
    print("Score: ", fold, score)

print("Final Scores")
print(np.mean(scores), np.std(scores))

In [None]:
# Each entry of `final_predictions` is one fold's prediction vector for the
# test set. Stack them as columns (rows = test samples, columns = folds) and
# average across folds to get the ensemble prediction.
preds = np.column_stack(final_predictions).mean(axis=1)

# Bracket assignment always writes the DataFrame column. Attribute-style
# assignment (`sample_solution.claim = ...`) only does so when the column
# already exists; otherwise it sets a plain Python attribute and the
# predictions never reach the CSV.
sample_solution["claim"] = preds

In [None]:
# Write the submission file without the DataFrame index column.
sample_solution.to_csv(path_or_buf="sample_submission.csv", index=False)