# Importing Dependencies

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn.metrics as metrics
import sklearn.impute as impute
import sklearn.model_selection as ms
from scipy.stats import mode

# Loading Data

In [None]:
# Load the competition's training and test tables.
train_csv = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
test_csv = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
# The submission template lives in sample_solution.csv; this previously
# re-read test.csv, so `sample_submission` was just a second copy of the test set.
sample_submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Basic EDA

In [None]:
# Display the first rows of the training table as the cell output.
train_csv.head()

### checking number of columns and dataset length, shape

In [None]:
# Row count of the training table.
print("Number of training samples: ", len(train_csv))

In [None]:
# (rows, columns) of the training table.
print("Training Data shape: ", train_csv.shape)

In [None]:
# Row count of the test table.
print("Number of test samples: ", len(test_csv))

In [None]:
# (rows, columns) of the test table.
print("Test Data shape: ", test_csv.shape)

In [None]:
# Keep the training column names in a plain Python list and show them.
print("Data columns")
cols = list(train_csv.columns)
print(cols)

### check for null or nan values in dataset

In [None]:
# Per-column count of missing values in the training table.
print("number of null samples in train csv columns:")
for col, null in train_csv.isna().sum().items():
    print(f"{col} - {null}")

In [None]:
# Per-column count of missing values in the test table.
# Labels come from the test table's own columns: pairing the counts with the
# training column list only worked because zip() silently dropped the extra
# training column and the remaining columns happened to be in the same order.
print("number of null samples in test csv columns:")
for col, null in test_csv.isnull().sum().items():
    print(f"{col} - {null}")

### excluding id and target from feature set

In [None]:
# Name of the label column; every other column except the row id is a feature.
target = "claim"
features = [
    name
    for name in train_csv.columns
    if name != "id" and name != target
]
print(features)

### checking class distribution of dataset

In [None]:
# Number of rows per label value, displayed as the cell output.
train_csv["claim"].value_counts()

Not much data imbalance

# Data Preprocessing

### Handling Missing Values
Here we use the mean strategy to fill missing values: each missing entry is replaced with the mean of its feature column. Other methods worth experimenting with include KNNImputer or using a model to regress the missing values, but mean imputation works fine for a baseline.

In [None]:
%%time
# Learn the per-feature means on the training table and fill it in one step,
# then reuse those same training means to fill the test table.
imputer = impute.SimpleImputer(strategy="mean")
train_csv[features] = imputer.fit_transform(train_csv[features])
test_csv[features] = imputer.transform(test_csv[features])

In [None]:
%%time
# Write the current train/test tables to the working directory without the
# pandas index column.
train_csv.to_csv("train_mean_filling.csv", index=False)
test_csv.to_csv("test_mean_filling.csv", index=False)

# Cross Validation

In [None]:
class CrossValidation:
    """Produce hold-out or k-fold train/validation splits of a DataFrame.

    Args:
        df: DataFrame to split.
        shuffle: When exactly ``True`` the rows are shuffled once up front
            and the index is reset; the flag is also forwarded to the k-fold
            splitters.
        random_state: Seed for the initial shuffle and for the randomised
            splitters, or ``None`` for no fixed seed.
    """

    def __init__(self, df, shuffle, random_state=None):
        self.df = df
        self.random_state = random_state
        self.shuffle = shuffle
        if shuffle is True:
            self.df = df.sample(
                frac=1, random_state=self.random_state
            ).reset_index(drop=True)

    def hold_out_split(self, percent, stratify=None):
        """Split the data once into a training and a validation part.

        Args:
            percent: Size of the validation part as a percentage (0-100) of
                the rows.
            stratify: Optional column name. When given, the split is
                delegated to ``train_test_split`` stratified on that column;
                otherwise the last ``percent`` percent of rows (in the
                current row order) become the validation part.

        Returns:
            A ``(train, val)`` tuple of DataFrames.
        """
        if stratify is not None:
            y = self.df[stratify]
            train, val = ms.train_test_split(
                self.df,
                test_size=percent / 100,
                stratify=y,
                random_state=self.random_state,
            )
            return train, val
        size = len(self.df) - int(len(self.df) * (percent / 100))
        train = self.df.iloc[:size, :]
        val = self.df.iloc[size:, :]
        return train, val

    def kfold_split(self, splits, stratify=None):
        """Yield ``(train, val)`` DataFrame pairs for each of ``splits`` folds.

        Args:
            splits: Number of folds.
            stratify: Optional column name; when given, ``StratifiedKFold``
                is used with that column as the label, otherwise ``KFold``.

        Yields:
            One ``(train, val)`` tuple of DataFrames per fold.
        """
        # scikit-learn raises ValueError when a random_state is supplied while
        # shuffle is off, so only forward the seed when it can have an effect.
        fold_seed = self.random_state if self.shuffle else None
        if stratify is not None:
            splitter = ms.StratifiedKFold(
                n_splits=splits, shuffle=self.shuffle, random_state=fold_seed
            )
            y = self.df[stratify]
        else:
            splitter = ms.KFold(
                n_splits=splits, shuffle=self.shuffle, random_state=fold_seed
            )
            y = None  # KFold ignores the labels
        for train_idx, val_idx in splitter.split(X=self.df, y=y):
            yield self.df.iloc[train_idx, :], self.df.iloc[val_idx, :]

In [None]:
# Seed passed to the cross-validation helper as its random_state.
seed = 42
# Number of cross-validation folds.
folds = 5

In [None]:
# Shared splitter over the training table: rows are shuffled once with a
# fixed seed when the object is built.
cv = CrossValidation(train_csv, shuffle=True, random_state=seed)

# Folds Predictions

In [None]:
# Mean validation AUC of each trained model, appended by the model cells.
total_val_fold_auc = []
# Fold-averaged test predictions of each trained model, appended by the model cells.
test_predictions = []

In [None]:
def xgb_train_and_predict(xgb_params, seed_mul=1, use_proba=False):
    """Train one XGBoost classifier per CV fold and collect its predictions.

    Relies on the notebook-level names ``cv``, ``folds``, ``features``,
    ``target`` and ``test_csv`` defined in earlier cells.

    Args:
        xgb_params: Keyword arguments forwarded to ``xgb.XGBClassifier``.
        seed_mul: Multiplier applied to the zero-based fold index to derive
            the model seed (so the first fold always gets seed 0).
        use_proba: When ``False`` (default, the original behaviour) the class
            labels from ``model.predict`` are scored and returned. When
            ``True`` the second column of ``model.predict_proba`` is used
            instead (assumes a binary target, positive class in column 1),
            which gives ROC AUC a ranking score rather than a single
            thresholded decision. Callers that ensemble by majority vote need
            class labels and should keep the default.

    Returns:
        A tuple ``(val_fold_auc, valid_preds, test_preds)``:
        ``val_fold_auc`` is the list of per-fold validation ROC AUC values,
        ``valid_preds`` maps each validation row id to its out-of-fold
        prediction, and ``test_preds`` is an array with one column of test
        predictions per fold.
    """
    def _predict(model, X):
        # Single switch between hard labels and positive-class scores.
        if use_proba:
            return model.predict_proba(X)[:, 1]
        return model.predict(X)

    valid_preds = {}
    test_preds = []
    val_fold_auc = []
    # The test feature matrix is the same for every fold; select it once.
    testX = test_csv[features]
    for fold, (train_, val_) in enumerate(cv.kfold_split(splits=folds, stratify=target)):
        print("Training fold: ", fold+1)
        model = xgb.XGBClassifier(**xgb_params,
                                  seed=fold*seed_mul,
                                  tree_method="gpu_hist",
                                  gpu_id=0,
                                  predictor="gpu_predictor",
                                  use_label_encoder=False
                                )
        trainX = train_[features]
        trainY = train_[target]
        valX = val_[features]
        valY = val_[target]

        val_ids = val_.id.values.tolist()

        # The validation fold doubles as the early-stopping set.
        model.fit(trainX, trainY,
                  early_stopping_rounds=300,
                  eval_set=[(valX, valY)],
                  eval_metric="auc",
                  verbose=1000)

        predY = _predict(model, valX)
        val_auc = metrics.roc_auc_score(valY, predY)
        print(val_auc)
        val_fold_auc.append(val_auc)

        # Each row id appears in exactly one validation fold, so this builds
        # the out-of-fold prediction for every training row.
        valid_preds.update(dict(zip(val_ids, predY)))

        test_preds.append(_predict(model, testX))
    return val_fold_auc, valid_preds, np.column_stack(test_preds)

In [None]:
# Running model number, used to name prediction columns and output files.
model_count = 1

### Prediction with XGB model 1

In [None]:
# Model 1: gbtree booster with an upper bound of 10000 boosting rounds.
xgb_params = {
    'booster': 'gbtree',
    'n_estimators': 10000
}

val_fold_auc, valid_preds, test_preds = xgb_train_and_predict(xgb_params, seed_mul=1)

# Average the per-fold validation AUCs into one score for this model.
fold_auc = np.mean(val_fold_auc)
print("Fold Validation: ", fold_auc)

total_val_fold_auc.append(fold_auc)

# Out-of-fold predictions (row id -> prediction) as a two-column table on disk.
pred_df = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
pred_df.columns = ["id", f"pred_{model_count}"]
pred_df.to_csv(f"train_pred_{model_count}.csv", index=False)

# Test predictions: one column per fold, averaged row-wise into one vector.
test_df = pd.DataFrame(columns=["id", f"pred_{model_count}"]) 
test_df["id"] = test_csv["id"]
test_preds = np.mean(test_preds, axis=1)
test_df[f"pred_{model_count}"] = test_preds
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

# Advance the counter so the next model writes to its own column and files.
model_count += 1

### Prediction with XGB model 2

In [None]:
# Model 2: gbtree booster with an upper bound of 5000 boosting rounds and a
# different seed multiplier than model 1.
xgb_params = {
    'booster': 'gbtree',
    'n_estimators': 5000
}

val_fold_auc, valid_preds, test_preds = xgb_train_and_predict(xgb_params, seed_mul=11)

# Average the per-fold validation AUCs into one score for this model.
fold_auc = np.mean(val_fold_auc)
print("Fold Validation: ", fold_auc)

total_val_fold_auc.append(fold_auc)

# Out-of-fold predictions (row id -> prediction) as a two-column table on disk.
pred_df = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
pred_df.columns = ["id", f"pred_{model_count}"]
pred_df.to_csv(f"train_pred_{model_count}.csv", index=False)

# Test predictions: one column per fold, averaged row-wise into one vector.
test_df = pd.DataFrame(columns=["id", f"pred_{model_count}"]) 
test_df["id"] = test_csv["id"]
test_preds = np.mean(test_preds, axis=1)
test_df[f"pred_{model_count}"] = test_preds
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

# Advance the counter so a further model would write to its own column and files.
model_count += 1

In [None]:
# Mean of the per-model validation scores collected above.
print("All Models Validation AUC: ", np.mean(total_val_fold_auc))

# Creating Submission File

In [None]:
def create_submission(sub_name,
                      predictions,
                      template_path="../input/tabular-playground-series-sep-2021/sample_solution.csv"):
    """Write a submission CSV built from the template file.

    Args:
        sub_name: Output file name without the ``.csv`` extension.
        predictions: Values stored in the notebook-level ``target`` column
            of the template.
        template_path: CSV file whose rows and other columns are reused.
    """
    submission = pd.read_csv(template_path)
    submission[target] = predictions
    output_file = sub_name + ".csv"
    submission.to_csv(output_file, index=False)

In [None]:
def voting_ensembling(predictions, axis):
    """Return the most frequent value of ``predictions`` along ``axis``.

    Args:
        predictions: Array of predictions to vote over.
        axis: Axis along which the vote is taken.

    Returns:
        The modal values as returned by ``scipy.stats.mode`` (the counts
        are discarded).
    """
    return mode(predictions, axis=axis)[0]

In [None]:
# Stack the per-model test predictions as columns and take the most frequent
# value in each row.
predictions = voting_ensembling(np.column_stack(test_predictions), axis=1)
# Displayed as the cell output to check the shape of the result.
predictions.shape

In [None]:
# Writes submission.csv using the default template path.
create_submission("submission", predictions)