# Setup -> Load data -> Preprocess -> Evaluate -> Train -> check score -> fine tune 

In [None]:
# Import Libraries for reading data and computation
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

#Import Libraries for train test split
from sklearn.model_selection import train_test_split

#Import Library to handle missing values
from sklearn.impute import SimpleImputer

# Import XGBoost module
from xgboost import XGBClassifier

# Confusion matrix to evaluate performance
from sklearn.metrics import confusion_matrix, accuracy_score

# AUC score to evaluate performance
from sklearn.metrics import roc_auc_score

# Feature scaling
from sklearn import preprocessing

In [None]:
# Load the competition train and test tables; the 'id' column becomes the row index of each.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
# Separate the training table into the target (its last column) and the features (everything else)
target_column = X_full.columns[-1]
y = X_full[target_column]  # Target
X = X_full.drop(target_column, axis=1)  # Features

In [None]:
# Engineer an extra feature: the number of missing values in each row
# Ref: https://www.kaggle.com/virasydoriak/simple-logistic-regression-very-fast-with-sklearn
# Ref: https://www.kaggle.com/c/tabular-playground-series-sep-2021/discussion/270206

# The same per-row NaN count is added to both the training features and the test table
for frame in (X, X_test_full):
    frame['n_missing'] = frame.isna().sum(axis=1)

In [None]:
# Shapes of all tables for reference
def data_table_summary(X, y, X_test):
    """Print the shape and the number of missing values of each dataset.

    Missing values are counted with ``pd.isna`` rather than ``np.isnan`` so
    that tables containing non-numeric (object/string) columns are handled
    too; ``np.isnan`` raises ``TypeError`` on such data. For purely numeric
    input the reported counts are the same.

    Args:
        X: Feature table of the training data (must expose ``.shape``).
        y: Target values of the training data (must expose ``.shape``).
        X_test: Feature table of the test data (must expose ``.shape``).

    Returns:
        None. The summary is written to standard output.
    """
    def count_missing(data):
        # np.asarray turns the boolean mask into a plain array before counting
        return np.count_nonzero(np.asarray(pd.isna(data)))

    print("Shapes of all datasets: \n \b Shape of y = %s \n Shape of X = %s \n Shape of X_test = %s"
          % (y.shape, X.shape, X_test.shape))
    print("\nNaN values per dataset: \n NaN vals in y = %d \n NaN vals in X = %d \n NaN vals in X_test = %d"
          % (count_missing(y), count_missing(X), count_missing(X_test)))

In [None]:
# Report shapes and NaN counts for the training features, the target and the test table
data_table_summary(X=X, y=y, X_test=X_test_full)

In [None]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# Imputation of missing values
# The imputer is fitted on the training split only; the validation split is
# then filled with the column means learned from the training split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer output carries no labels, so the DataFrames are rebuilt with
# the original column names AND the original row index. Without the index the
# rows would be renumbered 0..n-1 and no longer line up with y_train / y_valid.
X_train_im = pd.DataFrame(imputer.fit_transform(X_train),
                          columns=X_train.columns, index=X_train.index)
X_valid_im = pd.DataFrame(imputer.transform(X_valid),
                          columns=X_valid.columns, index=X_valid.index)

In [None]:
# Standardise the features: the scaler learns its statistics from the imputed
# training split and the same fitted scaler is applied to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_im)
X_train_sc = scaler.transform(X_train_im)
X_valid_sc = scaler.transform(X_valid_im)

In [None]:
# # Training the XGBoost classifier on the Training set
# classifier = XGBClassifier(random_state  = 1)

# #Train-test split, evaluation metric and early stopping
# #Ref: https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
# eval_set = [(X_train_sc, y_train), (X_valid_sc, y_valid)]
# eval_metric = ["auc","error"]
# %time classifier.fit(X_train_sc, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)

In [None]:
# #Plotting evaluation results:
# #Ref:https://stackoverflow.com/questions/51900874/how-to-plot-xgboost-evaluation-metrics
# results = classifier.evals_result()
# epochs = len(results['validation_0']['error'])
# x_axis = range(0, epochs)

In [None]:
# fig, ax = plt.subplots()
# ax.plot(x_axis, results['validation_0']['error'], label='Train')
# ax.plot(x_axis, results['validation_1']['error'], label='Test')
# ax.legend()
# plt.ylabel('Classification error')
# plt.title('XGBoost classification error')
# plt.show()

In [None]:
# Fine tuning XGBoost model
# Ref: https://www.kaggle.com/mustafacicek/tps-09-21-xgboost-0-81785
# `silent` is no longer an XGBoost parameter (it was superseded by `verbosity`),
# so passing it only produces an "unused parameter" warning. verbosity=1
# (warnings only) is the equivalent of the former silent=False.
# NOTE(review): tree_method='gpu_hist' needs a GPU runtime; newer XGBoost
# releases deprecate it in favour of tree_method='hist' with device='cuda' --
# confirm against the installed XGBoost version before changing it.
classifier = XGBClassifier(
    random_state=1,
    verbosity=1,
    scale_pos_weight=1,
    learning_rate=0.06,
    colsample_bytree=0.7,
    subsample=0.8,
    objective='binary:logistic',
    eval_metric='error',
    n_estimators=1000,
    reg_alpha=3.2,
    reg_lambda=0.15,
    max_depth=6,
    gamma=1,
    tree_method='gpu_hist',
)

In [None]:
# Train the classifier on the scaled training split, then predict class labels for the validation split
y_pred = classifier.fit(X_train_sc, y_train).predict(X_valid_sc)

In [None]:
# Confusion matrix and accuracy of the predicted labels on the validation split
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred)
print(cm)
# Bare expression: the notebook shows the accuracy as the cell output
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# Predicted probabilities for the validation split; only the second predict_proba column is kept
valid_class_proba = classifier.predict_proba(X_valid_sc)
y_pred_prob = valid_class_proba[:, 1]

In [None]:
# ROC AUC of the predicted probabilities against the validation targets
auc_score = roc_auc_score(y_true=y_valid, y_score=y_pred_prob)
# Bare expression: the notebook shows the score as the cell output
auc_score

# Preprocessing the test data.

In [None]:
# Imputation of the test set, reusing the imputer that was fitted on the training split.
# The imputer output carries no labels, so the DataFrame is rebuilt with the
# original column names AND the original row index; otherwise the rows would
# be renumbered 0..n-1 and lose their link to X_test_full.index.
final_X_test = pd.DataFrame(imputer.transform(X_test_full),
                            columns=X_test_full.columns, index=X_test_full.index)

In [None]:
# Scale the imputed test features with the scaler that was fitted on the training split
final_X_test_sc = scaler.transform(X=final_X_test)

In [None]:
# Predicted probabilities for the test set; only the second predict_proba column is kept
test_class_proba = classifier.predict_proba(final_X_test_sc)
preds_test_proba = test_class_proba[:, 1]
# Bare expression: the notebook shows the probabilities as the cell output
preds_test_proba

In [None]:
# Write the submission file: one row per test id with its predicted probability in 'claim'
submission_columns = {'id': X_test_full.index, 'claim': preds_test_proba}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)