Hello everybody.

# Libraries and Data import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import random

from matplotlib.lines import Line2D

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

import optuna

# Widen the pandas display limits (rows, columns, column width) and print
# floats with five decimal places.
_display_options = {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': '{:.5f}'.format,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# import warnings
# warnings.simplefilter(action='ignore', category=UserWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subfolders, files_in_folder in os.walk('../input'):
    for file_name in files_in_folder:
        file_path = os.path.join(folder, file_name)
        print(file_path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition files, using the `id` column as the row index.
def _read_indexed_csv(path):
    """Load a CSV file into a DataFrame indexed by its `id` column."""
    return pd.read_csv(path, index_col='id')


train = _read_indexed_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = _read_indexed_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample = _read_indexed_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# NA values in train and test

In [None]:
# Total number of missing cells in each dataset.
train_na_total = train.isna().sum().sum()
test_na_total = test.isna().sum().sum()
print("(train, test) na --> ", (train_na_total, test_na_total))

In [None]:
# Number of missing feature values in each row; the target column `claim`
# is left out of the count for the training data.
train_features_only = train.drop("claim", axis=1)
is_na_train_df = train_features_only.isna().sum(axis="columns")
print(is_na_train_df.shape)

is_na_test_df = test.isna().sum(axis="columns")
print(is_na_test_df.shape)

## Data preparation: Simple Imputer + NA to median

In [None]:
# Scaler applied to the features after imputation. It is only created here:
# the frames previously built in this cell by scaling the un-imputed data
# were overwritten by the imputation step before ever being used, so that
# redundant fit/transform pass has been removed.
x_Mm_scaler = MinMaxScaler()

# Target column.
y = train.claim

In [None]:
# Replace missing values with the per-column median learned on the training
# features, then min-max scale the imputed features (the scaler is fitted on
# the training data and re-used for the test data).
imputer_zeros = SimpleImputer(strategy="median")
train_feature_frame = train.drop("claim", axis=1)
feature_names = train_feature_frame.columns

imputed_train = imputer_zeros.fit_transform(train_feature_frame)
X = pd.DataFrame(imputed_train, columns=feature_names)
imputed_test = imputer_zeros.transform(test)
X_test = pd.DataFrame(imputed_test, columns=test.columns)

scaled_train = x_Mm_scaler.fit_transform(X)
X = pd.DataFrame(scaled_train, columns=feature_names)
scaled_test = x_Mm_scaler.transform(X_test)
X_test = pd.DataFrame(scaled_test, columns=test.columns)

# Sanity check: the two totals show how many missing cells remain.
remaining_na = (X.isna().sum().sum(), X_test.isna().sum().sum())
print("(train, test) na --> ", remaining_na)

In [None]:
# Add the per-row missing-value count as an extra feature.
# X and X_test were rebuilt from NumPy arrays, so they carry a fresh 0..n-1
# index, whereas the count Series are still indexed by the original `id`.
# Assigning a Series aligns on index labels and yields NaN wherever the
# labels differ, so the values are assigned positionally instead.
X["isNA"] = is_na_train_df.to_numpy()
print(X.shape)
X_test["isNA"] = is_na_test_df.to_numpy()
print(X_test.shape)

In [None]:
import matplotlib.pyplot as pyplt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [None]:
def train_model_optuna_xgb(trial, X_train, X_valid, y_train, y_valid):
    """
    Train an XGBoost classifier with hyperparameters suggested by Optuna.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    X_train, y_train
        Data the model is fitted on.
    X_valid, y_valid
        Hold-out data used for early stopping and for the returned score.

    Returns
    -------
    float
        RMSE between `y_valid` and the predicted probability of the second
        class (lower is better). Assumes a binary 0/1 target.
    """
    # Search space. Single-value categorical suggestions are used so that the
    # fixed settings are also recorded in the trial's parameters.
    xgb_params = {
        "n_estimators": trial.suggest_categorical('n_estimators', [10000]),
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.8),
        "subsample": trial.suggest_float('subsample', 0.5, 0.95),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.5, 0.95),
        "max_depth": trial.suggest_int("max_depth", 5, 16),
        "booster": trial.suggest_categorical('booster', ["gbtree"]),
        "tree_method": trial.suggest_categorical('tree_method', ["gpu_hist"]),
        "reg_lambda": trial.suggest_float('reg_lambda', 2, 100),
        "reg_alpha": trial.suggest_float('reg_alpha', 1, 50),
        "random_state": trial.suggest_categorical('random_state', [42]),
        "n_jobs": trial.suggest_categorical('n_jobs', [4]),
    }

    # Fit with early stopping monitored on the validation set (last eval set).
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)

    print(f"Number of boosting rounds: {model.best_iteration}")

    # Score predicted probabilities rather than hard class labels:
    # `predict` returns labels, which would reduce the RMSE to a function of
    # the error rate and disagree with the probability-based "rmse" used for
    # early stopping above. Probabilities are never negative, so no clipping
    # is needed.
    valid_proba = model.predict_proba(X_valid)[:, 1]

    return np.sqrt(mean_squared_error(y_valid, valid_proba))

In [None]:
%%time

skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

# The hyperparameter search uses a single train/validation split: the last
# one produced by the splitter. Only the index arrays are collected here, so
# the (large) frames are sliced once instead of once per fold.
*_, (train_indicies, valid_indicies) = skf.split(X, y)

# `split` yields positional indices, so rows are selected with `.iloc`;
# `.loc` would interpret them as index labels, and `y` is indexed by `id`.
X_train, X_valid = X.iloc[train_indicies], X.iloc[valid_indicies]
y_train, y_valid = y.iloc[train_indicies], y.iloc[valid_indicies]

# Setting optuna verbosity to show only warning messages.
# If the line is commented out, the result of each trial will be shown.
optuna.logging.set_verbosity(optuna.logging.WARNING)
time_limit = 3600 * 2  # seconds
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: train_model_optuna_xgb(trial,
                                                    X_train,
                                                    X_valid,
                                                    y_train,
                                                    y_valid),
               n_trials=100,
               timeout=time_limit
              )
# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)


In [None]:
# Hyperparameters of the best Optuna trial, re-used for the final models.
xgb_params = study.best_trial.params

In [None]:
%%time
splits = 6
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))  # out-of-fold predictions, one per training row
preds = 0                            # test predictions averaged over the folds
model_fi = 0                         # feature importances summed over the folds
total_mean_rmse = 0                  # mean of the per-fold RMSE values

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X, y)):

    # `split` yields positional indices, so rows are selected with `.iloc`;
    # `.loc` would interpret them as index labels, and `y` is indexed by `id`.
    X_train, X_valid = X.iloc[train_indicies], X.iloc[valid_indicies]
    y_train, y_valid = y.iloc[train_indicies], y.iloc[valid_indicies]
    print(fold, f"X_train = {X_train.shape} - y_train: {y_train.shape}")
    print(fold, f"X_valid = {X_valid.shape} - y_valid: {y_valid.shape}")

    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=100,
              verbose=False)
    print("fitted")

    # NOTE(review): `predict` returns hard class labels. They are kept here
    # because `oof_preds` is later passed to `confusion_matrix`, which needs
    # labels; if the submission is scored on ranking quality, consider
    # `predict_proba` for `preds`.
    preds += model.predict(X_test) / splits
    print(preds.shape)
    print("preds ok")
    model_fi += model.feature_importances_
    print("model_fi ok")

    oof_preds[valid_indicies] = model.predict(X_valid)
    print(oof_preds)
    # Kept from the original: clip negative predictions to zero.
    oof_preds[oof_preds < 0] = 0

    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_indicies]))
    print(f"Fold {fold} RMSE: {fold_rmse}")
    total_mean_rmse += fold_rmse / splits
print(f"\nOverall RMSE: {total_mean_rmse}")

In [None]:
# xgb public Score untuned and fast parameters: 0.76817
# Build the submission table (test ids plus the fold-averaged predictions),
# write it to disk and preview the first rows.
predictions = pd.DataFrame({"id": test.index, "claim": preds})

predictions.to_csv('submission_xgb_optimized.csv', index=False, header=True)
predictions.head()

# Final considerations:
These are the results of the simulation:<br>

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
# Define the model


In [None]:
#Function for plotting Confusion Matrix


def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled as true classes on the plot and
        columns as predicted classes.
    classes : sequence
        Tick labels, one per class.
    normalize : bool, default False
        When True, each row is divided by its sum before printing and
        plotting. Rows that sum to zero are shown as zeros.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the image.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour scale and the cell
    # labels all show the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Floating-point cells are shortened to two decimals to keep them
    # readable; other cells are shown as they are.
    is_float = np.issubdtype(cm.dtype, np.floating)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            value = cm[i, j]
            label = f"{value:.2f}" if is_float else str(value)
            plt.text(j, i, label,
                     horizontalalignment="center",
                     color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of the true labels against the out-of-fold predictions.
cm = confusion_matrix(y, oof_preds)

In [None]:
# Number of out-of-fold predictions.
oof_preds.shape[0]

In [None]:
# Tick labels shown on both axes of the confusion-matrix plot.
cm_plot_labels = [
    'Negative',
    'Positive',
]

In [None]:
# Draw the confusion matrix computed above with the class labels.
plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

In [None]:
from sklearn.metrics import roc_curve, auc #for model evaluation

# ROC curve of the out-of-fold predictions against the true labels.
y_true, y_pred = y, oof_preds
fpr, tpr, thresholds = roc_curve(y_true, y_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed diagonal, drawn in axes coordinates from corner to corner.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
ax.set_title('ROC curve for TPS 09')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)