# Welcome to the September 2021 Tabular Playground Competition! #

In this competition, we predict whether a customer will make an insurance claim.

# Data #

The full dataset has almost one million rows. As written, the cell below loads all of it; to explore the data more quickly, uncomment the `nrows` argument to load just a sample.

In [None]:
import pandas as pd
from pathlib import Path


import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Directory holding the competition CSV files.
data_dir = Path('../input/tabular-playground-series-sep-2021/')

# Load the training table, using the `id` column as the index.
train_csv = data_dir / "train.csv"
df_train = pd.read_csv(
    train_csv,
    index_col='id',
    #nrows=25000,  # uncomment this row to load only a 25,000-row sample
)

# Preview the first 20 rows.
df_train.head(20)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for every numeric column
df_train.describe()

In [None]:
# (rows, columns) of the loaded training frame
df_train.shape

In [None]:
# Column dtypes, non-null counts and memory usage
df_train.info()

In [None]:
# Heatmap of missing values: one column per variable, one row per observation
fig, ax = plt.subplots(figsize=(14, 6))
p = sns.heatmap(
    df_train.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
    ax=ax,
)
ax.set_title("Valores Ausentes", fontsize=20)

In [None]:
# Number of NaN values per column, restricted to columns that have at least one.
# The mapping is keyed by column name (not by count): keying by count would
# silently drop every column that shares its count with another column.
{col: int(n_missing) for col, n_missing in df_train.isna().sum().items() if n_missing > 0}

In [None]:
# Number of exact-zero values per column, restricted to columns that have at least one.
# Keyed by column name so that columns with the same count do not overwrite each other.
{col: int(n_zero) for col, n_zero in (df_train == 0).sum().items() if n_zero > 0}

In [None]:
# Create a feature holding, for each row, the number of columns that are null.
# The later imputation cells use this column to split the rows into slices.
df_train["null_count"] = df_train.isnull().sum(axis=1)

In [None]:
# Display the training frame, which now includes the `null_count` column
df_train

In [None]:
#df_train_slice = df_train[(df_train.null_count > 3) &  (df_train.null_count < 7)] 
#df_train_slice["variance"] = df_train_slice.var(axis=1)

In [None]:
#df_train_slice.groupby(['null_count','claim'])['null_count'].count()

In [None]:
# Features and target.
# FEATURES: every column except the label (this includes the engineered `null_count`).
FEATURES = df_train.drop('claim', axis = 1)
# TARGET: the label as integer classes. Keeping it numeric (rather than casting
# to strings) avoids string<->int round-trips in the metric cells and matches
# the integer class labels that XGBClassifier expects.
TARGET = df_train['claim'].astype(int)

In [None]:
sns.set(style="whitegrid")

# Bar chart of how many rows fall in each class of the target
bp = sns.countplot(x=df_train['claim'])
bp.set_title("Distribuição de classe do conjunto de dados")
bp.set_xticklabels(["0","1"])
plt.show()

In [None]:
from numpy import mean
from numpy import std
import numpy as np
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import mean_absolute_error, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer,QuantileTransformer

In [None]:
# Sliced imputation of missing values: rows with fewer than 4 missing features
# are filled with per-column medians fitted on that slice only.
few_nulls = FEATURES["null_count"].lt(4)
imp = SimpleImputer(missing_values=np.nan, strategy='median')  # feel free to use another strategy
FEATURES[few_nulls] = imp.fit_transform(FEATURES[few_nulls])

In [None]:
# Sliced imputation of missing values: rows with more than 3 and fewer than 7
# missing features are filled with per-column means fitted on that slice only.
#imp = KNNImputer(n_neighbors=5) # alternative strategy
null_counts = FEATURES["null_count"]
some_nulls = null_counts.gt(3) & null_counts.lt(7)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # feel free to use another strategy
FEATURES[some_nulls] = imp.fit_transform(FEATURES[some_nulls])

In [None]:
# Sliced imputation of missing values: rows with more than 6 and fewer than 13
# missing features are filled with each column's most frequent value in that slice.
#imp_estimator = ExtraTreesRegressor(n_estimators=5, n_jobs=-1, criterion="mse", verbose=1, random_state=42)
#imp = IterativeImputer(random_state=42, estimator=imp_estimator)
null_counts = FEATURES["null_count"]
many_nulls = null_counts.gt(6) & null_counts.lt(13)
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # feel free to use another strategy
FEATURES[many_nulls] = imp.fit_transform(FEATURES[many_nulls])

In [None]:
# Sliced imputation of missing values: rows with more than 12 missing features
# are filled with per-column means fitted on that slice only.
most_nulls = FEATURES["null_count"].gt(12)
imp = SimpleImputer(strategy='mean')  # feel free to use another strategy
FEATURES[most_nulls] = imp.fit_transform(FEATURES[most_nulls])

In [None]:
# sliced impucation of missing values
#imp = KNNImputer(n_neighbors=5) # feel free to use others strategy
#FEATURES[(FEATURES.null_count > 12)] = imp.fit_transform(FEATURES[(FEATURES.null_count > 12)])

In [None]:
# Absolute z-score of every value in FEATURES (computed column-wise by default)
z = np.abs(stats.zscore(FEATURES))

In [None]:
# Largest absolute z-score, to see how extreme the values get
z.max()

In [None]:
# (rows, columns) of the feature frame, for comparison with the filtered view below
FEATURES.shape

In [None]:
# Rows in which every feature has |z| < 3, i.e. rows containing no outlier value
FEATURES[(z < 3).all(axis=1)]

In [None]:
# Median imputation restricted to the rows where every feature has |z| < 3.
# NOTE(review): the original intent was "impute the median for outliers", but
# this mask selects the rows WITHOUT outliers, and SimpleImputer only fills NaN.
# If the earlier imputation cells already removed every NaN, this cell leaves
# FEATURES unchanged -- confirm the intended behaviour.
inlier_rows = (z < 3).all(axis=1)
imp = SimpleImputer(strategy='median')  # feel free to use another strategy
FEATURES[inlier_rows] = imp.fit_transform(FEATURES[inlier_rows])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a validation set (shuffled, fixed seed)
split = train_test_split(
    FEATURES,
    TARGET,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
X_train, X_valid, y_train, y_valid = split

# Model #

Let's try out an XGBoost model. This algorithm can handle missing values natively, but in this notebook they have already been imputed in the cells above. We use `XGBClassifier` (instead of `XGBRegressor`, for instance), since this is a classification problem.

In [None]:
# Machine learning model configuration.
# `claim` is a binary target, so the binary logistic objective is used: it
# trains one tree per boosting round, whereas `multi:softprob` with
# num_class=2 trains one tree per class per round.
# `predict_proba` still returns two columns (class 0, class 1).
XGB = XGBClassifier(
        learning_rate= 0.00312345,
        reg_alpha = 0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.4,
        objective='binary:logistic',
        n_estimators=27000,
        eval_metric='auc',
        n_jobs=-1,
        # GPU histogram algorithm (recommended for the whole training set).
        # Replace with 'hist' to train on CPU.
        tree_method='gpu_hist',
        random_state=42,
        )

# Pipeline: scale every feature to [0, 1], then fit the classifier.
# (An imputer step such as SimpleImputer(strategy='most_frequent') could be
# inserted first if the inputs still contained missing values.)
steps = [('scle', MinMaxScaler()),
         ('m', XGB)]
model = Pipeline(steps=steps)

In [None]:
# Training inputs for the pipeline: the feature frame and the target as a NumPy array
X = X_train
y = y_train.values

In [None]:
# Fit the whole pipeline (scaler + classifier) on the training split
model.fit(X, y)

In [None]:
# Get class probabilities for the validation split (one column per class)
y_pred = model.predict_proba(X_valid)


# Evaluation #

The evaluation metric is AUC, which stands for "area under the ROC curve".  Run the following code cells to evaluate the model on the validation set.

A "neutral" AUC is 0.5, so anything better than that means our model learned something useful.

In [None]:
from sklearn.metrics import *

In [None]:
# Validation labels as integers, and the predicted probability of the positive class
y_true = y_valid.astype(str).astype(int)
pos_probs = y_pred[:, 1]

# Competition metric: area under the ROC curve on the validation split
auc = roc_auc_score(y_true, pos_probs)

# ROC curve for the model (the thresholds are not needed for the plot)
fpr, tpr, roc_thresholds = roc_curve(y_true, pos_probs)

# Diagonal reference line of a no-skill classifier
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
# Model curve, labelled with the model that produced it and its AUC
plt.plot(fpr, tpr, marker='.', label=f'XGBoost (AUC = {auc:.4f})')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Precision and recall at every candidate threshold, using the second predict_proba column as the score
precisions, recalls, thresholds = precision_recall_curve(y_valid.astype(str).astype(int), y_pred[:,1])

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    The last element of `precisions` and `recalls` is dropped before plotting
    so that both curves line up with `thresholds`. Everything is drawn with
    the pyplot state machine; the caller is responsible for `plt.show()`.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    
# Draw the precision/recall-vs-threshold curves for the validation predictions
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
import scikitplot as skplt
# ROC plot from scikit-plot, given the full predict_proba matrix (one column per class)
skplt.metrics.plot_roc(y_valid.astype(str).astype(int), y_pred, figsize=(10, 8))

# Make Submission #

The competition is scored on AUC, so we submit predicted probabilities rather than binary 0/1 labels. In scikit-learn, that means using the `predict_proba` method instead of `predict`.

In [None]:
# Read the test data and add the same per-row missing-value count used for training
test_csv = data_dir / "test.csv"
X_test = pd.read_csv(test_csv, index_col='id')
X_test["null_count"] = X_test.isna().sum(axis=1)

In [None]:
# Sliced imputation of missing values (test set): rows with fewer than 4 missing
# features are filled with per-column medians fitted on that test slice.
few_nulls_test = X_test["null_count"].lt(4)
imp = SimpleImputer(missing_values=np.nan, strategy='median')  # feel free to use another strategy
X_test[few_nulls_test] = imp.fit_transform(X_test[few_nulls_test])

In [None]:
# Sliced imputation of missing values (test set): rows with more than 3 and fewer
# than 7 missing features are filled with per-column means fitted on that test slice.
test_null_counts = X_test["null_count"]
some_nulls_test = test_null_counts.gt(3) & test_null_counts.lt(7)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # feel free to use another strategy
X_test[some_nulls_test] = imp.fit_transform(X_test[some_nulls_test])

In [None]:
# Sliced imputation of missing values (test set): rows with more than 6 and fewer
# than 13 missing features are filled with each column's most frequent value in that slice.
test_null_counts = X_test["null_count"]
many_nulls_test = test_null_counts.gt(6) & test_null_counts.lt(13)
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # feel free to use another strategy
X_test[many_nulls_test] = imp.fit_transform(X_test[many_nulls_test])

In [None]:
# Sliced imputation of missing values (test set): rows with more than 12 missing
# features are filled with per-column means fitted on that test slice.
most_nulls_test = X_test["null_count"].gt(12)
imp = SimpleImputer(strategy='mean')  # feel free to use another strategy
X_test[most_nulls_test] = imp.fit_transform(X_test[most_nulls_test])

In [None]:
# Absolute z-score of every value in X_test; note that this overwrites the `z` computed for FEATURES
z = np.abs(stats.zscore(X_test))

In [None]:
# Median imputation restricted to the test rows where every feature has |z| < 3.
# NOTE(review): the original intent was "impute the median for outliers", but
# this mask selects the rows WITHOUT outliers, and SimpleImputer only fills NaN.
# If the earlier imputation cells already removed every NaN, this cell leaves
# X_test unchanged -- confirm the intended behaviour.
inlier_rows_test = (z < 3).all(axis=1)
imp = SimpleImputer(strategy='median')  # feel free to use another strategy
X_test[inlier_rows_test] = imp.fit_transform(X_test[inlier_rows_test])

In [None]:
# Get class probabilities for the test set.
# Note: this rebinds `y_pred`, replacing the validation predictions computed earlier.
y_pred = model.predict_proba(X_test)

In [None]:
# Number of test rows for each distinct `null_count` value
X_test.groupby(['null_count'])['null_count'].count()

In [None]:
# Second column of the predict_proba output -- presumably the probability of class 1 (claim); verify against model.classes_
y_pred[:, 1]

In [None]:
# Wrap the second probability column in a Series named `claim`, indexed by the test ids
y_pred_test = pd.Series(
    y_pred[:, 1],
    index=X_test.index,
    name='claim',
)

In [None]:
# Create submission file: the Series index and the `claim` values are written as the two CSV columns
y_pred_test.to_csv("submission.csv")