In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

from xgboost import XGBClassifier

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_train =  pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/train.csv')

# Exploratory Data Analyses - EDA

First, let's do some EDA to get better acquainted with the dataset.

Let's get some basic information about the dataset:

1 - Look at some samples with head function;

2 - Check how many rows it has and the data type of each column;

3 - Look at some central tendency metrics.

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
print('Dataset shape: ', df_train.shape )

In [None]:
df_train.describe()

Let's compute some statistics on missing values.

In [None]:
df_train.isna().sum()

In [None]:
# Percentage of missing entries per column, largest first.
miss_perc = (
    df_train.isnull().sum()
    .div(df_train.shape[0])
    .mul(100)
    .sort_values(ascending=False)
)

# Horizontal bar chart with one bar per column of the training frame.
plt.rc('font', family='serif', size=16)
plt.figure(figsize=(15,35))
plt.ylabel('Feature')
plt.xlabel('Percentage (%)')
plt.title('Percentage of missing values on each variable')
plt.barh(miss_perc.index, miss_perc.round(2), alpha=0.5)

# for column in df_train.columns :
#   print(column + ' ' + '%.2f' % miss_perc[column] + '%')

As can be seen, all features have almost the same quantity of missing values. The maximum percentage of missing values does not exceed 1.6% of the overall data available. Given the quantity of data available, during modeling we will stick with the strategy of dropping the rows with missing values; however, we will also impute with the mean value as a baseline.

Now let's check how the target variable is distributed.

In [None]:
# Bar chart of how many rows fall into each class of the target.
# The column is passed explicitly as `x=`: seaborn 0.12+ treats the first
# positional argument as `data`, not as the vector to count, and the earlier
# deprecation warning is hidden by the global warnings filter.
sns.countplot(x=df_train['claim'])
plt.title('Distribution of classes in target variable (claim) \n')
plt.xlabel('Claim')
plt.ylabel('Count')

As we can see, they are equally distributed, thus we won't need to do any kind of special treatment to deal with imbalance.

# Feature Selection

Since we have a lot of features, it would be too much work to analyse each of them individually. Thus, let's use a method that helps identify the features that are most important to our analysis.

# Mutual Information

In [None]:
# Inputs for the mutual-information analysis: every column except `id`,
# restricted to rows that contain no missing value.
x = df_train.drop(columns='id').dropna()
# `pop` removes the target column from `x` and returns it.
y = x.pop('claim')

In [None]:
# Boolean mask (one entry per column of `x`) marking integer-typed columns as
# discrete. `is_integer_dtype` is used instead of `dtype == int` so the result
# does not depend on the platform's default integer width and also covers
# other integer widths (int8, int32, unsigned, ...).
discrete_features = x.dtypes.map(pd.api.types.is_integer_dtype).astype(bool)

In [None]:
def make_mi_scores(x, y, discrete_features):
    """Estimate the mutual information between each feature and the target.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; its column names label the returned scores.
    y : array-like
        Target values.
    discrete_features
        Forwarded unchanged to ``mutual_info_classif``.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by feature name, highest score first.
    """
    raw_scores = mutual_info_classif(x, y, discrete_features=discrete_features)
    labelled = pd.Series(raw_scores, name="MI Scores", index=x.columns)
    return labelled.sort_values(ascending=False)

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current matplotlib figure.

    ``scores`` is a pandas Series indexed by feature name. Bars are laid out in
    ascending score order, so the largest score gets the highest bar position.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    # Label each bar with the feature name it belongs to.
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Compute the mutual-information score of every feature against the target.
mi_scores = make_mi_scores(x, y, discrete_features)
mi_scores[::3]  # display every third entry of the sorted scores as a sample

In [None]:
# Open a tall 15x35 figure, then draw the bar chart of all scores on it.
plt.figure(dpi=100, figsize=(15, 35))
plot_mi_scores(mi_scores)

In [None]:
# Count the zero-score features with a boolean sum: unlike
# `value_counts()[True]` it yields 0 (instead of raising KeyError) when no
# feature has a zero score. The total is read from the data, not hard-coded.
zero_score_count = int((mi_scores == 0).sum())
print('There are ', zero_score_count, ' features with 0 score out of', len(mi_scores), 'features.')

In [None]:
def evaluate_metrics(model, x, y):
    """Score a classifier on the given features and true labels.

    Parameters
    ----------
    model
        Object exposing ``predict`` and ``predict_proba``.
    x
        Feature matrix passed to both prediction methods.
    y
        True labels.

    Returns
    -------
    dict
        ``'accuracy'``: accuracy of ``model.predict(x)``;
        ``'auc_roc_curve'``: ROC AUC of the second ``predict_proba`` column.
    """
    predicted_labels = model.predict(x)
    accuracy = accuracy_score(y, predicted_labels)
    # Column 1 of predict_proba is used as the ranking score for ROC AUC.
    positive_scores = model.predict_proba(x)[:, 1]
    return {
        'accuracy': accuracy,
        'auc_roc_curve': roc_auc_score(y, positive_scores),
    }

# Baseline

Let's use two baseline models to choose the methods that will allow the model to achieve a good performance.

1 - Logistic Regression

2 - XGBoost

All the tests will be based on a fraction of 20% of all available data

# Strategy 1

<ul>
    <li><h3>Impute values with mean</h3></li>
    <li><h3>Use all features</h3></li>
</ul>

In [None]:
# Strategy 1 data: a seeded 20% sample of the training frame, all features kept.
df_subset = df_train.sample(frac=.20, random_state=42)
y = df_subset['claim']
x = df_subset.drop(columns=['id', 'claim'])

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Mean-impute, then standardise. Both transformers are fitted on the training
# split only and merely applied to the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train))
x_valid = scaler.transform(imputer.transform(x_valid))

In [None]:
# Logistic regression baseline.
# The former bare `model.score(...)` call was dropped: its result was discarded
# mid-cell, and accuracy is already part of `evaluate_metrics`.
model = LogisticRegression(random_state=0)
model.fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# Accuracy of the current `model` on the validation split, computed directly
# from its hard class predictions.
y_pred = model.predict(x_valid)
accuracy_score(y_valid, y_pred)

In [None]:
# XGBoost classifier with the library's default number of trees.
# The unused `y_pred` prediction pass was removed; `evaluate_metrics` makes
# its own predictions.
model = XGBClassifier(random_state=0, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
model.fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier limited to 10 estimators.
# The unused `y_pred` prediction pass was removed; `evaluate_metrics` makes
# its own predictions.
model = XGBClassifier(n_estimators=10, random_state=0, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
model.fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Strategy 2

<ul>
    <li><h3>Impute values with mean</h3></li>
    <li><h3>Use features whose mutual information is bigger than 0</h3></li>
</ul>

In [None]:
# Strategy 2 data: seeded 20% sample with the zero mutual-information
# features removed.
df_subset = (
    df_train
    .sample(frac=.20, random_state=42)
    .drop(columns=mi_scores.index[mi_scores == 0])
)

y = df_subset['claim']
x = df_subset.drop(columns=['id', 'claim'])

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Mean-impute, then standardise. Both transformers are fitted on the training
# split only and merely applied to the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train))
x_valid = scaler.transform(imputer.transform(x_valid))

In [None]:
# Logistic regression on the current train/validation split
model = LogisticRegression(random_state=0).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier with the library's default number of trees
model = XGBClassifier(
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# y_pred = model.predict(x_valid)
# predictions = [round(value) for value in y_pred]
# accuracy_score(y_valid, predictions)

In [None]:
# XGBoost classifier limited to 10 estimators.
# The unused `y_pred` prediction pass was removed; `evaluate_metrics` makes
# its own predictions.
model = XGBClassifier(n_estimators=10, random_state=0, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
model.fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Strategy 3

<ul>
    <li><h3>Drop rows with nans</h3></li>
    <li><h3>Use all features</h3></li>
</ul>

In [None]:
# Strategy 3 data: seeded 20% sample, keeping only rows without missing values
# (so no imputation step follows).
df_subset = df_train.sample(frac=.20, random_state=42).dropna()

y = df_subset['claim']
x = df_subset.drop(columns=['id', 'claim'])

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)

In [None]:
# Logistic regression on the current train/validation split
model = LogisticRegression(random_state=0).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier with the library's default number of trees
model = XGBClassifier(
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier limited to 10 estimators
model = XGBClassifier(
    n_estimators=10,
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Strategy 4

<ul>
    <li><h3>Drop rows with nans</h3></li>
    <li><h3>Use features whose mutual information is bigger than 0</h3></li>
</ul>

In [None]:
# Strategy 4 data: seeded 20% sample, zero mutual-information features removed
# first, then every row that still has a missing value discarded.
df_subset = (
    df_train
    .sample(frac=.20, random_state=42)
    .drop(columns=mi_scores.index[mi_scores == 0])
    .dropna()
)

y = df_subset['claim']
x = df_subset.drop(columns=['id', 'claim'])

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)

In [None]:
# Logistic regression on the current train/validation split
model = LogisticRegression(random_state=0).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier with the library's default number of trees
model = XGBClassifier(
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier limited to 10 estimators
model = XGBClassifier(
    n_estimators=10,
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Strategy 5

<ul>
    <li><h3>Use strategy 1</h3></li>
    <li><h3>Create a synthetic feature: the number of NaN values in each row. This was a tip from the discussion board.</h3></li>
</ul>

In [None]:
# Strategy 5 data: seeded 20% sample, all features, plus a per-row count of
# missing entries that is taken before any imputation happens.
df_subset = df_train.sample(frac=.20, random_state=42)
x = df_subset.drop(columns=['id', 'claim'])
x = x.assign(nan_count=x.isnull().sum(axis=1))
y = df_subset['claim']
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Mean-impute, then standardise. Both transformers are fitted on the training
# split only and merely applied to the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train))
x_valid = scaler.transform(imputer.transform(x_valid))

In [None]:
# Logistic regression on the current train/validation split
model = LogisticRegression(random_state=0).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier with the library's default number of trees
model = XGBClassifier(
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier limited to 10 estimators
model = XGBClassifier(
    n_estimators=10,
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Strategy 6

<ul>
    <li><h3>Use strategy 2</h3></li>
    <li><h3>Create a synthetic feature: the number of NaN values in each row.</h3></li>
</ul>

In [None]:
# Strategy 6 data: seeded 20% sample with the zero mutual-information
# features removed.
df_subset = (
    df_train
    .sample(frac=.20, random_state=42)
    .drop(columns=mi_scores.index[mi_scores == 0])
)

# Per-row count of missing entries over the remaining feature columns,
# taken before any imputation happens.
x = df_subset.drop(columns=['id', 'claim'])
x = x.assign(nan_count=x.isnull().sum(axis=1))
y = df_subset['claim']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Mean-impute, then standardise. Both transformers are fitted on the training
# split only and merely applied to the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train))
x_valid = scaler.transform(imputer.transform(x_valid))

In [None]:
# Logistic regression on the current train/validation split
model = LogisticRegression(random_state=0).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier with the library's default number of trees
model = XGBClassifier(
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

In [None]:
# XGBoost classifier limited to 10 estimators
model = XGBClassifier(
    n_estimators=10,
    random_state=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    verbosity=0,
).fit(x_train, y_train)
results = evaluate_metrics(model, x_valid, y_valid)
print(results)

# Extreme Gradient Boosting (XGBoost) - Testing different configurations

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Data for the XGBoost configuration sweeps: seeded 20% sample, all features,
# plus a per-row count of missing entries taken before imputation.
df_subset = df_train.sample(frac=.20, random_state=42)
x = df_subset.drop(columns=['id', 'claim'])
x = x.assign(nan_count=x.isnull().sum(axis=1))
y = df_subset['claim']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

print("Using 80% of data for training and 20% for testing the baseline models")
print(f'x_train {x_train.shape} y_train {y_train.shape}')
print(f'x_valid {x_valid.shape} y_valid {y_valid.shape}')

# Mean-impute, then standardise. Both transformers are fitted on the training
# split only and merely applied to the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train))
x_valid = scaler.transform(imputer.transform(x_valid))

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 1 - Testing different number of estimators 

In [None]:
def get_models_n_estimators():
    """Build one unfitted XGBClassifier per candidate number of trees.

    Returns
    -------
    dict
        Maps the tree count, as a string, to a classifier configured with
        that ``n_estimators`` value. Insertion order follows the candidate list.
    """
    tree_counts = (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500,
                   1000, 2000, 3000, 4000, 5000, 6000, 7000)
    return {
        str(count): XGBClassifier(n_estimators=count, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
        for count in tree_counts
    }

In [None]:
# Fit every candidate on the training split and report its validation
# accuracy and ROC AUC. The freshly computed `scores` dict is formatted
# directly, so no manual counter is needed to index back into `results`.
models = get_models_n_estimators()
results, names = [], []

for name, model in tqdm(models.items()):
    model.fit(x_train, y_train, verbose=True)
    scores = evaluate_metrics(model, x_valid, y_valid)
    results.append(scores)
    names.append(name)
    print(name, 'accuracy: %.3f auc_roc: %.3f' % (scores['accuracy'], scores['auc_roc_curve']))

# 2 - Testing different max_depth 

In [None]:
def get_models_n_depths():
    """Build one unfitted XGBClassifier per candidate tree depth.

    Returns
    -------
    dict
        Maps each depth from 1 to 19, as a string, to a classifier configured
        with that ``max_depth`` value.
    """
    return {
        str(depth): XGBClassifier(max_depth=depth, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
        for depth in range(1, 20)
    }

In [None]:
# Fit every candidate on the training split and report its validation
# accuracy and ROC AUC. The freshly computed `scores` dict is formatted
# directly, so no manual counter is needed to index back into `results`.
models = get_models_n_depths()
results, names = [], []

for name, model in tqdm(models.items()):
    model.fit(x_train, y_train, verbose=True)
    scores = evaluate_metrics(model, x_valid, y_valid)
    results.append(scores)
    names.append(name)
    print(name, 'accuracy: %.3f auc_roc: %.3f' % (scores['accuracy'], scores['auc_roc_curve']))

# 3 - Testing different subsamples

In [None]:
def get_models_subsamples():
    """Build one unfitted XGBClassifier per candidate row-subsampling ratio.

    Returns
    -------
    dict
        Maps each ratio produced by ``np.arange(0.1, 1.1, 0.1)``, formatted
        with one decimal, to a classifier configured with that ``subsample``.
    """
    return {
        f'{fraction:.1f}': XGBClassifier(subsample=fraction, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
        for fraction in np.arange(0.1, 1.1, 0.1)
    }

In [None]:
# Fit every candidate on the training split and report its validation
# accuracy and ROC AUC. The freshly computed `scores` dict is formatted
# directly, so no manual counter is needed to index back into `results`.
models = get_models_subsamples()
results, names = [], []

for name, model in tqdm(models.items()):
    model.fit(x_train, y_train, verbose=True)
    scores = evaluate_metrics(model, x_valid, y_valid)
    results.append(scores)
    names.append(name)
    print(name, 'accuracy: %.3f auc_roc: %.3f' % (scores['accuracy'], scores['auc_roc_curve']))

# 4 - Testing different learning rates

In [None]:
def get_models_lr():
    """Build one unfitted XGBClassifier per candidate learning rate.

    Returns
    -------
    dict
        Maps each rate, formatted with four decimals, to a classifier that
        receives the rate through its ``eta`` argument.
    """
    learning_rates = (0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05,
                      0.1, 0.12, 0.13, 0.3, 0.5, 1.0)
    return {
        f'{rate:.4f}': XGBClassifier(eta=rate, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
        for rate in learning_rates
    }

In [None]:
# Fit every candidate on the training split and report its validation
# accuracy and ROC AUC. The freshly computed `scores` dict is formatted
# directly, so no manual counter is needed to index back into `results`.
models = get_models_lr()
results, names = [], []

for name, model in tqdm(models.items()):
    model.fit(x_train, y_train, verbose=True)
    scores = evaluate_metrics(model, x_valid, y_valid)
    results.append(scores)
    names.append(name)
    print(name, 'accuracy: %.3f auc_roc: %.3f' % (scores['accuracy'], scores['auc_roc_curve']))

# 5 - Testing different number of features

In [None]:
def get_models_nfeatures():
    """Build one unfitted XGBClassifier per candidate column-sampling ratio.

    Returns
    -------
    dict
        Maps each ratio produced by ``np.arange(0.1, 1.1, 0.1)``, formatted
        with one decimal, to a classifier configured with that
        ``colsample_bytree``.
    """
    return {
        f'{fraction:.1f}': XGBClassifier(colsample_bytree=fraction, tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0)
        for fraction in np.arange(0.1, 1.1, 0.1)
    }

In [None]:
# Fit every candidate on the training split and report its validation
# accuracy and ROC AUC. The freshly computed `scores` dict is formatted
# directly, so no manual counter is needed to index back into `results`.
models = get_models_nfeatures()
results, names = [], []

for name, model in tqdm(models.items()):
    model.fit(x_train, y_train, verbose=True)
    scores = evaluate_metrics(model, x_valid, y_valid)
    results.append(scores)
    names.append(name)
    print(name, 'accuracy: %.3f auc_roc: %.3f' % (scores['accuracy'], scores['auc_roc_curve']))

# Grid Search 

Let's do a grid search on some of the best parameters obtained above.

In [None]:
# Grid-search data: the whole seeded 20% sample is used for training. No
# hold-out split is made in this cell, so nothing about `x_valid` is reported
# here (it would only echo a stale variable left over from an earlier cell).
df_subset = df_train.sample(frac=.20, random_state=42)
x_train = df_subset.drop(['id', 'claim'], axis=1)
y_train = df_subset['claim']
# Per-row count of missing entries, taken before imputation fills them in.
x_train["nan_count"] = x_train.isnull().sum(axis=1)

print("Using the full 20% sample for the cross-validated grid search")
print('x_train', x_train.shape, 'y_train', y_train.shape)

# Impute and scale the values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

In [None]:
# Candidate values per hyper-parameter; the full grid has
# 5 * 3 * 3 * 2 * 2 = 180 combinations.
params = dict(
    n_estimators=[10, 50, 100, 150, 200],
    max_depth=[1, 2, 3],
    subsample=[0.8, 0.9, 1.0],
    eta=[0.12, 0.13],
    colsample_bytree=[0.1, 0.2],
)

In [None]:
# 3-fold grid search scored on ROC AUC; the best configuration by that metric
# is refitted at the end.
metrics = ['roc_auc']
grid_cv = GridSearchCV(
    XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0),
    param_grid=params,
    scoring=metrics,
    refit='roc_auc',
    cv=3,
    n_jobs=-1,
    return_train_score=False,
    verbose=1,
)

In [None]:
result = grid_cv.fit(x_train, y_train)

In [None]:
grid_cv.best_params_

In [None]:
print(grid_cv.best_score_)
print(grid_cv.best_params_)

# Fit model on entire dataset for submission

Let's fit the model on the full dataset and submit it.

In [None]:
# Final training data: every row of the training frame, with the per-row
# count of missing entries added before imputation fills them in.
x_train = df_train.drop(['id', 'claim'], axis=1)
x_train["nan_count"] = x_train.isnull().sum(axis=1)
y_train = df_train['claim']

# Only the training shapes are reported: no validation split exists in this
# cell, and `x_valid` would be a stale variable left over from an earlier cell.
print("Using all data for training and submiting")
print('x_train', x_train.shape, 'y_train', y_train.shape)

# Impute and scale the values; these fitted transformers are reused on the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

In [None]:
# model = XGBClassifier(n_estimators=250, eta=0.13, max_depth=5, subsample=0.9, colsample_bytree=0.1, tree_method='gpu_hist', predictor='gpu_predictor')
# Final model: the best hyper-parameters found by the grid search.
model = XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    **grid_cv.best_params_,
)
model.fit(x_train, y_train)

In [None]:
# Load the test set and build the same feature layout used for training:
# all columns except `id`, plus the per-row count of missing entries.
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/test.csv')
x_test = df_test.drop(columns=['id'])
x_test = x_test.assign(nan_count=x_test.isnull().sum(axis=1))

print("Using test data for predict and submiting")
print(f'x_test {x_test.shape}')

# Apply the already-fitted imputer and scaler; nothing is refitted on test data.
x_test = scaler.transform(imputer.transform(x_test))

In [None]:
# Submit the predicted probability of the positive class (second column of
# predict_proba) rather than hard 0/1 labels: this notebook evaluates and
# tunes its models on ROC AUC, which is computed from ranking scores and
# loses information when given thresholded labels.
# NOTE(review): assumes the competition accepts probabilities in `claim` - confirm.
claim = model.predict_proba(x_test)[:, 1]
ids = df_test['id'].values
submission = pd.DataFrame({'id' : ids, 'claim' : claim})

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)