In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

from tqdm import tqdm
from glob import glob
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import shap

# Notebook-wide matplotlib defaults: figure size (inches) and axes title font size.
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "axes.titlesize": 16,
})

import os
import datetime
from time import time, strftime, gmtime

# Print the full path of every file found under the Kaggle input directory.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# Wall-clock start; the last cell subtracts it to report total run time.
start = time()
print(datetime.datetime.now())

In [None]:
# Directory holding the competition files read below
# (Train.csv, Test.csv and sample_submission.xlsx). Keep the trailing slash:
# file names are appended by plain string concatenation.
base_dir = '/kaggle/input/insurance-churn-prediction-weekend-hackathon/Insurance_Churn_ParticipantsData/'

In [None]:
# Load the training set and report its dimensions.
train = pd.read_csv(f'{base_dir}Train.csv')
n_train_rows, n_train_cols = train.shape
print(f'Number of rows in trainset: {n_train_rows} \nNumber of columns in trainset: {n_train_cols}')
train.head()

In [None]:
# Load the test set and report its dimensions.
test = pd.read_csv(f'{base_dir}Test.csv')
n_test_rows, n_test_cols = test.shape
print(f'Number of rows in testset: {n_test_rows} \nNumber of columns in testset: {n_test_cols}')
test.head()

In [None]:
!pip install -q openpyxl

In [None]:
# Sample submission: used later as the template that receives the predicted labels.
sample_submission_path = f'{base_dir}sample_submission.xlsx'
sub = pd.read_excel(sample_submission_path)
sub.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

- Feature0 to feature6 are numerical features with dtype float64.
- Feature7 to feature15 seem to be categorical, with dtype int64.

In [None]:
# Summary statistics, transposed so that each column of `train` is shown as one row.
train.describe().T

__Check for NaNs in train and test__

In [None]:
# Per-column NaN counts for train and test, displayed together as a tuple.
train.isna().sum(), test.isna().sum()

- There are no NaNs in the dataset

__Count plot of target__

In [None]:
# Class balance of the target; each bar is annotated with its share of all training rows.
ax = sns.countplot(data = train, x = 'labels', palette = 'Set3')

n_train_rows_total = len(train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_pct = 100 * bar_height / n_train_rows_total
    ax.annotate('{:.1f}%'.format(share_pct), (bar.get_x() + 0.1, bar_height + 5))

- From the count plot it's clear that the dataset is imbalanced.

__Boxplots of numerical features__

In [None]:
# Every column of `train` except the target.
features = train.columns.tolist()
features.remove('labels')
features

In [None]:
# Columns stored as float64 are treated as the numerical features.
numerical_features = train.select_dtypes(include = 'float64').columns.tolist()
numerical_features

In [None]:
# One box plot per numerical feature, two panels per row.
# The grid is sized from the feature list so that a longer list cannot index
# past the available axes, and leftover panels are hidden instead of being
# drawn as empty frames.
grid_cols = 2
grid_rows = max(1, -(-len(numerical_features) // grid_cols))  # ceiling division
fig, ax = plt.subplots(grid_rows, grid_cols, figsize = (15, 2.5 * grid_rows))
ax = ax.flatten()
for i, c in enumerate(numerical_features):
    sns.boxplot(x = train[c], ax = ax[i], palette = 'Set3')
for unused_ax in ax[len(numerical_features):]:
    unused_ax.set_visible(False)
fig.suptitle('Box Plot', fontsize = 25)
fig.tight_layout()

In [None]:
# One histogram (with KDE overlay) per numerical feature, two panels per row.
# The grid is sized from the feature list so that a longer list cannot index
# past the available axes, and leftover panels are hidden instead of being
# drawn as empty frames.
grid_cols = 2
grid_rows = max(1, -(-len(numerical_features) // grid_cols))  # ceiling division
fig, ax = plt.subplots(grid_rows, grid_cols, figsize = (20, 3.75 * grid_rows))
ax = ax.flatten()
for i, c in enumerate(numerical_features):
    sns.histplot(x = train[c], ax = ax[i], kde = True)
for unused_ax in ax[len(numerical_features):]:
    unused_ax.set_visible(False)
fig.suptitle('Histogram Plot', fontsize = 25)
fig.tight_layout()

- The numerical features seem to have a lot of outliers.

__Value counts of int64 features__

In [None]:
# Columns stored as int64 (this includes the target column 'labels').
int64_cols = train.select_dtypes(include = 'int64').columns.tolist()
print(f'There are {len(int64_cols)} features with int64 dtype: \n{int64_cols}')

# Cardinality of each int64 feature in train and in test; the target is
# skipped here (it is only looked up in train elsewhere in the notebook).
print('Unique number of values in int64 features:')
for col in int64_cols:
    if col == 'labels':
        continue
    print(f'{col.upper()}: {train[col].nunique()}, {test[col].nunique()}')

In [None]:
# Count plot of every int64 column, split by target class. Each bar is
# annotated with its share of all training rows.
fig, axes = plt.subplots(5, 2, figsize = (10, 20))
panels = axes.flatten()
for idx, col in enumerate(int64_cols):
    count_ax = sns.countplot(x = train[col], hue = train['labels'], palette = 'Set3', ax = panels[idx])
    for bar in count_ax.patches:
        share_pct = 100 * bar.get_height() / len(train)
        count_ax.annotate('{:.1f}%'.format(share_pct), (bar.get_x() + 0.1, bar.get_height() + 5))
plt.suptitle('Count Plot of Categorical Features', fontsize = 20)
fig.tight_layout()

__Standardize Num Features and Label Encode Cat Features__

In [None]:
# Fit the scaler on the training rows only, then apply the same
# transformation to both train and test.
# Note: this overwrites the columns in place, so re-running the cell rescales
# already-scaled data.
scl = StandardScaler().fit(train[numerical_features])
train[numerical_features] = scl.transform(train[numerical_features])
test[numerical_features] = scl.transform(test[numerical_features])

In [None]:
# Label-encode every int64 feature (the target is excluded).
# The removal is guarded so that re-running this cell does not raise
# ValueError once 'labels' has already been dropped from the list.
if 'labels' in int64_cols:
    int64_cols.remove('labels')

for c in int64_cols:
    lbl = LabelEncoder()
    train_values = train[c].astype(str).values
    test_values = test[c].astype(str).values
    # Fit on train + test together so categories present in only one of the
    # two frames still receive a code.
    lbl.fit(np.concatenate([train_values, test_values]))
    train[c] = lbl.transform(train_values)
    test[c] = lbl.transform(test_values)

__Feature Selection using Sequential Forward Selection__

In [None]:
# Sequential forward selection (SFS): starting from no features, greedily add
# the feature that most improves the cross-validated F1 score of a
# class-weighted logistic regression, until N_SELECTED_FEATURES are chosen.
N_SELECTED_FEATURES = 10

X = train.drop(columns = 'labels')
y = train['labels']

sfs = SFS(LogisticRegression(class_weight = 'balanced'),
          k_features = N_SELECTED_FEATURES,
          forward = True,
          floating = False,
          scoring = 'f1',
          cv = 2)

sfs.fit(X, y)

# The message names the method actually used and takes the count from the
# same constant passed to k_features, so the two cannot drift apart.
print(f'Top {N_SELECTED_FEATURES} features selected using Sequential Forward Selection: \n{sfs.k_feature_names_}')
print(f'Score: {sfs.k_score_}')

selected_features = list(sfs.k_feature_names_)

In [None]:
# Plot the selector's score for each feature-subset size it evaluated
# (data taken from sfs.get_metric_dict()), using the 'std_dev' display kind.
fig = plot_sfs(sfs.get_metric_dict(), kind = 'std_dev')
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()

- From the above plot, the F1 score tends to flatten after the 8th feature.

In [None]:
# Stratified 80/20 hold-out split, restricted to the selected features.
# NOTE(review): the scaler, the label encoders and the feature selector were
# all fitted on the full `train` frame before this split, so the validation
# rows influenced preprocessing and feature choice; validation scores may be
# slightly optimistic.
train_df, valid_df = train_test_split(
    train,
    test_size = 0.2,
    stratify = train['labels'],
    random_state = 2021,
)

Xtrain, ytrain = train_df[selected_features], train_df['labels']
Xvalid, yvalid = valid_df[selected_features], valid_df['labels']
print(Xtrain.shape, ytrain.shape, Xvalid.shape, yvalid.shape)

In [None]:
# Candidate value for LightGBM's `scale_pos_weight`: the ratio of negative to
# positive samples.
# Positives are counted by label value (== 1) rather than by their position in
# value_counts(), which is ordered by frequency and would silently pick the
# wrong class if positives were ever the majority.
num_pos_samples = int((train['labels'] == 1).sum())
total_samples = len(train['labels'])
num_neg_samples = total_samples - num_pos_samples
assert num_pos_samples > 0, 'no positive samples in train'
scale_pos_weight = num_neg_samples / num_pos_samples
scale_pos_weight

In [None]:
import lightgbm as lgbm

# LightGBM training parameters (binary objective, evaluated with AUC).
params = dict(
    num_leaves = 7,              # 2^max_depth - 1
    min_child_samples = 100,
    objective = 'binary',
    # scale_pos_weight = scale_pos_weight,  # 99 -- disabled in favour of is_unbalance
    is_unbalance = 'true',
    max_depth = 3,
    learning_rate = 0.01,
    boosting_type = 'gbdt',
    subsample_freq = 3,
    subsample = 0.7,
    bagging_seed = 11,
    metric = 'auc',
    verbosity = -1,
    reg_alpha = 0.3,
    reg_lambda = 0.3,
    colsample_bytree = 0.9,
    min_child_weight = 0,        # minimum sum of instance weight (hessian) needed in a child (leaf)
    seed = 2021,
)

In [None]:
print('Training LightGBM..')
ltrain = lgbm.Dataset(Xtrain, label = ytrain)
lvalid = lgbm.Dataset(Xvalid, label = yvalid)

num_rounds = 10000
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
# accepted by lgbm.train in LightGBM 4. Older releases expose the logging
# callback as `print_evaluation` instead of `log_evaluation`.
log_evaluation = getattr(lgbm, 'log_evaluation', None) or lgbm.print_evaluation
clf = lgbm.train(params, ltrain, num_rounds, valid_sets = [ltrain, lvalid],
                 callbacks = [lgbm.early_stopping(stopping_rounds = 100),
                              log_evaluation(period = 50)])

# Metric calls take (y_true, y_pred/y_score) in that order.
# ROC AUC is a ranking metric, so it is computed on the predicted
# probabilities; only F1 needs the 0.5-thresholded class predictions.
train_preds = clf.predict(Xtrain, num_iteration = clf.best_iteration)
print(f'Training ROC_AUC_SCORE: {roc_auc_score(ytrain, train_preds)}')
print(f'Training F1 SCORE: {f1_score(ytrain, train_preds > 0.5)}')

valid_preds = clf.predict(Xvalid, num_iteration = clf.best_iteration)
print(f'Validation ROC_AUC_SCORE: {roc_auc_score(yvalid, valid_preds)}')
print(f'Validation F1 SCORE: {f1_score(yvalid, valid_preds > 0.5)}')

# Predicted positive-class probabilities for the test set (thresholded in the next cell).
test_preds = clf.predict(test[selected_features], num_iteration = clf.best_iteration)
print(test_preds[:10])

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain the submitted class labels.
sub['labels'] = (test_preds > 0.5).astype(int)
ax = sns.countplot(data = sub, x = 'labels', palette = 'Set3')

# Annotate each bar with its share of the submission rows. The denominator
# must be len(sub), not len(train), or the percentages do not sum to 100.
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_height() / len(sub)), (p.get_x() + 0.1, p.get_height() + 5))

In [None]:
# Total wall-clock run time since `start` was recorded in the first cell, as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))