In [None]:
import numpy as np 
import pandas as pd
import plotly as py
from statistics import mean
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from umap import UMAP

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.ensemble import VotingClassifier

import optuna

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Do not truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)
#########################################################
# Directory holding the competition CSV files; change it here to run elsewhere.
DATA_DIR = '../input/tabular-playground-series-sep-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
ss = pd.read_csv(f'{DATA_DIR}/sample_solution.csv')

**Results:**

1. XGB solo result - 0.81771
2. CB solo result - 0.81770
3. LGBM solo result - 0.81795
4. Voting result - 0.81791

# Basic information

In [None]:
# Preview the first three rows of the training data.
train.head(3)

In [None]:
# Feature column names: every training column except 'id' and 'claim'.
cols = train.drop(['id', 'claim'], axis = 1).columns.tolist()
def info(data, feature_cols = None):
    """Print a short summary of a dataset.

    The report contains the row count, a table of how many columns have each
    dtype, and missing-value statistics over the feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.
    feature_cols : list of str, optional
        Columns included in the missing-value statistics. Defaults to every
        column of `data` except 'id' and 'claim', so the report does not
        depend on a notebook-level variable.

    Returns
    -------
    None
        Everything is printed.
    """
    if feature_cols is None:
        feature_cols = [c for c in data.columns if c not in ('id', 'claim')]

    n_rows = len(data)

    def share(count):
        # Percentage of rows, guarded so an empty frame does not divide by zero.
        return round(count / n_rows * 100, 2) if n_rows else 0.0

    print(f'Length of data: {n_rows}')

    print('')

    # Count columns per dtype directly instead of growing a Series in a loop
    # (Series.append is not available in current pandas releases).
    dtype_counts = data.dtypes.value_counts().to_frame().reset_index()
    dtype_counts.columns = ['type', 'count']
    print(dtype_counts)

    print('')

    na_counts = data[feature_cols].isna().sum()
    if na_counts.empty:
        print('No feature columns to check for missing values')
        return

    print(f'All features have missing values: {bool((na_counts > 0).all())}')

    # Plain Python numbers keep the printed values independent of how numpy
    # scalars are averaged or formatted.
    mean_na = float(na_counts.mean())
    max_na = int(na_counts.max())
    min_na = int(na_counts.min())
    print(f'Mean of missing values is {round(mean_na, 2)} ({share(mean_na)}%)')
    # idxmax/idxmin return the first column reaching the extreme value.
    print(f'Max of missing values has {na_counts.idxmax()}: {max_na} ({share(max_na)}%)')
    print(f'Min of missing values has {na_counts.idxmin()}: {min_na} ({share(min_na)}%)')

# Print the same report for both datasets, with a dashed rule between them.
for position, (heading, frame) in enumerate([('TRAINING DATASET INFORMATION', train),
                                             ('TEST DATASET INFORMATION', test)]):
    if position > 0:
        print('---------------------------------------------')
    print(heading)
    print('')
    info(frame)

# EDA

In [None]:
# Overlay the train and test kernel density estimates of every feature, one panel per feature.
# The style is set once, before any axes are created, so the first panel is styled like the rest.
sns.set_style("white")
fig = plt.figure(figsize = (15, 71))
cols = train.columns.tolist()[1:119]
# enumerate gives the panel number directly instead of searching the list for each feature.
for position, i in enumerate(cols, start = 1):
    plt.subplot(24, 5, position)
    plt.title(i, size = 12, fontname = 'monospace')
    a = sns.kdeplot(train[i], color = '#f9ba32', linewidth = 1.3)
    sns.kdeplot(test[i], color = '#426e86', linewidth = 1.3)
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    # Keep only the bottom spine, slightly thickened.
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)
        
fig.tight_layout(h_pad = 3)

# Figure-level title and colour legend, placed just above the panel grid.
plt.figtext(0.335, 1.02, 'Distribution of features', color = '#2f3131', fontname = 'monospace', size = 25)
plt.figtext(0.3, 1.01, 'train', color = '#f9ba32', fontname = 'monospace', size = 18)
plt.figtext(0.66, 1.01, 'test', color = '#426e86', fontname = 'monospace', size = 18)

plt.show()

In [None]:
# Horizontal bar chart of the target classes, annotated with each class's share and row count.
plt.figure(figsize = (15, 5))
sns.set_style("white")
plt.title('Distribution of target', fontname = 'monospace', fontsize = 35, color = '#32384D', x = 0.5, y = 1.05)

# Class sizes in sorted label order; the annotations are derived from them
# rather than typed in by hand, so they cannot drift from the data.
claim_counts = train['claim'].value_counts().sort_index()
claim_share = claim_counts / claim_counts.sum() * 100

a = sns.countplot(y = train['claim'], palette = (['#E29930', '#217CA3']))
# NOTE(review): the tick labels assume the sorted target values are 0 (no claim) and 1 (claim).
a.set_yticklabels(['No claim', 'Claim'])
plt.axhline(0.5, 0, 0.951, color = '#211F30')
plt.xticks([])
plt.yticks(fontname = 'monospace', fontsize = 18)
plt.ylabel('')
plt.xlabel('')

# Bar `row` is drawn at y = row; the text offsets match the original layout.
for row in range(len(claim_counts)):
    a.text(210000, row + 0.05, f'{claim_share.iloc[row]:.1f}%', fontname = 'monospace', fontsize = 40, color = 'white')
    count_label = f'({int(claim_counts.iloc[row]):,})'.replace(',', ' ')
    a.text(215000, row + 0.3, count_label, fontname = 'monospace', fontsize = 20, color = 'white')

a.spines['left'].set_linewidth(1.5)
for w in ['right', 'top', 'bottom']:
    a.spines[w].set_visible(False)
        
plt.show()

In [None]:
# Lower-triangle heatmap of the pairwise correlations (features and target).
# The correlation matrix is computed once and reused for both the mask and the plot.
corr_matrix = train.drop('id', axis = 1).corr()
# Boolean mask hiding the upper triangle and the diagonal; building it from
# ones (not from the correlation values) also hides cells whose value is exactly 0.
matrix = np.triu(np.ones_like(corr_matrix, dtype = bool))
plt.figure(figsize = (15, 12))
sns.heatmap(corr_matrix, annot = False, cmap = 'Spectral', mask = matrix, vmin = -0.05, vmax = 0.05, linewidths = 0.1, linecolor = 'white', cbar = True)
plt.xticks(size = 8, fontname = 'monospace')
plt.yticks(size = 8, fontname = 'monospace')
plt.figtext(0.77, 0.8, '''All 118 features and the target variable
have a very small
correlation''', fontsize = 20, fontname = 'monospace', ha = 'right', color = '#f9ba32')
plt.show()

In [None]:
# Correlation of every feature with the target; the target's own row is removed
# by label, so the result does not depend on how many columns the frame has.
corr = train.drop('id', axis = 1).corr()['claim'].drop('claim').reset_index()
min_corr = corr['claim'].min()
max_corr = corr['claim'].max()
# Pick the two extreme rows by their actual position instead of hard-coded row numbers.
extremes = corr.loc[[corr['claim'].idxmin(), corr['claim'].idxmax()]].rename(columns = {'index': 'feature'})
extremes.index = ['max_neg_correlation', 'max_pos_correlation']
extremes

# Preprocessing

In [None]:
# Columns 1..118 of the training frame are the raw features.
features = train.columns.tolist()[1:119]

# Row-level summaries, computed on the raw values before any imputation.
train['n_missing'] = train[features].isna().sum(axis = 1)
test['n_missing'] = test[features].isna().sum(axis = 1)

train['std'] = train[features].std(axis = 1)
test['std'] = test[features].std(axis = 1)

features = features + ['n_missing', 'std']

# One imputer fitted on all training columns at once (per-column means), then
# applied to the test set; this replaces refitting the same object column by column.
imputer = SimpleImputer(strategy = 'mean')
train[features] = imputer.fit_transform(train[features])
test[features] = imputer.transform(test[features])

# Standardise with statistics learned from the training set only.
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

X = train.drop(['id', 'claim'], axis = 1)
y = train['claim']
test = test.drop('id', axis = 1)

**Memory optimization.** It would be nice to convert the data to float16, but XGBoost, for some reason, does not support this format, so float columns are downcast no further than float32. The function is taken from [here](https://www.kaggle.com/rinnqd/reduce-memory-usage).

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` in place to narrower dtypes.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum. Float columns are
    converted to float32 when both extremes lie strictly inside the float32
    range, otherwise to float64. Columns of any other dtype are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; its columns are reassigned on this same object.
    verbose : bool, default True
        When True, print the memory usage after conversion and the relative
        reduction. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme equals a type's limit still fits that type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached when the comparisons are False because the extremes are NaN.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# reduce_mem_usage converts the frame it receives and returns that same frame.
# Binding the result makes the update explicit and stops the last call from
# echoing the whole test frame as the cell output.
train = reduce_mem_usage(train)
X = reduce_mem_usage(X)
test = reduce_mem_usage(test)

# UMAP

In [None]:
# Project a fixed random sample of the training rows to 2-D with UMAP (the target is
# passed to fit_transform as well). The embedding is computed in this cell so the plot
# below does not depend on a variable left over from an earlier session; the same
# sample supplies the hue, so points and labels stay aligned.
umap_sample = train.sample(min(150000, len(train)), random_state = 228)
umap = UMAP(n_components = 2, n_neighbors = 10, min_dist = 0.99).fit_transform(umap_sample.drop(['id', 'claim'], axis = 1), umap_sample['claim'])

plt.figure(figsize=(15, 12))
scu = sns.scatterplot(x = umap[:, 0], y = umap[:, 1], hue = umap_sample['claim'], palette = ['#f9ba32','#426e86'], s = 5, edgecolor = 'none', alpha = 0.4)
plt.xticks([])
plt.yticks([])
for i in ['right', 'left', 'top', 'bottom']:
    scu.spines[i].set_visible(False)
plt.legend(ncol = 2, borderpad = 1, frameon = True, fontsize = 11)
# Text box recording the UMAP settings used above.
scu.text(-4.6, 6.4, '''n_components = 2
n_neighbors = 10
min_dist = 0.99''', fontname = 'monospace', fontsize = 12, color = 'black')
plt.show()

# XGB

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective for XGBoost: mean validation AUC over 2 stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix (defaults to the notebook's `X`).
    target : pandas.Series
        Binary target aligned with `data` (defaults to the notebook's `y`).

    Returns
    -------
    float
        Mean ROC AUC on the validation folds.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 10000, 50000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 228,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }
    
    model = XGBClassifier(**params)
    val_scores = []
    k = StratifiedKFold(n_splits = 2, random_state = 228, shuffle = True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data it is called with.
    for i, (trn_idx, val_idx) in enumerate(k.split(data, target)):
        
        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # NOTE(review): passing early_stopping_rounds to fit() depends on the
        # installed xgboost version - confirm against the pinned release.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 300, verbose = False)
        
        # Only the validation score feeds the objective; the training-set
        # prediction that used to be computed here was never reported.
        val_preds = model.predict_proba(X_val)[:,1]
        val_score = roc_auc_score(y_val, val_preds)

        val_scores.append(val_score)
        
        print(f"Fold {i+1} | AUC: {val_score}")
        
    return float(np.mean(val_scores))

# Seed the sampler so its random draws are repeatable between runs.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 300)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# XGBoost parameters used for the final training run.
# Mean AUC on 2 folds - 0.81532 (esr, i.e. early stopping rounds - 300)
# Solo result - 0.81771
paramsXGB = {
    # values copied from the tuning run
    'max_depth': 3,
    'learning_rate': 0.005702659398906191,
    'n_estimators': 22404,
    'min_child_weight': 25,
    'gamma': 0.00010151247994797229,
    'alpha': 7.148020356730985,
    'lambda': 0.1378423649746119,
    'colsample_bytree': 0.7969227570988136,
    'subsample': 0.6382893449313995,
    # fixed settings, identical to the ones in the tuning objective
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'random_state': 228,
    'use_label_encoder': False,
    'eval_metric': 'auc',
}

In [None]:
# 5-fold training of the tuned XGBoost model. Test probabilities are averaged over the
# folds; out-of-fold probabilities are kept so the run reports its own CV score,
# as the tuning objective does.
folds = StratifiedKFold(n_splits = 5, random_state = 228, shuffle = True)
predictions = np.zeros(len(test))
oof = np.zeros(len(X))
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = XGBClassifier(**paramsXGB)
   
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 300)
    
    # The held-out fold also drives early stopping, so these scores are
    # likely somewhat optimistic.
    oof[val_idx] = model.predict_proba(X_val)[:,1]
    print(f"Fold {fold+1} | AUC: {roc_auc_score(y_val, oof[val_idx])}")
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

print(f"OOF AUC: {roc_auc_score(y, oof)}")

In [None]:
# Put the current `predictions` array into the submission frame and write it to disk without the index.
# NOTE(review): the output name has no .csv extension - confirm the submission step expects that.
ss['claim'] = predictions
ss.to_csv('xgb1', index = False)

# CatBoost

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective for CatBoost: mean validation AUC over 2 stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix (defaults to the notebook's `X`).
    target : pandas.Series
        Binary target aligned with `data` (defaults to the notebook's `y`).

    Returns
    -------
    float
        Mean ROC AUC on the validation folds.
    """
    params = {
        'depth': trial.suggest_int('depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'iterations': trial.suggest_int('iterations', 10000, 50000),
        'max_bin': trial.suggest_int('max_bin', 1, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'bootstrap_type': 'Bernoulli',
        'random_seed': 228,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'GPU'
    }
    
    model = CatBoostClassifier(**params)
    val_scores = []
    k = StratifiedKFold(n_splits = 2, random_state = 228, shuffle = True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data it is called with.
    for i, (trn_idx, val_idx) in enumerate(k.split(data, target)):
        
        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 300, verbose = False)
        
        # Only the validation score feeds the objective; the training-set
        # prediction that used to be computed here was never reported.
        val_preds = model.predict_proba(X_val)[:,1]
        val_score = roc_auc_score(y_val, val_preds)

        val_scores.append(val_score)
        
        print(f"Fold {i+1} | AUC: {val_score}")
        
    return float(np.mean(val_scores))

# Seed the sampler so its random draws are repeatable between runs.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 300)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# CatBoost parameters used for the final training run.
# Mean AUC on 2 folds - 0.81518 (esr, i.e. early stopping rounds - 300)
# Solo result - 0.81770
# NOTE(review): max_bin = 461 lies outside the 1-300 range sampled by the CatBoost
# objective in this notebook - presumably it comes from a different search; confirm.
paramsCB = {
    # values copied from the tuning run
    'depth': 3,
    'learning_rate': 0.017585381726501453,
    'iterations': 11636,
    'max_bin': 461,
    'min_data_in_leaf': 162,
    'l2_leaf_reg': 0.02724781040038058,
    'subsample': 0.6892384815879177,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Gradient',
    # fixed settings, identical to the ones in the tuning objective
    'bootstrap_type': 'Bernoulli',
    'random_seed': 228,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': 'GPU',
}

In [None]:
# 5-fold training of the tuned CatBoost model. Test probabilities are averaged over the
# folds; out-of-fold probabilities are kept so the run reports its own CV score,
# as the tuning objective does.
folds = StratifiedKFold(n_splits = 5, random_state = 228, shuffle = True)
predictions = np.zeros(len(test))
oof = np.zeros(len(X))
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**paramsCB)
   
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 300)
    
    # The held-out fold also drives early stopping, so these scores are
    # likely somewhat optimistic.
    oof[val_idx] = model.predict_proba(X_val)[:,1]
    print(f"Fold {fold+1} | AUC: {roc_auc_score(y_val, oof[val_idx])}")
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

print(f"OOF AUC: {roc_auc_score(y, oof)}")

In [None]:
# Put the current `predictions` array into the submission frame and write it to disk without the index.
# NOTE(review): the output name has no .csv extension - confirm the submission step expects that.
ss['claim'] = predictions
ss.to_csv('cb1', index = False)

# LGBM

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective for LightGBM: mean validation AUC over 2 stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix (defaults to the notebook's `X`).
    target : pandas.Series
        Binary target aligned with `data` (defaults to the notebook's `y`).

    Returns
    -------
    float
        Mean ROC AUC on the validation folds.
    """
    # NOTE(review): with max_depth limited to 2-3 a tree has at most 8 leaves, so the
    # num_leaves range of 50-500 probably has no effect on the model - confirm.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 15000),
        'max_depth': trial.suggest_int('max_depth', 2, 3),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 50, 500),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'random_state': 228,
        'metric': 'auc',
        'device_type': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }
    
    model = LGBMClassifier(**params)
    val_scores = []
    k = StratifiedKFold(n_splits = 2, random_state = 228, shuffle = True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data it is called with.
    for i, (trn_idx, val_idx) in enumerate(k.split(data, target)):
        
        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # NOTE(review): passing early_stopping_rounds / verbose to fit() depends on
        # the installed lightgbm version - confirm against the pinned release.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 300, verbose = False)
        
        # Only the validation score feeds the objective; the training-set
        # prediction that used to be computed here was never reported.
        val_preds = model.predict_proba(X_val)[:,1]
        val_score = roc_auc_score(y_val, val_preds)

        val_scores.append(val_score)
        
        print(f"Fold {i+1} | AUC: {val_score}")
        
    return float(np.mean(val_scores))

# Seed the sampler so its random draws are repeatable between runs.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 228))
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# LightGBM parameters used for the final training run.
# Mean AUC on 2 folds - 0.8151
# Solo result - 0.81795
paramsLGBM = {
    # values copied from the tuning run
    'n_estimators': 11990,
    'max_depth': 3,
    'learning_rate': 0.016501612373246877,
    'reg_alpha': 7.555087388180319,
    'reg_lambda': 0.9534606245427513,
    'num_leaves': 155,
    'min_data_per_group': 177,
    'min_child_samples': 150,
    'colsample_bytree': 0.22781593823447946,
    # fixed settings, identical to the ones in the tuning objective
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'random_state': 228,
    'metric': 'auc',
    'device_type': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}

In [None]:
# 5-fold training of the tuned LightGBM model. Test probabilities are averaged over the
# folds; out-of-fold probabilities are kept so the run reports its own CV score,
# as the tuning objective does.
folds = StratifiedKFold(n_splits = 5, random_state = 228, shuffle = True)
predictions = np.zeros(len(test))
oof = np.zeros(len(X))
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**paramsLGBM)
   
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 300)
    
    # The held-out fold also drives early stopping, so these scores are
    # likely somewhat optimistic.
    oof[val_idx] = model.predict_proba(X_val)[:,1]
    print(f"Fold {fold+1} | AUC: {roc_auc_score(y_val, oof[val_idx])}")
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

print(f"OOF AUC: {roc_auc_score(y, oof)}")

In [None]:
# Put the current `predictions` array into the submission frame and write it to disk without the index.
# NOTE(review): the output name has no .csv extension - confirm the submission step expects that.
ss['claim'] = predictions
ss.to_csv('lgbm1', index = False)

# Voting time

In [None]:
# Unfitted estimators built from the three tuned parameter sets; the voting cell below combines them.
xgb_model = XGBClassifier(**paramsXGB)
cb_model = CatBoostClassifier(**paramsCB)
lgbm_model = LGBMClassifier(**paramsLGBM)

# Scores recorded for each model on its own, kept here for reference:
# XGB solo result - 0.81771
# CB solo result - 0.81770
# LGBM solo result - 0.81795

In [None]:
# 5-fold training of a soft-voting blend of the three tuned models. Test probabilities
# are averaged over the folds; the held-out fold, previously split off but unused,
# now gives a CV estimate of the blend.
folds = StratifiedKFold(n_splits = 5, random_state = 228, shuffle = True)

predictions = np.zeros(len(test))
oof = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = VotingClassifier(
            estimators = [
                ('xgb', xgb_model),
                ('cb', cb_model),
                ('lgbm', lgbm_model)       
            ],
            voting = 'soft',
            weights = [0.3, 0.3, 0.4],
            n_jobs = -1
        )
   
    # NOTE(review): unlike the solo runs, no eval_set / early stopping is passed here,
    # so each estimator trains for its full iteration budget - confirm this is intended.
    model.fit(X_train, y_train)
    
    oof[val_idx] = model.predict_proba(X_val)[:,1]
    print(f"Fold {fold+1} | AUC: {roc_auc_score(y_val, oof[val_idx])}")
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits
    
print(f"OOF AUC: {roc_auc_score(y, oof)}")

ss['claim'] = predictions

In [None]:
# Write the submission frame to disk without the index; the bare `ss` on the last
# line displays the frame as the cell output.
# NOTE(review): the output name has no .csv extension - confirm the submission step expects that.
ss.to_csv('voting', index = False)
ss