# TPS - NOV 2021

The main objective of this notebook is my own learning: I'm implementing different techniques that I picked up in the previous TPS competitions. I cannot guarantee a high-scoring notebook, but read on if you like; it might help a few people.

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cupy as cp
import pandas as pd
import cudf
import dask_cudf

import gc #to manage ram 
import subprocess

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna

In [None]:
%%time
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')

print(train.shape)
print(test.shape)

In [None]:
# Per-column summary statistics of the training frame.
train.describe()

In [None]:
# Per-column summary statistics of the test frame, for comparison with train.
test.describe()

In [None]:
# Number of missing values in each training column.
train.isna().sum()

In [None]:
# Number of missing values in each test column.
test.isna().sum()

In [None]:
# Collapse the per-column NaN counts into one total per frame.
n_missing_train = train.isna().sum().sum()
n_missing_test = test.isna().sum().sum()
print(f'Number of missing values in training data: {n_missing_train}')
print(f'Number of missing values in testing data: {n_missing_test}')

No missing values!

In [None]:
# Model inputs: every training column except the row identifier and the label.
Features = [name for name in train.columns if name != 'id' and name != 'target']

By initial impressions the data seems to have no categorical features. Let's check if that is the case.

In [None]:
# Remove the 'id' column from both frames in place.
for frame in (train, test):
    frame.drop(columns='id', inplace=True)

In [None]:
# Heuristic type check: a feature with fewer than 25 distinct values across
# train + test combined is counted as categorical, everything else as continuous.
df = pd.concat([train[Features], test[Features]], axis=0)

# Count distinct values once per column; the original evaluated nunique() twice
# for every feature (once in each comprehension).
n_unique = df.nunique()

cat_features = [col for col in Features if n_unique[col] < 25]
cont_features = [col for col in Features if n_unique[col] >= 25]

print(f'Total number of features: {len(Features)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuos features: {len(cont_features)}')

plt.pie([len(cat_features), len(cont_features)], 
        labels=['Categorical', 'Continuos'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()

# The concatenated copy is only needed for the counts above.
del df

Yes! We don't have any categorical features.

In [None]:
# Class balance of the target: print the counts per class, then draw them as bars.
target_counts = train['target'].value_counts()
print(target_counts)
sns.countplot(data=train, x='target');

Distribution of the target value is balanced :)

In [None]:
# Temporary 10,000-row random samples of train and test, used only to plot
# feature distributions. NumPy's global RNG is seeded first so the draw repeats.
np.random.seed(2110)
tmp_train, tmp_test = train.sample(n=10000), test.sample(n=10000)

In [None]:
# Overlay the train and test kernel-density estimate of every feature,
# one subplot per feature, to spot train/test distribution shift.
print("Feature distribution of features: ")
ncols = 5
# Derive the row count from the feature list instead of hard-coding 20, so the
# grid always matches len(Features) and never indexes past its end.
nrows = int(np.ceil(len(Features) / ncols))

fig, axes = plt.subplots(nrows, ncols, figsize=(18, 50), facecolor='#EAEAF2', squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, Features):
    sns.kdeplot(x=tmp_train[col], ax=ax, label='Train data')
    sns.kdeplot(x=tmp_test[col], ax=ax, color="orange", label='Test data')
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=8, fontweight='bold')
    ax.tick_params(labelsize=5, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)

# Hide any trailing cells of the grid that have no feature to show.
for ax in flat_axes[len(Features):]:
    ax.set_visible(False)

# The curves carry labels, but the original never drew a legend; show one
# shared legend for the whole figure.
handles, labels = flat_axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=2)
plt.show()

# The samples were only needed for these plots; free the memory.
del tmp_train
del tmp_test
gc.collect()

In [None]:
# One horizontal box plot per raw feature f0..f99 on a 10x10 grid.
columns = 10
rows = 10

# Apply the seaborn settings once, *before* the figure is created. The original
# called sns.set() on every loop iteration, after the figure already existed,
# so the requested size never applied to this figure.
sns.set(rc = {'figure.figsize':(30,20)})
fig, ax_array = plt.subplots(rows, columns, squeeze=False, figsize=(30, 20))

# enumerate() over the flattened grid replaces the hand-maintained counter.
for f, ax in enumerate(ax_array.ravel()):
    col = 'f'+str(f)
    # Pass the data as keyword x= so the call does not depend on the order of
    # seaborn's positional parameters (NOTE(review): newer seaborn releases
    # appear to treat the first positional argument as `data` - confirm).
    g2 = sns.boxplot(x=train[col], ax=ax)
    ax.set_title(col)
    # Strip tick labels and the y-label: 100 tiny panels leave no room for them.
    g2.set(ylabel=None)
    g2.set(xticklabels=[])
    g2.set(yticklabels=[])
plt.show()

In [None]:
# Row-wise summary statistics over the base features, added as new columns to
# both train and test and registered in `Features`.
row_stat_names = ['mean', 'std', 'min', 'max', 'sum']

# Always aggregate over the base features only. Without this, re-running the
# cell would fold the previously derived columns into the new statistics and
# append duplicate names to `Features`.
base_features = [col for col in Features if col not in row_stat_names]

for frame in (train, test):
    base_values = frame[base_features]
    frame["mean"] = base_values.mean(axis=1)
    frame["std"] = base_values.std(axis=1)
    frame["min"] = base_values.min(axis=1)
    frame["max"] = base_values.max(axis=1)
    frame["sum"] = base_values.sum(axis=1)

# Register only the names that are not in the feature list yet.
Features.extend(name for name in row_stat_names if name not in Features)

gc.collect()

In [None]:
# Print the (rows, columns) of both frames.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Pairwise correlations between every feature and the target.
corr = train[Features+['target']].corr()

# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig,ax=plt.subplots(figsize=(20,20))
# Draw on the explicit axes rather than relying on the implicit current axes.
sns.heatmap(corr,mask=mask,cmap='tab20c',linewidth=0.1,ax=ax)
# Style the tick labels *after* plotting: heatmap installs its own ticks and
# labels, so labels set on the empty axes beforehand (as the original did)
# are overwritten.
ax.tick_params(axis='both', labelsize=12)
ax.set_title('Correlation Map',color='blue',fontsize=12)
plt.show()

In [None]:
# Separate the label from the feature matrix: `y` holds the target column and
# `train` is rebound to a copy without it.
train, y = train.drop(columns='target'), train['target']

gc.collect()

Adding these features improved the score. We could add more features by performing clustering and PCA (next time, maybe).

Scaling is important!

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply the same fitted
# transform to both train and test.
scaler = RobustScaler().fit(train[Features])
train[Features] = scaler.transform(train[Features])
test[Features] = scaler.transform(test[Features])

In [None]:
# Baseline CatBoost configuration, unpacked into CatBoostClassifier(**cat_params)
# by the cross-validation loop.
cat_params = dict(
    objective="CrossEntropy",
    eval_metric="AUC",
    task_type="GPU",
    grow_policy="SymmetricTree",
    learning_rate=0.08,
    n_estimators=10_000,
    random_strength=1.0,
    max_bin=128,
    l2_leaf_reg=0.002550319996478972,
    max_depth=4,
    min_data_in_leaf=193,
    verbose=0,
)

The hyper-parameters above are borrowed from the CatBoost model of the previous TPS (where they were tuned using Optuna).

Let's train our Catboost model.

In [None]:
# 5-fold cross-validation of the baseline CatBoost configuration (`cat_params`).
# This loop only evaluates the model: it prints the validation ROC-AUC of every
# fold and the mean over folds. It does not predict on the test set, so the
# unused `predictions` buffer the original allocated here has been removed.
folds = KFold(n_splits = 5, random_state = 2021, shuffle = True)

fold_scores = []

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    
    print(f"Fold: {fold}")
    
    # X_test / y_test are the held-out validation part of this fold,
    # not the competition test set.
    X_train, X_test = train.iloc[trn_idx], train.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**cat_params)
    
    # The validation fold doubles as eval_set for early stopping.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=400, verbose=False)
    
    pred = model.predict_proba(X_test)[:,1]
    roc = roc_auc_score(y_test, pred)
    fold_scores.append(roc)
    print(f" roc_auc_score: {roc}")
    print("-"*50)

# One summary number for the whole configuration.
print(f"Mean roc_auc_score: {np.mean(fold_scores)} (std: {np.std(fold_scores)})")

In [None]:
# LightGBM didn't work.
# Everything below is a single triple-quoted string literal, kept only as a
# record of the attempt: the cell evaluates the string and none of the code
# inside it runs. The text is not runnable as written (it refers to names such
# as `trial`, `lgb_train`, `LightGBM`, `FEATS` and `ROC_curve` that this
# notebook never defines, and it has free text after two closing braces).
"""import lightgbm as lgb
lgb_params = {
    'objective': 'binary',
    'n_estimators': 20000,
    'random_state': 42,
    'learning_rate': 8e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
    'device': 'gpu',
}used these parameters to get an initial score
params = {
        'objective': 'binary',
        'metric': 'AUC',
        'boosting_type': 'dart', # To improve AUC
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.1, 0.9),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 0.9),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': 0
    }range for optuna to search

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'AUC',
        'boosting_type': 'dart', # To improve AUC
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.1, 0.9),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 0.9),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': 0
    }
    # Learning
    gbm = LightGBM(params, lgb_train, lgb_valid, num_boost_round, verbose_eval, FEATS)
    # Prediction
    y_pred = gbm.predict(valid[FEATS], num_iteration=gbm.best_iteration)
    accuracy = roc_auc_score(y_va, y_pred, labels='ROC curve', average='weighted')
    print('ROC curve:', accuracy)
    ROC_curve(y_va, y_pred)
    # Finish
    print("Operation completed.")
    # Output
    return accuracy"""

I am using Optuna for hyperparameter tuning for the first time. This article gives better insight into how Optuna is used:
https://towardsdatascience.com/hyper-parameter-optimization-with-optuna-4920d5732edf

Optuna requires us to define an objective function that returns the loss or evaluation metric to optimize. The code below is mostly reusable.

In [None]:
def fit_cat(trial, x_train, y_train, x_test, y_test):
    """Fit one CatBoost classifier with hyper-parameters sampled from an Optuna trial.

    Parameters
    ----------
    trial
        Optuna trial object; its ``suggest_int`` / ``suggest_float`` methods
        are used to sample the search space defined below.
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Hold-out features and labels. They are passed to ``fit`` as
        ``eval_set`` (early stopping) and used for the validation score.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted ``CatBoostClassifier``
        and ``log`` is a dict with the keys ``"train roc_auc"`` and
        ``"valid roc_auc"``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
    # with identical ranges; the order of the suggest calls is unchanged.
    params = {
        'iterations': trial.suggest_int("iterations", 1000, 100000),
        # NOTE(review): fit() below also passes early_stopping_rounds=150, which
        # presumably takes precedence over od_wait - confirm that tuning
        # od_wait has any effect.
        'od_wait': trial.suggest_int('od_wait', 500, 5000),
        'task_type': "GPU",
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.06),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.30, 0.33, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bootstrap_type': 'Poisson',
    }

    model = CatBoostClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=150, verbose=False)

    # Score the raw probabilities. ROC-AUC depends only on how the predictions
    # are ranked; the original clipped them to a minimum of 0.1 first, which
    # turns every prediction below 0.1 into a tie and distorts the metric.
    y_train_pred = model.predict_proba(x_train)[:, 1]
    y_test_pred = model.predict_proba(x_test)[:, 1]

    log = {
        "train roc_auc": roc_auc_score(y_train, y_train_pred),
        "valid roc_auc": roc_auc_score(y_test, y_test_pred)
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one CatBoost fit on a 70/30 hold-out.

    Splits the module-level ``train`` / ``y`` into a training part and a 30%
    hold-out part, fits a model via ``fit_cat`` with the hyper-parameters
    sampled by ``trial``, and returns the hold-out ROC-AUC (the value the
    study maximises).
    """
    # Fixed random_state: every trial is scored on the same hold-out rows, so
    # differences between trials reflect the hyper-parameters rather than the
    # luck of the split. The original re-drew an unseeded split per trial.
    x_train, x_test, y_train, y_test = train_test_split(
        train, y, test_size=0.30, random_state=2021
    )
    model, log = fit_cat(trial, x_train, y_train, x_test, y_test)

    return log['valid roc_auc']

In [None]:
# Create a study that maximises the value returned by `objective` and run
# 10 trials of it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

n_trials_run = len(study.trials)
print("Number of completed trials: {}".format(n_trials_run))

In [None]:
# Look up the best-scoring trial of the study and print it.
trial = study.best_trial
print("Best trial:")
print(trial)

As the trials are randomly initialised, different runs result in different parameters, but they are generally fairly close.

In [None]:
# Optuna's hyper-parameter importance plot for this study.
optuna.visualization.plot_param_importances(study)

Depth is the most important hyperparameter, followed by learning rate and subsample.

In [None]:
# Optuna's optimization-history plot for this study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Optuna's slice plot, restricted to the three hyper-parameters named below.
optuna.visualization.plot_slice(study, params=['depth', 'learning_rate', 'subsample'])

Lots of plots at our disposal. We can see how score is changing with respect to each parameter. Try tuning a little bit more, it might improve training time if not the final score. There are many more graphs and we can get better search space and we can use pruning to improve training time.

Let's train our model with the new hyperparameters.

In [None]:
# Hard-coded CatBoost hyper-parameters for the final model; the keys match the
# search space used in `fit_cat`.
new_params = dict(
    iterations=80203,
    od_wait=1765,
    learning_rate=0.02010888271017379,
    reg_lambda=0.3051769003766273,
    subsample=0.9155353016941578,
    random_strength=31.905377503941313,
    depth=6,
    min_data_in_leaf=14,
    leaf_estimation_iterations=7,
    task_type="GPU",
    bootstrap_type='Poisson',
)

In [None]:
# 5-fold cross-validation with `new_params`. Each fold's model is scored on its
# validation part and also predicts the competition test set; `predictions`
# ends up as the average test probability over the five fold models.
folds = KFold(n_splits=5, shuffle=True, random_state=2021)
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):

    print(f"Fold: {fold}")

    # X_test / y_test are this fold's held-out validation rows.
    X_train, y_train = train.iloc[trn_idx], y.iloc[trn_idx]
    X_test, y_test = train.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**new_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=400,
        verbose=False,
    )

    pred = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, pred)
    print(f" roc_auc_score: {roc}")
    print("-" * 50)

    # Add this fold's share of the averaged test prediction.
    test_pred = model.predict_proba(test[Features])[:, 1]
    predictions += test_pred / folds.n_splits

In [None]:
# Load the competition's sample submission file to use as the output template.
sub = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Put the fold-averaged test probabilities in the 'target' column, write the
# file without the index column, and display the frame.
sub = sub.assign(target=predictions)
sub.to_csv('submission.csv', index=False)
sub