# [Tabular Playground Series - Mar 2021 competition.](https://www.kaggle.com/c/tabular-playground-series-mar-2021)


## Import libraries & read data

In [None]:
# Install the EDA helpers used further down: pandas-profiling (development
# snapshot taken straight from the GitHub master archive) and sweetviz.
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
!pip install sweetviz


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from pandas_profiling import ProfileReport
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(folder, fname)
        print(full_path)
        
import matplotlib.pyplot as plt

import imblearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
        
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Directory from which the train / test / sample-submission CSV files are read.
input_path = Path('/kaggle/input/tabular-playground-series-mar-2021/')

In [None]:
def _read_with_preview(csv_name):
    """Read one competition CSV indexed by its 'id' column and show its head."""
    frame = pd.read_csv(input_path / csv_name, index_col='id')
    display(frame.head())
    return frame


# Load the three competition files in turn, previewing each one.
train = _read_with_preview('train.csv')

test = _read_with_preview('test.csv')

submission = _read_with_preview('sample_submission.csv')

## Categorical values

In [None]:
# Integer-encode every object-typed column. Each encoder is fitted on the
# union of train and test values so both frames share the same mapping.
object_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in object_columns:
    all_values = list(train[col].values) + list(test[col].values)
    encoder = LabelEncoder().fit(all_values)
    train[col] = encoder.transform(train[col].values)
    test[col] = encoder.transform(test[col].values)

display(train.head())

## Pull out the target, and make a validation split

In [None]:
# Separate the label from the features; after this `train` only holds the
# predictor columns (re-running the cell fails because 'target' is gone).
target = train.pop('target')

# Hold out 20% of the rows for validation. The split is stratified because the
# target is imbalanced, and seeded so the scores printed below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.80, stratify=target, random_state=42
)

## EDA

In [None]:
# Build a pandas-profiling report over `train`. The target column has already
# been popped from `train` by this point, so only the features are profiled.
prof = ProfileReport(train)
# Uncomment to render the report inline:
#prof

In [None]:
import sweetviz as sv
# Run sweetviz's automated analysis over the training features.
df_analysis=sv.analyze(train)

# Uncomment to export the analysis as an HTML page:
#df_analysis.show_html('train_analysis.html')


## Target distribution


In [None]:
# Class frequencies of the target, first as a table and then as a bar chart.
class_counts = target.value_counts()
print(class_counts)

class_counts.plot(kind="bar");

The dataset is clearly imbalanced, so this is a classification problem on an imbalanced dataset.
Two common resampling approaches for imbalanced data are:
* over-sampling: increase the number of instances in the minority class
* under-sampling: decrease the number of instances in the majority class

# Baseline with 'Simple Random Forest'

In [None]:
# Reference model: a shallow random forest scored with ROC AUC on the holdout.
clf = RandomForestClassifier(n_jobs=-1, max_depth=7, n_estimators=200).fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the positive class.
holdout_proba = clf.predict_proba(X_test)
y_pred = holdout_proba[:, 1]

score = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f'{score:0.5f}')  # the original run reported 0.87323, i.e. better than a dummy model

## Model comparison

In [None]:
from catboost import CatBoostClassifier
#!pip install --upgrade xgboost
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Candidate classifiers keyed by the name used when reporting their score.
candidates = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=7, n_jobs=-1),
    "CatBoost": CatBoostClassifier(logging_level='Silent'),
    "LGBM": LGBMClassifier(),
}
model_names = list(candidates)
models = list(candidates.values())

# Fit each candidate on the training split and record its holdout ROC AUC.
perfs = {}
for name, model in candidates.items():
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    score = roc_auc_score(y_test, y_pred)
    print(name," : ", score)
    perfs[name] = score

In [None]:
# Show the name -> holdout ROC AUC mapping gathered during the comparison.
print(perfs)

## What are the default CatBoost hyperparams ?

In [None]:
from catboost import CatBoostClassifier

# Fit CatBoost with its default settings (logging silenced) and score it.
model = CatBoostClassifier(logging_level='Silent')
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)[:, 1]  # probability of the positive class
score = roc_auc_score(y_test, y_pred)
print("Score : ", score)

# Dump every parameter the fitted model reports, defaults included.
resolved_params = model.get_all_params()
for key, value in resolved_params.items():
    print(f"    {key}: {value}")

## Hyperparameters optimization
Let's try to optimize model hyperparameters to improve the score. We will use optuna.

### Catboost & optuna

In [None]:
import optuna
import catboost as cb

X = np.array(X_train)
y = np.array(y_train)


def objective(trial):
    """Optuna objective: validation ROC AUC of one CatBoost configuration.

    Args:
        trial: the Optuna trial used to draw `objective`, `depth`,
            `boosting_type`, `bootstrap_type` and, for Bayesian bootstrap,
            `bagging_temperature`.

    Returns:
        ROC AUC of the fitted model on the validation part of the split.
    """
    # Seeded split: every trial is scored on the same validation rows, so a
    # difference in AUC reflects the hyperparameters and not a lucky split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    param = {
        # Alternative search space: ["Logloss", "CrossEntropy"]
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        # Optional extra dimensions, currently disabled:
        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        #"learning_rate": trial.suggest_float("learning_rate", 0.09, 0.11),
        "depth": trial.suggest_int("depth", 4, 8),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", [
                                "Bayesian"
                               #, "Bernoulli"
                               , "MVS"]
        ),
        # Constants applied to every trial; they are NOT part of
        # `trial.params`, so code that rebuilds the best model must add them.
        "auto_class_weights":"SqrtBalanced",
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        # Only reachable if "Bernoulli" is re-enabled in the choices above.
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict_proba(valid_x)[:, 1]
    score = roc_auc_score(valid_y, preds)
    return score


# Maximise validation AUC; stop after 50 trials or 600 seconds.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

In [None]:
# Plot how the objective value evolved over the trials recorded in `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot Optuna's estimate of each searched hyperparameter's importance.
optuna.visualization.plot_param_importances(study)

In [None]:
# Report the outcome of the CatBoost search.
print("Best trial until now:")
print('Number of finished trials:', len(study.trials))
print('Best value:', study.best_value)
print("Best trial: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

# `best_trial.params` only contains values drawn through `trial.suggest_*`.
# The constants the objective passed to every trial are added back so that
# the refitted model is configured like the models that were actually scored.
best_params = {
    **study.best_trial.params,
    "auto_class_weights": "SqrtBalanced",
    "used_ram_limit": "3gb",
}
clf = cb.CatBoostClassifier(**best_params)

# Refit on a fresh 70/30 split of the training data; the 30% part is only
# used as the evaluation set for early stopping.
train_x, valid_x, train_y, valid_y = train_test_split(np.array(X_train),np.array(y_train), test_size=0.3)

clf.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=1000)

# Score on the holdout split, then write test-set probabilities.
y_pred = clf.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
score = roc_auc_score(y_test, y_pred)
print(f'{score:0.5f}')
submission['target'] = clf.predict_proba(test)[:, 1]
submission.to_csv('catboost_optuna.csv')

In [None]:
%%time
from sklearn.model_selection import KFold
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)
X = np.array(X_train)
y = np.array(y_train)

predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    
    X_train_kf, X_val_kf = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_train_kf, y_val_kf = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    model = CatBoostClassifier(**study.best_trial.params)
   
    model.fit(X_train_kf, y_train_kf, eval_set = [(X_val_kf, y_val_kf)], verbose = False, early_stopping_rounds = 222)
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

In [None]:
# Index the averaged predictions by the test ids instead of storing 'id' as a
# column beside a fresh 0..n-1 index. The CSV written here is unchanged
# (columns: id,target), and later `submission.to_csv(...)` calls that do not
# pass index=False no longer emit a spurious unnamed index column.
submission = pd.DataFrame({'target': predictions}, index=test.index)
submission.to_csv('submission_optuna_catboost_kfold.csv', index_label='id')

In [None]:
# Overlay the predicted-probability distributions of the two true classes.
# `y_pred` / `y_test` are whatever holdout predictions and labels were set last.
plt.figure(figsize=(8, 4))
for class_value, legend_label in ((0, 'neg class'), (1, 'pos class')):
    class_rows = np.where(y_test == class_value)
    plt.hist(y_pred[class_rows], bins=100, alpha=0.75, label=legend_label)
plt.legend()
plt.show()

### LGBM & optuna

In [None]:
import optuna
from lightgbm import LGBMClassifier

X = np.array(X_train)
y = np.array(y_train)

def objective(trial):
    """Optuna objective: validation ROC AUC of one LightGBM configuration.

    Args:
        trial: the Optuna trial used to draw the regularisation, tree-shape,
            learning-rate, column-sampling and estimator-count values.

    Returns:
        ROC AUC of the fitted model on the fixed 20% validation split.
    """
    # Seeded split, so every trial is evaluated on the same rows.
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # Values explored by the search.
    searched = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.005, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
    }
    # Constants shared by every trial (not recorded in `trial.params`).
    constant = {
        'random_state': 42,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        'device': 'gpu',
    }

    booster = LGBMClassifier(**searched, **constant)
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit keywords are
    # version-dependent in LightGBM (newer releases expect callbacks instead)
    # - confirm against the installed version.
    booster.fit(fit_x, fit_y, eval_set = [(holdout_x, holdout_y)], early_stopping_rounds = 222, verbose = False)
    holdout_proba = booster.predict_proba(holdout_x)[:, 1]
    return roc_auc_score(holdout_y, holdout_proba)


# Maximise validation AUC; stop after 50 trials or 600 seconds.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

In [None]:
# Plot how the objective value evolved over the trials recorded in `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot Optuna's estimate of each searched hyperparameter's importance.
optuna.visualization.plot_param_importances(study)

In [None]:
# Report the outcome of the LightGBM search.
print("Best trial until now:")
print('Number of finished trials:', len(study.trials))
print('Best value:', study.best_value)
print("Best trial: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

# `best_trial.params` only contains values drawn through `trial.suggest_*`.
# The constants the objective passed to every trial are added back so the
# refitted model (seed, booster, early-stopping metric, device) matches the
# configuration that was actually scored during the search.
best_params = {
    **study.best_trial.params,
    'random_state': 42,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
    'device': 'gpu',
}
clf = LGBMClassifier(**best_params)

# Refit on a fresh 70/30 split of the training data; the 30% part is only
# used as the evaluation set for early stopping.
train_x, valid_x, train_y, valid_y = train_test_split(np.array(X_train),np.array(y_train), test_size=0.3)

clf.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=1000)

y_pred = clf.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
score = roc_auc_score(y_test, y_pred)
print(f'{score:0.5f}')

# Build the submission from the test ids rather than reusing whatever
# `submission` currently is: an earlier cell rebinds it to a frame with a
# 0..n-1 index, which `to_csv` would write out as an extra unnamed column.
submission = pd.DataFrame({'target': clf.predict_proba(test)[:, 1]}, index=test.index)
submission.to_csv('lgbm_optuna.csv', index_label='id')

In [None]:
%%time
from sklearn.model_selection import KFold
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)
X = np.array(X_train)
y = np.array(y_train)

predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    
    X_train_kf, X_val_kf = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_train_kf, y_val_kf = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    model = LGBMClassifier(**study.best_trial.params)
   
    model.fit(X_train_kf, y_train_kf, eval_set = [(X_val_kf, y_val_kf)], verbose = False, early_stopping_rounds = 222)
    
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

In [None]:
# Index the averaged predictions by the test ids instead of storing 'id' as a
# column beside a fresh 0..n-1 index. The CSV written here is unchanged
# (columns: id,target), and later `submission.to_csv(...)` calls that do not
# pass index=False no longer emit a spurious unnamed index column.
submission = pd.DataFrame({'target': predictions}, index=test.index)
submission.to_csv('submission_optuna_lightgbm_kfold.csv', index_label='id')

In [None]:
# Overlay the predicted-probability distributions of the two true classes.
# `y_pred` / `y_test` are whatever holdout predictions and labels were set last.
plt.figure(figsize=(8, 4))
for class_value, legend_label in ((0, 'neg class'), (1, 'pos class')):
    class_rows = np.where(y_test == class_value)
    plt.hist(y_pred[class_rows], bins=100, alpha=0.75, label=legend_label)
plt.legend()
plt.show()

## Over sampling

In [None]:
# Random over-sampling restricted to the minority class of the training split.
oversample = RandomOverSampler(sampling_strategy='minority')
# Resample, then report the class counts as text and as a bar chart.
X_over, y_over = oversample.fit_resample(X_train, y_train)
over_counts = Counter(y_over)
print(over_counts)

y_over.value_counts().plot(kind="bar");

In [None]:
# Train on 60% of the over-sampled rows. The other 40% (X_test_over /
# y_test_over) is never used below; scoring is done on the original holdout.
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over, train_size=0.60)
# Earlier alternative model, kept for reference:
#clf = RandomForestClassifier(n_estimators=200, max_depth=7, n_jobs=-1)
# NOTE(review): when the notebook runs top to bottom, `study` was last assigned
# by the LightGBM search, so these are LightGBM hyperparameters handed to
# CatBoost - confirm, or keep the CatBoost study under its own name.
clf=CatBoostClassifier(**study.best_trial.params)
clf.fit(X_train_over, y_train_over)
y_pred = clf.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
score = roc_auc_score(y_test, y_pred)
print(f'{score:0.5f}') # holdout ROC AUC of the model trained on over-sampled data
# NOTE(review): if `submission` carries a default 0..n-1 index at this point,
# this to_csv call (no index=False) writes an extra unnamed column - confirm.
submission['target'] = clf.predict_proba(test)[:, 1]
submission.to_csv('catboost_optuna_oversampling.csv')

## Under sampling

In [None]:
# Random under-sampling restricted to the majority class of the training split.
undersample = RandomUnderSampler(sampling_strategy='majority')
# Resample, then report the class counts as text and as a bar chart.
X_under, y_under = undersample.fit_resample(X_train, y_train)
under_counts = Counter(y_under)
print(under_counts)

y_under.value_counts().plot(kind="bar");

In [None]:
# Train on 60% of the under-sampled rows. The other 40% (X_test_under /
# y_test_under) is never used below; scoring is done on the original holdout.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_under, y_under, train_size=0.60)
# Earlier alternative model, kept for reference:
#clf = RandomForestClassifier(n_estimators=200, max_depth=7, n_jobs=-1)
# NOTE(review): when the notebook runs top to bottom, `study` was last assigned
# by the LightGBM search, so these are LightGBM hyperparameters handed to
# CatBoost - confirm, or keep the CatBoost study under its own name.
clf=CatBoostClassifier(**study.best_trial.params)

clf.fit(X_train_under, y_train_under)
y_pred = clf.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
score = roc_auc_score(y_test, y_pred)
print(f'{score:0.5f}') # holdout ROC AUC of the model trained on under-sampled data
# NOTE(review): if `submission` carries a default 0..n-1 index at this point,
# this to_csv call (no index=False) writes an extra unnamed column - confirm.
submission['target'] = clf.predict_proba(test)[:, 1]
submission.to_csv('catboost_optuna_undersampling.csv')

## SMOTE : TODO
See : https://medium.com/swlh/using-synthetic-data-for-imbalanced-classes-in-a-classification-model-83dfd3ab453c
