In [None]:
%%time

import os, psutil
import gc

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import itertools

from sklearn.model_selection import cross_validate,cross_val_score,train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score,make_scorer, precision_score, recall_score,f1_score, roc_curve,auc
from sklearn import ensemble,metrics,model_selection,neighbors,preprocessing, svm, tree
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.integration import LightGBMPruningCallback

import scikitplot.metrics as skplot
import datatable as dt

from sklearn.cluster import KMeans
from pickle import *

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Helper functions

In [None]:
def cpu_usage():
    """Report the memory footprint of the current Python process.

    Despite its name, this helper reports memory, not CPU: it takes the first
    field of ``psutil.Process(...).memory_info()`` for this process, divides
    it by 2**30 and rounds to two decimals.

    Returns:
        str: A message of the form ``'Memory Usage : <value>'``.
    """
    current_process = psutil.Process(os.getpid())
    mem_first_field = current_process.memory_info()[0]
    scaled = round(mem_first_field / 2**30, 2)
    return f'Memory Usage : {scaled}'

In [None]:
# function to reduce data memory size
def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place to the smallest dtype that fits.

    Integer columns are tried against int8 -> int16 -> int32 -> int64 and
    float columns against float16 -> float32, falling back to float64. A dtype
    is accepted when the column's min and max both lie inside its range
    (limits included). Columns whose dtype is not one of the listed numeric
    dtypes (e.g. bool, object) are left untouched.

    Caveat: the float check only looks at the value *range*, so downcasting to
    float16/float32 can lose precision.

    Args:
        df (pd.DataFrame): Frame to shrink. It is modified in place.

    Returns:
        None. The before/after sizes (in MiB), the percentage saved and the
        string returned by ``cpu_usage()`` are printed.
    """
    start_mem = df.memory_usage().sum()/1024**2
    numerics = ['int8', 'int16', 'int32','int64', 'float16','float32','float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            limits = [(candidate, np.iinfo(candidate)) for candidate in int_candidates]
            chosen = None  # no candidate fits (e.g. empty column) -> leave the column as is
        else:
            limits = [(candidate, np.finfo(candidate)) for candidate in float_candidates]
            chosen = np.float64  # also covers all-NaN / infinite columns

        for candidate, info in limits:
            # Inclusive bounds: a value equal to the dtype's limit is still representable.
            if c_min >= info.min and c_max <= info.max:
                chosen = candidate
                break

        if chosen is not None:
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum()/1024**2
    # Guard against a zero-sized frame so the percentage never divides by zero.
    reduction = ((start_mem - end_mem)/start_mem)*100 if start_mem else 0.0
    print(f'Memory reduced from {round(start_mem,2)} -> {round(end_mem,2)}.\nReduction in memory size by {round(reduction,2)}%')
    # cpu_usage() only returns its message, so it has to be printed to be seen.
    print(cpu_usage())

In [None]:
def get_data_info():
    """Print a short summary of the global ``train`` and ``test`` frames.

    For each frame the row/column counts and the number of columns per dtype
    are printed. For ``train`` the relative frequency of every ``target``
    class is printed as well, most frequent class first.

    Reads the module-level names ``train`` and ``test``; returns None.
    """
    print(str.center(' Train Info ',40, '-'))
    print(f'Rows : {train.shape[0]}, Columns : {train.shape[1]}')
    dtype_counts = train.dtypes.value_counts()
    print([f'{dtype} : {count}' for dtype, count in dtype_counts.items()])
    # Iterate label/share pairs together so each share is printed next to its
    # own class (mixing positional labels with label-based lookups can swap them).
    target_share = train['target'].value_counts(normalize = True)
    for label, share in target_share.items():
        print(f'Target {label} : {share*100}%')

    print(str.center(' Test Info ',40, '-'))
    print(f'Rows : {test.shape[0]}, Columns : {test.shape[1]}')
    dtype_counts = test.dtypes.value_counts()
    print([f'{dtype} : {count}' for dtype, count in dtype_counts.items()])

## Get data and compress 

In [None]:
# Columns dropped from both the train and the test frame right after loading.
cols_to_remove = ['id']

In [None]:
%%time
# Read the competition CSVs with datatable's fread and convert them to pandas DataFrames.
train = dt.fread('../input/tabular-playground-series-nov-2021/train.csv').to_pandas()
test = dt.fread('../input/tabular-playground-series-nov-2021/test.csv').to_pandas()

In [None]:
# Rebind instead of dropping in place, and tolerate already-removed columns,
# so re-running this cell does not fail with a KeyError.
train = train.drop(columns = cols_to_remove, errors = 'ignore')
test = test.drop(columns = cols_to_remove, errors = 'ignore')

In [None]:
# Downcast the numeric columns of both frames in place; the helper prints the before/after sizes.
reduce_memory_usage(train)
reduce_memory_usage(test)

In [None]:
# Cast the target to int and then to object dtype; it is cast back to int before modelling.
# NOTE(review): presumably object dtype is meant to mark the target as categorical for the plots below — confirm.
train['target'] = train['target'].astype(int).astype(object)

In [None]:
# Boolean columns are treated as categorical, floating-point columns as continuous.
cat_cols = test.select_dtypes(include = bool).columns
# Select every float width: reduce_memory_usage leaves a column at float32/float64
# when its range does not fit float16, and such columns must not be silently dropped.
cont_cols = test.select_dtypes(include = np.floating).columns

In [None]:
# NOTE(review): the same sample is rebuilt with identical arguments in the baseline-model cell
# before `train_sub` is first read, so this line looks redundant — confirm before removing.
train_sub = train.sample(frac = .20, random_state = 42).reset_index(drop = True) # getting a sample of training data to run the models faster

## Checking distributions 

### Train and test distributions

In [None]:
# checking train and test distributions
n_plot_cols = 4
# add_subplot needs integer grid sizes; np.ceil alone returns a float.
n_plot_rows = int(np.ceil(len(cont_cols)/n_plot_cols))
fig = plt.figure(figsize = (20,40))
for idx, col in enumerate(cont_cols):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, idx+1)
    sns.kdeplot(train[col], fill = True, label = 'train', ax = ax)
    sns.kdeplot(test[col], fill = True, label = 'test', ax = ax)
    ax.legend()  # without a legend the two curves cannot be told apart
fig.tight_layout()

### Target distributions

In [None]:
# checking train data with target distributions
n_plot_cols = 4
# add_subplot needs integer grid sizes; np.ceil alone returns a float.
n_plot_rows = int(np.ceil(len(cont_cols)/n_plot_cols))
fig = plt.figure(figsize = (20,40))
for idx, col in enumerate(cont_cols):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, idx+1)
    sns.kdeplot(train[train['target'] == 1][col], fill = True, label = 'target = 1', ax = ax)
    sns.kdeplot(train[train['target'] == 0][col], fill = True, label = 'target = 0', ax = ax)
    ax.legend()  # without a legend the two classes cannot be told apart
fig.tight_layout()

- Features f34, f43, f55, f8, f91, etc. separate the two target classes to some extent.
- We expect to see these variables among the top features of the models as well.

## Baseline model fitting

- Building different models and evaluating the performance and fit timings

In [None]:
# The target was cast to object dtype earlier; the models need it as int again.
train['target'] = train['target'].astype(int)
train_sub = train.sample(frac = .20, random_state = 42).reset_index(drop = True) # getting a sample of training data to run the models faster
X_train, X_test, y_train, y_test = train_test_split(train_sub.drop(columns = 'target'), train_sub['target'],
                                                    train_size = 0.8, test_size = 0.2,
                                                    random_state = 42, stratify = train_sub['target'])
# logreg = LogisticRegression()
rf = RandomForestClassifier(random_state = 42)  # seeded so the baseline is reproducible
xgb = XGBClassifier()
lgbm = LGBMClassifier()
cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent')

model_list = [rf, xgb, lgbm, cb]

# 'roc_auc' is scikit-learn's built-in scorer, which ranks by predicted
# probability / decision value. make_scorer(roc_auc_score) would feed it hard
# 0/1 predictions and collapse the ROC curve to a single point.
scoring = {'auc_score' : 'roc_auc',
           'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}

In [None]:
# fit list of models and return the metrics in a dataframe
def model_fit(model_list, cv = 3):
    """Fit each model on the global split and collect its metrics in one table.

    For every model this (1) fits it on ``X_train``/``y_train`` and scores the
    hold-out ``X_test``/``y_test`` by ROC AUC, and (2) runs a stratified
    ``cv``-fold cross-validation on ``X_train`` with the global ``scoring``
    dict. ``cross_validate`` works on clones, so each model in ``model_list``
    stays fitted on the full ``X_train`` afterwards.

    Args:
        model_list: Iterable of classifiers exposing ``fit`` and ``predict_proba``.
        cv (int): Number of stratified folds.

    Returns:
        pd.DataFrame: One column per model (named after its class, so two
        models of the same class overwrite each other) and these rows:
        ``auc_score_train`` (mean cross-validated AUC — not an AUC on the
        training rows, despite the label), ``auc_score_test`` (hold-out AUC),
        ``fit_time``, ``precision``, ``recall`` and ``f1_score`` (CV means).
    """
    results = pd.DataFrame(index = ['auc_score_train','auc_score_test','fit_time','precision', 'recall', 'f1_score'])
    # random_state only takes effect (and is only accepted by current
    # scikit-learn) when shuffle=True.
    splitter = StratifiedKFold(n_splits = cv, shuffle = True, random_state = 42)
    for model in model_list:
        model.fit(X_train,y_train)
        # ROC AUC needs a ranking score, so use the positive-class probability
        # rather than the hard 0/1 prediction.
        holdout_scores = model.predict_proba(X_test)[:, 1]
        roc_auc_score_test = roc_auc_score(y_test, holdout_scores)

        cv_score = cross_validate(model, X_train, y_train,
                                  cv = splitter,
                                  scoring = scoring, verbose = 2)

        results[model.__class__.__name__] = [
            cv_score['test_auc_score'].mean(),
            roc_auc_score_test,
            cv_score['fit_time'].mean(),
            cv_score['test_precision'].mean(),
            cv_score['test_recall'].mean(),
            cv_score['test_f1_score'].mean(),
        ]
        print(f'-----------------{model.__class__.__name__} Fitted -----------------')
    return results

In [None]:
# print important features from all models in the list
def model_feat_imp(X_train = None, model_list = None, top_features = 10):
    """Plot and print the most important features of every fitted model.

    One horizontal bar chart is drawn per model, showing its ``top_features``
    highest ``feature_importances_``. The union of those features across all
    models is printed afterwards, in order of first appearance.

    Args:
        X_train (pd.DataFrame, optional): Frame whose columns name the
            features, in the order the models were fitted on. Defaults to the
            module-level ``X_train`` as it is at call time.
        model_list (list, optional): Fitted models exposing
            ``feature_importances_``. Defaults to the module-level
            ``model_list`` as it is at call time.
        top_features (int): Number of features shown per model.

    Raises:
        ValueError: If ``model_list`` is empty.

    Returns:
        None.
    """
    # Resolve the defaults when called, not when defined, so a later re-split
    # of the data is picked up instead of a stale snapshot.
    if X_train is None:
        X_train = globals()['X_train']
    if model_list is None:
        model_list = globals()['model_list']

    all_feat_imp = {'columns': X_train.columns}
    for model in model_list:
        all_feat_imp[model.__class__.__name__] = model.feature_importances_
    all_feat_imp_df = pd.DataFrame(all_feat_imp)

    model_names = [name for name in all_feat_imp_df.columns if name != 'columns']
    if not model_names:
        raise ValueError('model_list must contain at least one fitted model')

    # One axis per model; squeeze=False keeps `axes` two-dimensional even for a single model.
    fig, axes = plt.subplots(1, len(model_names), figsize = (20,10), squeeze = False)
    imp_columns = []
    for ax, name in zip(axes[0], model_names):
        top = all_feat_imp_df[['columns', name]].sort_values(by = name, ascending = False)[:top_features]
        imp_columns.append(list(top['columns'].values))
        sns.barplot(y = 'columns', x = name, data = top, palette='Oranges_r', ax = ax)
    fig.suptitle('Feature importance across all models',fontsize = 15)

    # dict.fromkeys de-duplicates while keeping a stable order.
    unique_features = list(dict.fromkeys(itertools.chain.from_iterable(imp_columns)))
    print(f'Top features across all models are : {unique_features}')
    print('--'*70)

In [None]:
%%time
# Fit and cross-validate every baseline model, then display the metrics table.
all_models_results = model_fit(model_list, cv = 3)
all_models_results

- CatBoost performs best, slightly ahead of even XGBoost
- Parameter tuning is therefore run on the CatBoost model

In [None]:
# Plot each fitted baseline model's top features and print their union.
model_feat_imp()

## Feature engineering with the top 10 important features

In [None]:
def create_cluster_features(imp_features = ['f50', 'f41', 'f97', 'f91', 'f27', 'f43', 'f34', 'f8', 'f80', 'f55', 'f71'],
                           n_clusters = 12,
                           train = train,
                           test = test):
    """Append KMeans cluster-distance features to the train and test frames.

    A KMeans model is fitted on ``train[imp_features]``. The output of
    ``fit_transform`` / ``transform`` (one value per cluster for every row)
    is added as ``n_clusters`` new columns named ``f100``, ``f101``, ...

    Args:
        imp_features (list[str]): Columns the clustering is computed on. The
            default list is only read, never modified.
        n_clusters (int): Number of clusters, i.e. number of added columns.
        train (pd.DataFrame): Frame KMeans is fitted on. The default is the
            ``train`` object that existed when this function was defined.
        test (pd.DataFrame): Frame that is only transformed. Same note on the default.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: New train and test frames with the
        cluster columns appended; the inputs are not modified.
    """
    cluster_cols = [f"f{i+100}" for i in range(n_clusters)]
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", max_iter=500, random_state=42)

    # Fit on train only. Reuse the frame's own index so concat lines the new
    # columns up row by row even when the index is not the default 0..n-1.
    train_distances = kmeans.fit_transform(train[imp_features])
    imp_cluster_values_train = pd.DataFrame(train_distances, columns = cluster_cols, index = train.index)
    train = pd.concat([train, imp_cluster_values_train], axis = 1)

    # Test is transformed with the clusters learned from train.
    test_distances = kmeans.transform(test[imp_features])
    imp_cluster_values_test = pd.DataFrame(test_distances, columns = cluster_cols, index = test.index)
    test = pd.concat([test, imp_cluster_values_test], axis = 1)
    return train, test

In [None]:
# Replace both frames with the versions that carry the KMeans cluster columns (default arguments are used).
train, test = create_cluster_features()

In [None]:
%%time
# Rebuild the sample and the split from the augmented `train` first. Otherwise
# the models are re-fitted on the split made before the cluster features
# existed and this run would just repeat the baseline.
train_sub = train.sample(frac = .20, random_state = 42).reset_index(drop = True)
X_train, X_test, y_train, y_test = train_test_split(train_sub.drop(columns = 'target'), train_sub['target'],
                                                    train_size = 0.8, test_size = 0.2,
                                                    random_state = 42, stratify = train_sub['target'])
all_models_results = model_fit(model_list, cv = 3)
# Pass the new split explicitly so the feature names match the refitted models.
model_feat_imp(X_train = X_train)
all_models_results

### Catboost Parameter tuning with Optuna

In [None]:
def objective(trial, X = train_sub.drop(columns = 'target', axis = 1), y = train_sub['target'].astype(int)):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoost model.

    Hyperparameters are sampled from ``trial``. A GPU CatBoost classifier is
    then trained on every fold of a shuffled stratified K-fold split (default
    number of folds) and scored on the held-out fold by ROC AUC of the
    positive-class probability.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
        X (pd.DataFrame): Feature frame. The default is computed from
            ``train_sub`` once, when this function is defined.
        y (array-like): Binary target aligned with ``X`` by position. Same
            note on the default.

    Returns:
        float: Mean AUC over the folds (to be maximised by the study).
    """
    # 'learning_rate' used to be listed twice with different ranges; each
    # parameter is now suggested exactly once, with the first range.
    param_grid = {
        'iterations' : trial.suggest_int('iterations', 50, 300),
        'depth' : trial.suggest_int('depth', 4, 10),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }

    cv = StratifiedKFold(shuffle= True, random_state= 42)
    # The splitter yields positions, so index the target positionally whatever its index is.
    y_values = np.asarray(y)
    cv_scores = []  # grows with the splitter's actual number of folds

    for train_idx, test_idx in cv.split(X, y_values):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]

        model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=42,
        border_count=64,
        verbose= False,
        **param_grid)

        model.fit(X_train, y_train)

        # AUC is a ranking metric: score probabilities, not hard 0/1 labels.
        fold_scores = model.predict_proba(X_test)[:, 1]
        cv_scores.append(roc_auc_score(y_test, fold_scores))

    return np.mean(cv_scores)

In [None]:
# %%time
# from optuna.samplers import TPESampler
# import multiprocessing
# study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed= 42))
# study.optimize(objective, n_trials = 100)

In [None]:
# from pickle import *
# pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
# print('CatBoost Hyperparameter:', study.best_trial.params)

In [None]:
# Best CatBoost hyperparameters, chosen from 100 Optuna trials. The study cell above is
# commented out, so the values are hard-coded here; the keys are the parameters sampled in `objective`.
cb_best_params =  {'iterations': 291, 
                          'depth': 4, 
                          'learning_rate': 0.1900531202663395, 
                          'random_strength': 16, 
                          'bagging_temperature': 2.2769339956247365, 
                          'od_type': 'IncToDec'}

In [None]:
# Final CatBoost model: same loss, metric, device and regularisation settings as in `objective`,
# combined with the tuned hyperparameters.
# NOTE(review): task_type="GPU" presumably requires a GPU runtime — confirm before running on CPU.
cb_model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=42,
        border_count=64,
        **cb_best_params)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_sub.drop(columns = 'target'), train_sub['target'],
                                                    train_size = 0.8, test_size = 0.2,
                                                    random_state = 42, stratify = train_sub['target'])
cb_model.fit(X_train,y_train)
# ROC AUC must be computed from the positive-class probability; hard 0/1
# predictions reduce the curve to a single operating point.
y_preds = cb_model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test,y_preds)

In [None]:
# making baseline submission
# Predict on exactly the columns the model was fitted on (`test` may carry
# extra engineered columns), and submit positive-class probabilities: the
# notebook evaluates with ROC AUC, which needs a ranking score, not 0/1 labels.
preds = cb_model.predict_proba(test[X_train.columns])[:, 1]
ss = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
ss['target'] = preds
ss.to_csv('./baseline_submission.csv', index=False)