In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from lightgbm import LGBMClassifier ## Add for XGBoost, CatBoost as well

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

import optuna
from optuna.samplers import TPESampler

import os
# Recursively list every file available under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition files in one pass: the training rows, the test
# rows and the sample submission template.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

### Let's have an Overview of the Data

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_train.info()
print('*****')
df_test.info()

In [None]:
# Row/column counts of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Keep every column whose name contains the letter 'f'.
# NOTE(review): this is a substring match, so any other column with an 'f' in
# its name would be selected too — presumably only the feature columns match;
# verify against the dataset schema.
features = list(filter(lambda column: 'f' in column, df_train.columns))

In [None]:
# Show the names of the columns selected as model features.
print(features)

## Observing the target distribution in the train data

In [None]:
import seaborn as sns

# Bar chart of how many training rows fall into each value of `target`.
sns.countplot(data=df_train, x='target')

## Conclusion
The number of rows with target = 0 and with target = 1 is about the same. This is one less thing to worry about :)

## Observing the distribution of data in the train and test set

In [None]:
import matplotlib.pyplot as plt

# Overlay the train and test histograms of every feature, one figure per feature.
for feature in features:
    train_values, test_values = df_train[feature], df_test[feature]

    # Give both histograms the same value range so they share identical bin
    # edges; otherwise each call picks its own edges and the overlaid bars are
    # not directly comparable.
    shared_range = (
        min(train_values.min(), test_values.min()),
        max(train_values.max(), test_values.max()),
    )

    fig, ax = plt.subplots()
    ax.hist(train_values, bins=30, range=shared_range, alpha=0.5, label='Train set')
    ax.hist(test_values, bins=30, range=shared_range, alpha=0.5, label='Test set')
    ax.set_title(feature + " Train/Test")
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

    plt.show()
    # Release the figure explicitly so a long feature list does not pile up
    # open figures in memory.
    plt.close(fig)

## Conclusion
The train and test feature distributions are similar. This is great!!!

In [None]:
# Total number of missing feature values, first for train and then for test.
print(
    *(frame[features].isna().sum().sum() for frame in (df_train, df_test)),
    sep='\n',
)

There are no missing values in the train or test data. That is one less thing to worry about.

In [None]:
# Independent copies of the model inputs, so later in-place edits (e.g. scaling)
# never touch the frames that were loaded from disk.
X = df_train.loc[:, features].copy()
y = df_train.loc[:, 'target'].copy()

x_test = df_test.loc[:, features].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same transformation to both the training and the test features.
# Note: X and x_test are overwritten in place, so re-running this cell without
# re-running the cell that builds X / x_test rescales already-scaled data.
scaler = StandardScaler().fit(X[features])
X[features] = scaler.transform(X[features])
x_test[features] = scaler.transform(x_test[features])

### Implementing LGBM Classifier

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

### Hyperparameter optimization using Optuna

def lgb_objective(trial):
    """Optuna objective for LightGBM: score one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the tunable entries of ``params``.

    Returns
    -------
    float
        ROC AUC on a fixed 30% hold-out split of the module-level ``X`` / ``y``,
        computed from the predicted probability of the second class
        (column 1 of ``predict_proba``).
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'n_estimators': trial.suggest_int("n_estimators", 64, 8192),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25, log=True),
        'num_leaves': trial.suggest_int("num_leaves", 20, 3000),
        'max_depth': trial.suggest_int("max_depth", 3, 12),
        'feature_fraction': trial.suggest_float("feature_fraction", 0.1, 1.0),
        'min_gain_to_split' : trial.suggest_int('min_gain_to_split', 0, 15),
        'min_data_in_leaf' : trial.suggest_int("min_data_in_leaf", 100, 1000),
        # suggest_float(..., log=True) is the non-deprecated spelling of
        # suggest_loguniform and samples from the same distribution.
        'lambda_l1': trial.suggest_float("lambda_l1", 1e-8, 100.0, log=True),
        'lambda_l2': trial.suggest_float("lambda_l2", 1e-8, 100.0, log=True),
        # LightGBM requires 0 < bagging_fraction <= 1, so the search range must
        # not start at 0 (Optuna's lower bound is inclusive).
        'bagging_fraction' : trial.suggest_float("bagging_fraction", 0.1, 0.8),
        'bagging_freq' : trial.suggest_int("bagging_freq", 1, 100),
        'seed': 42,
        'deterministic': True,
        'metric' : 'auc',
        'verbose':-1
    }

    # Same split for every trial (fixed random_state) so trials are comparable.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42)

    model = LGBMClassifier(**params)
    # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on
    # the installed LightGBM version — verify before upgrading the library.
    model.fit(X_train, y_train,
              eval_set = [(X_train, y_train), (X_val, y_val)],
              early_stopping_rounds = 100,
              eval_metric = 'auc',
              verbose = 0
            )

    # ROC AUC ranks examples by score, so it needs probabilities; scoring the
    # hard 0/1 labels from predict() collapses the curve to a single point.
    pred_val = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, pred_val)

In [None]:
# Seeded TPE sampler so the sequence of sampled hyperparameters is repeatable.
sampler = TPESampler(seed=42)

# Maximise the value returned by `lgb_objective` over 10 trials.
study = optuna.create_study(
    sampler=sampler,
    direction='maximize',
    study_name='lgbm_hpo',
)
study.optimize(lgb_objective, n_trials=10)

In [None]:
# `study.best_params` only holds the values Optuna sampled. The constant
# settings used inside `lgb_objective` are added back here so the final model
# is trained with the same configuration that was tuned, instead of silently
# falling back to LightGBM defaults for them.
lgbmparams = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'seed': 42,
    'deterministic': True,
    'metric': 'auc',
    'verbose': -1,
    **study.best_params,
}

In [None]:
# Stratified 5-fold cross-validation of the tuned LightGBM model.
# Produces:
#   test_preds_lgb     - one array of test-set probabilities per fold
#   best_lgb_model     - the fitted model of the best-scoring fold
#   best_roc_score_lgb - that fold's validation ROC AUC
RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

test_preds_lgb = []
mean_auc = 0

best_lgb_model = None
best_roc_score_lgb = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so select rows with .iloc.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh estimator per fold: refitting one shared object would make
    # `best_lgb_model` point at whatever fold was trained last.
    model = LGBMClassifier(**lgbmparams)
    # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on
    # the installed LightGBM version — verify before upgrading the library.
    model.fit(X_train, y_train,
              verbose = 0,
              eval_set = [(X_val, y_val)],
              eval_metric = 'auc',
              early_stopping_rounds = 100)

    # ROC AUC must be computed from scores/probabilities, not hard labels.
    y_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    mean_auc += score

    if score > best_roc_score_lgb:
        best_roc_score_lgb = score
        best_lgb_model = model

    print(f"Fold {fold}'s score: {score}")

    # Keep probabilities so that averaging across folds gives a usable score.
    test_preds_lgb.append(model.predict_proba(x_test)[:, 1])

print("==========================================")
print(f"Mean auc of all folds: {mean_auc / n_splits}")

In [None]:
# Feature importances of the best LightGBM fold model, most important first.
feature_impt = pd.DataFrame({
    'features': best_lgb_model.feature_name_,
    'importance': best_lgb_model.feature_importances_,
}).sort_values(by='importance', ascending=False)

plt.figure(figsize=(20, 25))
sns.barplot(x='importance', y='features', data=feature_impt);

In [None]:
#final_lgb = pd.DataFrame()
#final_lgb['id'] = df_test['id']
#final_lgb['target'] = np.mean(test_preds_lgb, axis = 0)

In [None]:
#final_lgb.to_csv('final_lgb', index=False)

### Implementing XGB Classifier

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

def xgb_objective(trial):
    """Optuna objective for XGBoost: score one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the tunable entries of ``params``.

    Returns
    -------
    float
        ROC AUC on a fixed 30% hold-out split of the module-level ``X`` / ``y``,
        computed from the predicted probability of the second class
        (column 1 of ``predict_proba``).
    """
    params = {
        # XGBoost's names for these settings are `booster` and `verbosity`;
        # `boosting_type` / `verbose` / `deterministic` are not XGBoost
        # parameters.
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25, log=True),
        'verbosity': 0,
        'gamma':trial.suggest_float("gamma", 1.0, 10),
        'max_depth': trial.suggest_int("max_depth", 3, 12),
        # suggest_float(..., log=True) is the non-deprecated spelling of
        # suggest_loguniform and samples from the same distribution.
        'lambda' : trial.suggest_float("lambda", 1e-8, 100.0, log=True),
        'alpha' : trial.suggest_float("alpha", 1.0, 10),
        'eval_metric' : 'auc',
        'seed': 42,
        # binary:hinge emits hard 0/1 predictions, which makes AUC meaningless;
        # binary:logistic outputs probabilities.
        'objective': 'binary:logistic',
    }
    # Same split for every trial (fixed random_state) so trials are comparable.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42)

    model = xgb.XGBClassifier(**params)
    # NOTE(review): early_stopping_rounds / eval_metric as fit() keywords depend
    # on the installed XGBoost version — verify before upgrading the library.
    model.fit(X_train, y_train,
              eval_set = [(X_train, y_train), (X_val, y_val)],
              early_stopping_rounds = 100,
              eval_metric = 'auc',
              verbose = 0
            )

    # ROC AUC ranks examples by score, so feed it probabilities, not labels.
    pred_val = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, pred_val)

In [None]:
# Seeded TPE sampler so the sequence of sampled hyperparameters is repeatable.
sampler = TPESampler(seed=42)

# Maximise the value returned by `xgb_objective` over 5 trials.
study = optuna.create_study(
    sampler=sampler,
    direction='maximize',
    study_name='xgb_hpo',
)
study.optimize(xgb_objective, n_trials=5)

In [None]:
# Best sampled hyperparameters of the XGBoost study. Only the values Optuna
# sampled are included; the constant entries of `params` in `xgb_objective`
# are not part of this dict.
xgbparams = dict(study.best_params)

In [None]:
# Show the hyperparameters selected by the XGBoost study.
print(xgbparams)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validation of the tuned XGBoost model.
# Produces:
#   test_preds_xgb     - one array of test-set probabilities per fold
#   best_xgb_model     - the fitted model of the best-scoring fold
#   best_roc_score_xgb - that fold's validation ROC AUC
RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

test_preds_xgb = []
mean_auc = 0

best_xgb_model = None
best_roc_score_xgb = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so select rows with .iloc.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh estimator per fold: refitting one shared object would make
    # `best_xgb_model` point at whatever fold was trained last.
    model = xgb.XGBClassifier(**xgbparams)
    # NOTE(review): early_stopping_rounds / eval_metric as fit() keywords depend
    # on the installed XGBoost version — verify before upgrading the library.
    model.fit(X_train, y_train,
              verbose = 0,
              eval_set = [(X_val, y_val)],
              eval_metric = 'auc',
              early_stopping_rounds = 100)

    # ROC AUC must be computed from scores/probabilities, not hard labels.
    y_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    mean_auc += score

    if score > best_roc_score_xgb:
        best_roc_score_xgb = score
        best_xgb_model = model

    print(f"Fold {fold}'s score: {score}")

    # Keep probabilities: the fold average of these becomes the submitted
    # target, and averaging hard labels would throw the ranking away.
    test_preds_xgb.append(model.predict_proba(x_test)[:, 1])

print("==========================================")
print(f"Mean auc of all folds: {mean_auc / n_splits}")

In [None]:
# Feature scores reported by the booster of the best XGBoost fold model,
# most important first. The frame is sorted once; the second in-place sort of
# the already sorted frame was redundant.
feature_impt = pd.DataFrame(
    list(best_xgb_model.get_booster().get_fscore().items()),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

plt.figure(figsize = (20,25))
sns.barplot(x=feature_impt['importance'],y=feature_impt['feature'],data=feature_impt);

In [None]:
# Submission frame: test ids paired with the per-row mean of the XGBoost
# fold predictions.
submission = pd.DataFrame({
    'id': df_test['id'],
    'target': np.mean(test_preds_xgb, axis=0),
})

In [None]:
# to_csv writes to exactly the path it is given and does not append an
# extension, so the file name must carry ".csv" itself.
# NOTE(review): presumably the competition expects "submission.csv" — confirm.
submission.to_csv('submission.csv', index=False)

### Implementing CATBoost Classifier

Uncomment the following lines to implement CATBoost model.

In [None]:
#from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score

#import catboost as catb

#def catb_objective(trial):
#    params = {
#        'iterations' : trial.suggest_int("iterations", 50,100),
#        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25),
#        'depth': trial.suggest_int("depth", 3, 12),
#        'boosting_type': 'Plain',
#        'objective': 'CrossEntropy',
#        'random_seed': 42,
#        'eval_metric' : 'AUC',
#        'bootstrap_type': 'Bernoulli',
#        'logging_level': None,
#        'verbose':-1
#    }
    
#    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42)
    
#    model = catb.CatBoostClassifier(**params)
#    model.fit(X_train, y_train,
#              eval_set = [(X_val, y_val)],
#              early_stopping_rounds = 100,
#              verbose = 0
#            )
#    pred_val = model.predict(X_val)
    
#    return roc_auc_score(y_val, pred_val)

In [None]:
#sampler = TPESampler(seed = 42)
#study = optuna.create_study(study_name = 'catb_hpo',direction = 'maximize',sampler = sampler)

#study.optimize(catb_objective, n_trials = 10)

In [None]:
#catbparams = study.best_params

In [None]:
#RANDOM_SEED = 42
#n_splits = 5
#skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

#test_preds_catb = []
#mean_auc = 0

#model = catb.CatBoostClassifier(**catbparams)
#best_catb_model = None
#best_roc_score_catb = 0

#for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
#    X_train, X_val = X.loc[train_idx], X.loc[val_idx]
#    y_train, y_val = y.loc[train_idx], y.loc[val_idx]
    
#    model.fit(X_train, y_train,
#             verbose = 0,
#             eval_set = [(X_val, y_val)],
#             early_stopping_rounds = 100)
    
#    y_pred = model.predict(X_val)
#    score = roc_auc_score(y_val, y_pred)
#    mean_auc += score
    
    
#    if score>best_roc_score_catb:
#        best_roc_score_catb = score
#        best_catb_model = model
    
#    print(f"Fold {fold}'s score: {score}")
        
#    test_preds_catb.append(model.predict(x_test))

#print("==========================================")
#print(f"Mean auc of all folds: {mean_auc / n_splits}")

In [None]:
#feature_impt=pd.DataFrame()
#feature_impt['features']=best_catb_model.feature_name_
#feature_impt['importance']=best_catb_model.feature_importances_

#feature_impt.sort_values(by=['importance'],inplace=True,ascending=False)
#plt.figure(figsize = (20,25))
#sns.barplot(y=feature_impt['features'],data=feature_impt);

In [None]:
#final_catb = pd.DataFrame()
#final_catb['id'] = df_test['id']
#final_catb['target'] = np.mean(test_preds_catb, axis = 0)