# Welcome to the third tabular playground of 2021

In this notebook I run a quick exploratory data analysis and then tune LightGBM hyperparameters with Optuna. Finally, I use a small trick that earned me 15th place in the previous Tabular Playground.

# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold

from sklearn.metrics import roc_auc_score
import optuna

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn after this point
plt.style.use('fivethirtyeight')

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2021/train.csv',
        '../input/tabular-playground-series-mar-2021/test.csv',
    )
)

# Exploratory Data Analysis

In [None]:
# Report the dimensions of both tables. The "features" figure is the raw column
# count of each frame, i.e. it is not restricted to model inputs.
n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape

print(f'Number of rows in training set: {n_train_rows}')
print(f'Number of features in training set: {n_train_cols}')
print(f'Number of rows in test set: {n_test_rows}')
print(f'Number of features in test set: {n_test_cols}')

There are more columns than in the previous tabular playground.

In [None]:
# Display every column name so the categorical (cat*) and continuous (cont*)
# variables can be counted before their name lists are built
train.columns

In [None]:
# Build the name lists of the categorical (cat0..cat18) and continuous
# (cont0..cont10) columns, then the combined feature list.
n_cat_features, n_cont_features = 19, 11

cat_var = [f'cat{i}' for i in range(n_cat_features)]
cont_var = [f'cont{i}' for i in range(n_cont_features)]
columns = [*cat_var, *cont_var]

cat7, cat8 and cat10 have very high cardinality. We will need to explore them later.

# Preprocessing

In [None]:
# Simple preprocessing: one-hot encode the categorical columns with pd.get_dummies.
# Train and test are stacked first so both end up with the same dummy columns.

# Remember where the training rows end *before* `train` is reassigned below.
n_train=len(train)

full=pd.concat([train,test],axis=0)
full=pd.get_dummies(full,columns=cat_var)

# Take independent copies: plain slices of `full` would make any later column
# assignment on `train`/`test` a chained assignment (SettingWithCopyWarning).
train=full.iloc[:n_train,:].copy()
test=full.iloc[n_train:,:].copy()

# Model inputs are every column except the row identifier and the label.
columns=[column for column in train.columns if column not in ('id','target')]

In [None]:
# Feature matrix (the columns listed in `columns`) and label vector used for modelling.
X = train.loc[:, columns]
y = train['target']

# Display (n_rows, n_features) as a quick sanity check
X.shape

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# LGBM Baseline

In [None]:
# Baseline: LightGBM with default hyperparameters, monitored on the hold-out split.
lgb = LGBMClassifier()
lgb.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    verbose=False,
)

# Keep the second predict_proba column as the score passed to ROC AUC.
holdout_proba = lgb.predict_proba(X_test)
predictions = holdout_proba[:, 1]

auc = roc_auc_score(y_test, predictions)

print(f'Baseline Score: {auc}')

# Hyperparameter tuning using Optuna

In [None]:
def objective(trial,X=X,y=y):
    """Optuna objective: hold-out ROC AUC of one LightGBM configuration.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    X : feature frame; defaults to the notebook-level `X` (already one-hot encoded).
    y : labels aligned with `X`; defaults to the notebook-level `y`.

    Returns
    -------
    float
        ROC AUC on the 20% hold-out split (the study maximises this value).
    """
    # Same seed for every trial, so all trials are scored on the same hold-out rows.
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

    # No `cat_feature` entry on purpose: the categorical columns were one-hot
    # encoded upstream, so positions 0..len(cat_var)-1 of X are no longer the
    # categorical variables. Declaring them categorical would make LightGBM treat
    # continuous columns as categories and would not match the final model,
    # which is trained without that setting.
    trial_params={
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 6, 127),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'random_state': 2021,
        'metric': 'auc',
        # Upper bound only; early stopping decides the actual number of trees.
        'n_estimators': 20000,
        'n_jobs': -1,
        'bagging_seed': 2021,
        'feature_fraction_seed': 2021,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        # The three parameters below only act on native categorical features;
        # they stay in the search space so `study.best_params` keeps the same keys.
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20)}

    model=LGBMClassifier(**trial_params)
    model.fit(X_train,y_train,eval_set=(X_test,y_test),eval_metric='auc',early_stopping_rounds=100,verbose=False)
    predictions=model.predict_proba(X_test)[:,1]

    return roc_auc_score(y_test,predictions)

In [None]:
# Time budget for the search: seven hours, expressed in seconds.
search_budget_seconds = 7 * 3600

# Higher AUC is better, so the study maximises the objective.
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=search_budget_seconds)

In [None]:
# Display the hyperparameter values of the best trial found by the study
study.best_params

In [None]:
# Hyperparameter values that are the subject of the Optuna search
# (hard-coded here so the search does not have to be re-run).
tuned_params = {
    'learning_rate': 0.00605886703283976,
    'max_depth': 42,
    'num_leaves': 108,
    'reg_alpha': 0.9140720355379223,
    'reg_lambda': 9.97396811596188,
    'colsample_bytree': 0.2629101393563821,
    'min_child_samples': 61,
    'subsample_freq': 2,
    'subsample': 0.8329687190743886,
    'max_bin': 899,
    'min_data_per_group': 73,
    'cat_smooth': 21,
    'cat_l2': 11,
}

# Settings that are not searched: seeds, evaluation metric, tree budget, parallelism.
fixed_params = {
    'random_state': 2021,
    'metric': 'auc',
    'n_estimators': 20000,
    'n_jobs': -1,
    'bagging_seed': 2021,
    'feature_fraction_seed': 2021,
}

# Final configuration passed to LGBMClassifier (tuned values first, then fixed ones).
lgb_params = {**tuned_params, **fixed_params}

# Extreme tuning strategy

In [None]:
# Per-round adjustments applied in the refinement loop:
#   f1 multiplies reg_lambda, f2 is added to reg_alpha,
#   f3 is added to num_leaves, f4 is subtracted from min_child_samples.
f1, f2 = 0.7434828307047571, 1.3786330168495677
f3, f4 = 46, 27

In [None]:
%%time

kf=StratifiedKFold(n_splits=5,random_state=48,shuffle=True)

# Test-set probabilities, accumulated as the average over the folds
preds = np.zeros(test.shape[0])
# First-round validation AUC of each fold
auc=[]

for fold,(idx_train,idx_test) in enumerate(kf.split(X,y)):

    X_train,X_test=X.iloc[idx_train],X.iloc[idx_test]
    y_train,y_test=y.iloc[idx_train],y.iloc[idx_test]

    # --------------------------------------------------------------------------------
    # Phase 1: create the pretrained model with the tuned hyperparameters
    model=LGBMClassifier(**lgb_params)
    model.fit(X_train,y_train,eval_set=(X_test,y_test),early_stopping_rounds=300,verbose=False,eval_metric='auc')

    predictions=model.predict_proba(X_test,num_iteration=model.best_iteration_)[:,1]
    auc.append(roc_auc_score(y_test,predictions))

    print('First Round:')
    # The score is a ROC AUC (higher is better), so it is labelled as such.
    print(f'AUC {auc[fold]}')

    # -----------------------------------------------------------------------------
    # Phase 2: continue training from the previous model for 17 rounds at a fixed
    # learning rate of 0.003. From the second round on, each round shrinks
    # reg_lambda (x f1), raises reg_alpha (+ f2) and num_leaves (+ f3), and
    # lowers min_child_samples (- f4, floored at 1).
    auc_tuned=[]
    params = lgb_params.copy()
    params['learning_rate']=0.003

    for t in range(1,18):

        if t>1:
            params['reg_lambda'] *= f1
            params['reg_alpha'] += f2
            params['num_leaves'] += f3
            params['min_child_samples'] -= f4

        # min_child_samples must stay a positive count
        params['min_child_samples']=max(1,params['min_child_samples'])

        # init_model=model: boosting resumes from the trees built so far
        model=LGBMClassifier(**params).fit(X_train,y_train,eval_set=(X_test,y_test),eval_metric='auc',early_stopping_rounds=200,verbose=False,init_model=model)

        predictions=model.predict_proba(X_test, num_iteration= model.best_iteration_)[:,1]
        auc_tuned.append(roc_auc_score(y_test,predictions))

        print(f'AUC tuned {t}: {auc_tuned[-1]}')

    # Gain of the last refinement round over the first-round model of this fold
    print(f'Improvement of {auc_tuned[-1]-auc[fold]}')

    # ---------------------------------------------------------------------------
    # Inference time: add this fold's share of the averaged test predictions
    preds+=model.predict_proba(test[columns],num_iteration=model.best_iteration_)[:,1]/kf.get_n_splits()

# The cross-validation cell above already fills `preds` (fold-averaged test
# probabilities) and `auc` (first-round validation AUC per fold). Running the
# identical loop a second time would only repeat the whole training and
# overwrite both with the same computation, so the scores are summarised instead.
print(f'Mean first-round AUC over {len(auc)} folds: {np.mean(auc)}')

# Making final submission

In [None]:
# Create submission file: one row per test id with the fold-averaged probability.
# A separate frame is built instead of writing into `test`, which avoids a chained
# assignment on a slice and leaves `test` intact if the cell is re-run.
submission=test[['id']].assign(target=preds)
submission.to_csv('submission.csv',index=False)

# Thanks for reading