## Introduction

This notebook is a basic example of a LightGBM model and a showcase of the octopus-ml package:
[https://github.com/gershonc/octopus-ml](https://github.com/gershonc/octopus-ml)

In [None]:
# Octopus ML package - github.com/gershonc/octopus-ml
!pip install octopus-ml

In [None]:
import warnings
warnings.simplefilter("ignore")
import seaborn as sns 
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import tracemalloc
from pandas_summary import DataFrameSummary
from sklearn.metrics import classification_report
%matplotlib inline
sns.set_style("whitegrid")

# Show every column and every row, and never truncate cell text, when a frame is displayed.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# None is the supported "no limit" value; -1 was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)  # or 199

#check out https://github.com/gershonc/octopus-ml
import octopus_ml as oc

import optuna
import lightgbm as lgbm 

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold , StratifiedKFold

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
input_dir = "/kaggle/input/tabular-playground-series-mar-2021"
train_df = pd.read_csv(f"{input_dir}/train.csv")
test_df = pd.read_csv(f"{input_dir}/test.csv")

## EDA

In [None]:
# Preview the first two rows of the training frame
train_df.head(2)

In [None]:
# Report (rows, columns) for both data sets
for label, frame in (("Train set: ", train_df), ("Test set: ", test_df)):
    print(label, frame.shape)

In [None]:
# DataFrame summary from the pandas_summary package (an extension of the pandas describe method)
dfs = DataFrameSummary(train_df)
dfs.summary()

In [None]:
# The ten columns with the largest share of missing values
missing_share = 1 - train_df.count() / len(train_df)
missing_share.sort_values(ascending=False).head(10)

In [None]:
# Collect the object/category columns of the training frame, then cast each to `category`

categorical_features = [
    col for col in train_df.columns
    if train_df[col].dtype == 'object' or train_df[col].dtype.name == 'category'
]
for col in categorical_features:
    train_df[col] = train_df[col].astype('category')
print (categorical_features)

In [None]:
# Target distribution analysis: class counts (left panel) and class shares (right panel)
# Apply the style and context before the figure is created so they affect this figure.
plt.style.use('fivethirtyeight')
sns.set_context("paper", font_scale=1.2)

# A single figure with two panels; the extra plt.figure() call only produced an empty figure.
fig, ax = plt.subplots(1, 2)

# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='target', data=train_df, ax=ax[0])
# explode offsets the second wedge, so this assumes the target has exactly two classes.
train_df['target'].value_counts().plot.pie(explode=[0,0.2],autopct='%1.2f%%',ax=ax[1])
# plt.show() renders on the inline backend; fig.show() only works with GUI backends.
plt.show()

In [None]:
# Stacked histogram of cont1, coloured by target class
sns.displot(
    data=train_df,
    x='cont1',
    hue='target',
    kind='hist',
    multiple='stack',
    bins=25,
    height=4,
    aspect=1.7,
)


In [None]:
# Stacked histogram of cont2, coloured by target class
sns.displot(
    data=train_df,
    x='cont2',
    hue='target',
    kind='hist',
    multiple='stack',
    bins=25,
    height=4,
    aspect=1.7,
)


## Data pre-processing


In [None]:
# Columns that must not be fed to the model: the label itself and `id`, the row
# identifier that is written to the submission file rather than used as a predictor.
features_remove=['target', 'id']
features=[col for col in train_df.columns if col not in features_remove]
# Report the count after the exclusions so it matches what the model actually sees.
print ('Number of features ', len(features))

X=train_df[features]
y=train_df['target']

## HPO - Hyperparameter Optimization

In [None]:
data = X
target = y

def objective(trial , data = data , target = target):
    """Optuna objective: fit one LightGBM classifier and return its hold-out AUC.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature frame. Defaults to the module-level ``data`` as it was
            when this function was defined.
        target: Labels aligned with ``data``.

    Returns:
        ROC AUC of the fitted model on a fixed 10% hold-out split. The same
        split also drives early stopping, so the score is on the optimistic side.
    """
    train_x , test_x , train_y , test_y = train_test_split(data , target , \
            test_size = 0.10 , random_state = 42)

    params = {
        'reg_alpha' : trial.suggest_float('reg_alpha' , 1e-5 , 10 , log = True),
        'reg_lambda' : trial.suggest_float('reg_lambda' , 1e-5 , 10 , log = True),
        'num_leaves' : trial.suggest_int('num_leaves' , 11 , 300),
        # LightGBM requires learning_rate > 0, so the range must not start at 0.
        'learning_rate' : trial.suggest_float('learning_rate' , 1e-3 , 0.1),
        'max_depth' : trial.suggest_int('max_depth' , 5 , 20),
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 100),
        'min_child_weight' : trial.suggest_float('min_child_weight' , 1e-5 , 1 , log = True),
        # The row-sampling fraction must lie in (0, 1]; a lower bound of 0 is invalid.
        'subsample' : trial.suggest_float('subsample' , 0.1 , 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 1e-5 , 1 , log = True),
        # NOTE(review): the seed is searched like a hyper-parameter; consider fixing it instead.
        'random_state' : trial.suggest_categorical('random_state' , [0,42,2021,555]),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0),
        'metric' : 'auc',
        # NOTE(review): requires a GPU-enabled LightGBM build and a GPU runtime.
        'device_type' : 'gpu',
    }
    model = lgbm.LGBMClassifier(**params)
    # NOTE(review): `early_stopping_rounds` and `verbose` were removed from fit() in
    # LightGBM 4.0; switch to callbacks=[lgbm.early_stopping(...)] when upgrading.
    model.fit(train_x , train_y , eval_set = [(test_x , test_y)] , early_stopping_rounds = 200 , \
             verbose = False)
    preds = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y , preds)
    return auc

In [None]:
# Seed the sampler (TPE is Optuna's default) so the proposed trials are repeatable
# across runs, as far as the objective itself is deterministic.
sampler = optuna.samplers.TPESampler(seed = 42)
study = optuna.create_study(direction = 'maximize' , study_name = 'lgbm' , sampler = sampler)
study.optimize(objective , n_trials = 20)
print('numbers of the finished trials:' , len(study.trials))
print('the best params:' , study.best_trial.params)
print('the best value:' , study.best_value)

## ML modeling with Octopus-ml

In [None]:


# Hand-picked model settings.
# NOTE(review): this dict is built but never passed to oc.cv below, which receives
# study.best_trial.params instead, so these values currently have no effect.
# Confirm which parameter set is intended.
params = {'resample': None,
         'learning_rate': 0.01,
         'power': False,
         'boosting_type': 'gbdt',
         'num_leaves': 153,
         'max_depth': 14,
         'max_delta_step': 9,
         'reg_alpha': 14.206069641010822,
         'reg_lambda': 4.35151505977074,
         'colsample_bytree': 0.23599717695150987,
         'cat_smooth': 49.698724437071206,
         'cat_l2': 19
         }
# Extra settings merged into `params`; presumably lowers LightGBM's log verbosity (verify).
params_additional={'verbose': -1}
params.update(params_additional)


# Cross-validate with the best hyper-parameters found by the Optuna study.
# NOTE(review): the meaning of the positional arguments 0.5 and 1 is defined by
# oc.cv and is not visible here; check the octopus-ml documentation.
clf,arr_f1_weighted,arr_f1_macro,arr_f1_positive,prediction_folds,preds_folds,y_folds= oc.cv(X,y,0.5,1,shuffle=True,params=study.best_trial.params)


In [None]:
# Plot the per-fold F1 scores returned by oc.cv. The label now names the data set
# loaded above (Tabular Playground Series, March 2021) and fixes the spelling.
oc.cv_plot(arr_f1_weighted,arr_f1_macro,arr_f1_positive,'TPS March 2021 - Kaggle competition')


In [None]:
# Precision / recall / F1 over the out-of-fold predictions
cv_report = classification_report(y_folds, prediction_folds)
print(cv_report)


In [None]:
# ROC curve from the fold labels and the fold prediction scores returned by oc.cv
oc.roc_curve_plot(y_folds,preds_folds)


In [None]:
# Plot feature importances for the fitted model (num=30 is passed through to oc.plot_imp).
# The label previously said "Mortality", which does not match this notebook's data set.
feature_imp_list=oc.plot_imp(clf,X,'LightGBM - TPS March 2021',num=30)


In [None]:
# Rank the features by importance value and keep the first 32
ranked_importance = feature_imp_list.sort_values(by='Value', ascending=False)
top_features = ranked_importance.head(32)
top_features


In [None]:
# Correlate the top-ranked features together with the label column
list_for_correlations = top_features['Feature'].to_list() + ['target']
oc.correlations(train_df, list_for_correlations)

In [None]:
def Kaggle_submission(file_name,model,test_data,ids_list,feature_cols=None,target_col=None):
    """Predict on the test set and write a Kaggle submission CSV.

    Args:
        file_name: Path of the CSV file to write.
        model: Fitted estimator exposing ``predict``.
        test_data: Frame holding at least the feature columns. It is not modified.
        ids_list: Row identifiers written to the ``id`` column.
        feature_cols: Columns passed to the model. Defaults to the
            notebook-level ``features`` list, looked up at call time.
        target_col: Label column to ignore if present. Defaults to the
            notebook-level ``TARGET`` constant, looked up at call time.

    Returns:
        The submission frame with the columns ``id`` and ``target``.
    """
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = TARGET

    # Drop the label on a copy rather than in place, so the caller's frame is left untouched.
    model_input = test_data.drop(columns=[target_col], errors='ignore')
    test_pred=model.predict(model_input[feature_cols])
    # Quick sanity peek at the second prediction
    print (test_pred[1:2])

    submit=pd.DataFrame()
    submit['id'] = ids_list
    submit['target'] = test_pred
    submit.to_csv(file_name,index=False)
    return submit

In [None]:
# Categorical features on testset

# Cast the test frame's object/category columns to pandas `category`, the same
# conversion the training frame received.
categorical_features=[]
for c in test_df.columns:
    # Inspect the test column itself; reading the dtype from train_df only works
    # while both frames happen to share the column and its type.
    col_type = test_df[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        test_df[c] = test_df[c].astype('category')
        categorical_features.append(c)
print (categorical_features)

# Label column name read by Kaggle_submission; it must be defined before the call below.
TARGET="target"
submit=Kaggle_submission("LGBM_baseline_v15.csv",clf,test_df,test_df['id'])

In [None]:
# Preview the first ten rows of the submission frame
submit.head(10)