## **Importing Libraries**

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import PowerTransformer
import optuna
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import pickle

In [5]:
!pip3 freeze > requirements.txt

## **Loading Data**

In [None]:
df = pd.read_csv('/content/training_data_skf_no_smote.csv')
df.shape, testing_data.shape

((9864, 19), (2466, 18))

In [None]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,kfold
0,0,0.0,0,0.0,12,354.0,0.0,0.018182,0.0,0.0,May,2,7,1,2,New_Visitor,True,0,4
1,0,0.0,0,0.0,8,764.666667,0.025,0.04375,0.0,0.0,Nov,3,2,4,10,Returning_Visitor,False,0,2
2,3,157.4,0,0.0,9,128.5,0.036364,0.081818,0.0,0.0,Jul,3,2,1,3,Returning_Visitor,True,0,4
3,3,120.0,0,0.0,5,198.0,0.0,0.014286,0.0,0.0,May,3,3,4,2,New_Visitor,True,0,2
4,4,37.25,1,5.0,50,1295.008333,0.000893,0.015595,0.0,0.0,Nov,3,2,4,2,Returning_Visitor,True,1,3


## **Checking missing values and Constant Features**

In [None]:
df.isnull().sum()

id                         0
Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
kfold                      0
dtype: int64

In [None]:
df.nunique() == 1

id                         False
Administrative             False
Administrative_Duration    False
Informational              False
Informational_Duration     False
ProductRelated             False
ProductRelated_Duration    False
BounceRates                False
ExitRates                  False
PageValues                 False
SpecialDay                 False
Month                      False
OperatingSystems           False
Browser                    False
Region                     False
TrafficType                False
VisitorType                False
Weekend                    False
Revenue                    False
kfold                      False
dtype: bool

### **Data Description**

In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,9864.0,4931.5,2847.635862,0.0,2465.75,4931.5,7397.25,9863.0
Administrative,9864.0,2.301906,3.312831,0.0,0.0,1.0,4.0,27.0
Administrative_Duration,9864.0,79.932422,173.310874,0.0,0.0,7.0,91.770833,2720.5
Informational,9864.0,0.492498,1.252128,0.0,0.0,0.0,0.0,24.0
Informational_Duration,9864.0,33.723487,137.813445,0.0,0.0,0.0,0.0,2252.033333
ProductRelated,9864.0,31.724047,44.342798,0.0,7.0,18.0,38.0,705.0
ProductRelated_Duration,9864.0,1189.601572,1820.278385,0.0,185.625,607.544643,1471.89375,43171.23338
BounceRates,9864.0,0.022265,0.048563,0.0,0.0,0.003175,0.017077,0.2
ExitRates,9864.0,0.043117,0.048575,0.0,0.014286,0.025309,0.05,0.2
PageValues,9864.0,5.858316,18.620643,0.0,0.0,0.0,0.0,361.763742


## **Creating list of categorical, numerical and useful columns**

In [None]:
useful_cols = [col for col in df.columns if col not in ['id', 'Revenue', 'kfold']]
categorical = [col for col in useful_cols if df[col].dtype in ['object', 'bool']]
numerical = [col for col in useful_cols if col not in categorical]

## **Some Transformations:**
- One-Hot-Encoding categorical variables
- Encoding ``Weekend`` variable
- Applying Power Transformation to numerical columns

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=7)

In [None]:
le = preprocessing.LabelEncoder() 

df_train.Weekend = le.fit_transform(df_train.Weekend)
df_test.Weekend = le.transform(df_test.Weekend)


dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
df_train = pd.DataFrame(dv.fit_transform(dicts),columns=list(dv.get_feature_names_out()))
df_test = pd.DataFrame(dv.transform(dicts),columns=list(dv.get_feature_names_out()))

useful_cols = [col for col in df_train.columns if col not in ['id', 'Revenue', 'kfold']]
categorical = [col for col in useful_cols if df_train[col].dtype in ['object', 'bool']]
numerical = [col for col in useful_cols if col not in categorical]

pt = PowerTransformer()
pt_num_tr = pd.DataFrame(pt.fit_transform(df_train[useful_cols]),columns=useful_cols)
pt_num_ts = pd.DataFrame(pt.transform(df_test[useful_cols]),columns=useful_cols)
df_train = pd.concat([df_train.drop(useful_cols, axis = 1),pt_num_tr],axis=1)
df_test = pd.concat([df_test.drop(useful_cols, axis = 1),pt_num_ts],axis=1)

useful_cols = [col for col in df_train.columns if col not in ['id', 'Revenue', 'kfold']]
categorical = [col for col in useful_cols if df_train[col].dtype in ['object', 'bool']]
numerical = [col for col in useful_cols if col not in categorical]
scaler= preprocessing.RobustScaler()

df_train.Revenue = df_train.Revenue.astype('int')
df_train.kfold = df_train.kfold.astype('int')
df_test.Revenue = df_test.Revenue.astype('int')
df_test.kfold = df_test.kfold.astype('int')

# **Hyper-Tuning and Modelling**
- **Hypertuning using Optuna:** 
  - I've selected f1-score as metrics to decide the final paramters because our data is unbalanced.
  - For this pupose i've split the data into 5 Stratified Folds which is shown in a separate notebook.
  - Next I've calculated f1-score for both training and validation set.
  - For the final function i'm returning the mean_difference of these f1-scores i.e. mean(training_f1) - mean(validation_f1).
  - This actually returns a value and we need this distance to be as minimum as possible. Why? because if we observe then validation_f1 must be greater and thats possible when the value is as small as possible (negatively)
- **Model Building:**
  - Plan here is to build 4 different models each optimized for hyper-parameters using Optuna.
  - Finally we use these models with Voting Classifier and get the results using all 4 models.

### **Model-1: XGBClassifier:**
- Hypertuning using optuna
- Model-building

In [None]:
def objective(trial): 
    scores_valid = []
    scores_train = []
    for fold in range(5):
      xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
      xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

      ytrain = xtrain.Revenue
      yvalid = xvalid.Revenue
      
      xtrain = xtrain[useful_cols]
      xvalid = xvalid[useful_cols]


      params = {'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 3, 20), 
            'max_depth': trial.suggest_int('max_depth', 10, 400), 
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']), 
            'class_weight': trial.suggest_categorical('class_weight', ['balanced'])
            }

      model = DecisionTreeClassifier(**params, random_state=42)
      model.fit(xtrain,ytrain)                                                      # Training the Model on training set

      # Predictions and Evaluation
      preds_train = model.predict(xtrain) 
      preds_valid = model.predict(xvalid)                              
      f1_score_train = metrics.f1_score(ytrain, preds_train)
      f1_score_valid = metrics.f1_score(yvalid, preds_valid)                    
      print(f"Fold: {fold}, f1-Score-train: {f1_score_train}")
      print(f"Fold: {fold}, f1-Score-valid: {f1_score_valid}")
      scores_valid.append(f1_score_valid)
      scores_train.append(f1_score_train)
    return np.mean(scores_train) - np.mean(scores_valid) 
study = optuna.create_study(direction='minimize')                            
study.optimize(objective, n_trials=2)

[32m[I 2021-12-14 11:52:50,603][0m A new study created in memory with name: no-name-0a6b70d4-8ac6-447b-93f0-0a8ff5931470[0m
[32m[I 2021-12-14 11:52:50,845][0m Trial 0 finished with value: 0.012715465302303053 and parameters: {'max_leaf_nodes': 16, 'max_depth': 257, 'criterion': 'gini', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.012715465302303053.[0m


Fold: 0, f1-Score-train: 0.6485260770975058
Fold: 0, f1-Score-valid: 0.6334841628959277
Fold: 1, f1-Score-train: 0.6857855361596009
Fold: 1, f1-Score-valid: 0.6519524617996604
Fold: 2, f1-Score-train: 0.6582478295185478
Fold: 2, f1-Score-valid: 0.6646525679758308
Fold: 3, f1-Score-train: 0.6432748538011697
Fold: 3, f1-Score-valid: 0.6224188790560472
Fold: 4, f1-Score-train: 0.6594511016621569
Fold: 4, f1-Score-valid: 0.6592
Fold: 0, f1-Score-train: 0.6532319391634982
Fold: 0, f1-Score-valid: 0.6412213740458015
Fold: 1, f1-Score-train: 0.6420391577391946
Fold: 1, f1-Score-valid: 0.6179604261796042
Fold: 2, f1-Score-train: 0.6589916633584756
Fold: 2, f1-Score-valid: 0.6636363636363637
Fold: 3, f1-Score-train: 0.6358715265247203
Fold: 3, f1-Score-valid: 0.6120689655172413


[32m[I 2021-12-14 11:52:51,131][0m Trial 1 finished with value: 0.009393152133642335 and parameters: {'max_leaf_nodes': 18, 'max_depth': 368, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.009393152133642335.[0m


Fold: 4, f1-Score-train: 0.6342525399129173
Fold: 4, f1-Score-valid: 0.6425339366515838


In [None]:
scores_train = []
scores_valid = []
scores_test = []
final_test_predictions = []
for fold in range(5):
  xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
  xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
  xtest = df_test.copy()

  ytrain = xtrain.Revenue
  yvalid = xvalid.Revenue
  ytest = xtest.Revenue
  
  
  xtrain = xtrain[useful_cols]
  xvalid = xvalid[useful_cols]
  xtest = xtest[useful_cols]

  xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
  xvalid[numerical] = scaler.transform(xvalid[numerical])
  xtest[numerical] = scaler.transform(xtest[numerical])

  params = {'max_leaf_nodes': 6, 'max_depth': 254, 'criterion': 'entropy', 'class_weight': 'balanced'}

  model = DecisionTreeClassifier(**params, random_state=42)
  model.fit(xtrain,ytrain)  
  
  preds_valid = model.predict(xvalid)
  preds_train = model.predict(xtrain)
  test_preds = model.predict(xtest)  
  final_test_predictions.append(test_preds)
  f1_score_valid = metrics.f1_score(yvalid, preds_valid)
  f1_score_train = metrics.f1_score(ytrain, preds_train)
  f1_score_test = metrics.f1_score(ytest, test_preds)
  print(f'Fold {fold} f1-score-train: ', f1_score_train)
  print(f'Fold {fold} f1-score-Valid: ', f1_score_valid)
  print(f'Fold {fold} f1-score-Test: ', f1_score_test)
  scores_train.append(f1_score_train)
  scores_valid.append(f1_score_valid)
  scores_test.append(f1_score_test)
    
print(np.mean(scores_train), np.std(scores_train))
print(np.mean(scores_valid), np.std(scores_valid))
print(np.mean(scores_test), np.std(scores_test))

Fold 0 f1-score-train:  0.6709021601016518
Fold 0 f1-score-Valid:  0.6746166950596253
Fold 0 f1-score-Test:  0.6716417910447762
Fold 1 f1-score-train:  0.6736930860033726
Fold 1 f1-score-Valid:  0.6643478260869565
Fold 1 f1-score-Test:  0.6718696979979639
Fold 2 f1-score-train:  0.6689566337483899
Fold 2 f1-score-Valid:  0.6828478964401294
Fold 2 f1-score-Test:  0.6718696979979639
Fold 3 f1-score-train:  0.6754348748408995
Fold 3 f1-score-Valid:  0.6576271186440678
Fold 3 f1-score-Test:  0.6718696979979639
Fold 4 f1-score-train:  0.6703250316589278
Fold 4 f1-score-Valid:  0.6782006920415226
Fold 4 f1-score-Test:  0.6718696979979639
0.6718623572706482 0.00235978928339725
0.6715280456544603 0.009241150486280813
0.6718241166073263 9.116278127510035e-05


### **Model-2: MLPClassifier**

In [None]:
def objective(trial): 
  scores_valid = []
  scores_train = []
  for fold in range(5):
  
    xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.Revenue
    yvalid = xvalid.Revenue
    
    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
    xvalid[numerical] = scaler.transform(xvalid[numerical])

    params = {
        'alpha': trial.suggest_loguniform('alpha',1e-3,1), 
        'hidden_layer_sizes': trial.suggest_int('hidden_layer_sizes',5,20),
        'max_iter': trial.suggest_int('max_iter',30,70)}
    
    model= MLPClassifier(**params,random_state=7,tol=1e-4)
    model.fit(xtrain,ytrain)

    # Predictions and Evaluation
    preds_train = model.predict(xtrain) 
    preds_valid = model.predict(xvalid)                              
    f1_score_train = metrics.f1_score(ytrain, preds_train)
    f1_score_valid = metrics.f1_score(yvalid, preds_valid)                    
    print(f"Fold: {fold}, f1-Score-train: {f1_score_train}")
    print(f"Fold: {fold}, f1-Score-valid: {f1_score_valid}")
    scores_valid.append(f1_score_valid)
    scores_train.append(f1_score_train)
  return np.mean(scores_train) - np.mean(scores_valid)
study = optuna.create_study(direction='minimize')                            
study.optimize(objective, n_trials=10)


[32m[I 2021-12-14 09:25:06,141][0m A new study created in memory with name: no-name-1335cf0d-48da-4c4b-99a2-fc9ce5999b2d[0m


Fold: 0, f1-Score-train: 0.6859430604982206
Fold: 0, f1-Score-valid: 0.6514886164623467
Fold: 1, f1-Score-train: 0.6822388717496696
Fold: 1, f1-Score-valid: 0.689165186500888
Fold: 2, f1-Score-train: 0.6862575626620571
Fold: 2, f1-Score-valid: 0.6946308724832214
Fold: 3, f1-Score-train: 0.6912928759894459
Fold: 3, f1-Score-valid: 0.6137184115523466


[32m[I 2021-12-14 09:25:12,556][0m Trial 0 finished with value: 0.022685724547222952 and parameters: {'alpha': 0.008577818477079875, 'hidden_layer_sizes': 17, 'max_iter': 46}. Best is trial 0 with value: 0.022685724547222952.[0m


Fold: 4, f1-Score-train: 0.688077430708315
Fold: 4, f1-Score-valid: 0.6713780918727915
Fold: 0, f1-Score-train: 0.6855895196506551
Fold: 0, f1-Score-valid: 0.6598290598290599
Fold: 1, f1-Score-train: 0.6876328091797705
Fold: 1, f1-Score-valid: 0.7032590051457975
Fold: 2, f1-Score-train: 0.6798452943704341
Fold: 2, f1-Score-valid: 0.6863711001642037
Fold: 3, f1-Score-train: 0.6946264744429882
Fold: 3, f1-Score-valid: 0.6142857142857143


[32m[I 2021-12-14 09:25:18,287][0m Trial 1 finished with value: 0.016759675365156212 and parameters: {'alpha': 0.014553730682436575, 'hidden_layer_sizes': 19, 'max_iter': 39}. Best is trial 1 with value: 0.016759675365156212.[0m


Fold: 4, f1-Score-train: 0.6781708369283865
Fold: 4, f1-Score-valid: 0.6783216783216782
Fold: 0, f1-Score-train: 0.689068100358423
Fold: 0, f1-Score-valid: 0.6619718309859155
Fold: 1, f1-Score-train: 0.6761187416925122
Fold: 1, f1-Score-valid: 0.6857142857142857
Fold: 2, f1-Score-train: 0.6803135888501743
Fold: 2, f1-Score-valid: 0.6891891891891893
Fold: 3, f1-Score-train: 0.6938239159001314
Fold: 3, f1-Score-valid: 0.6310160427807487


[32m[I 2021-12-14 09:25:25,985][0m Trial 2 finished with value: 0.017424369758328728 and parameters: {'alpha': 0.0212980010814328, 'hidden_layer_sizes': 12, 'max_iter': 63}. Best is trial 1 with value: 0.016759675365156212.[0m


Fold: 4, f1-Score-train: 0.6788094180364282
Fold: 4, f1-Score-valid: 0.6631205673758865
Fold: 0, f1-Score-train: 0.6803672933974639
Fold: 0, f1-Score-valid: 0.6689536878216124
Fold: 1, f1-Score-train: 0.6768826619964974
Fold: 1, f1-Score-valid: 0.6797153024911032
Fold: 2, f1-Score-train: 0.6651785714285715
Fold: 2, f1-Score-valid: 0.6850258175559379
Fold: 3, f1-Score-train: 0.6815203145478375
Fold: 3, f1-Score-valid: 0.6368515205724509


[32m[I 2021-12-14 09:25:32,818][0m Trial 3 finished with value: 0.006437743241638927 and parameters: {'alpha': 0.15018277561775956, 'hidden_layer_sizes': 9, 'max_iter': 58}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.6747932085328691
Fold: 4, f1-Score-valid: 0.6760070052539404
Fold: 0, f1-Score-train: 0.6657906263688129
Fold: 0, f1-Score-valid: 0.647359454855196
Fold: 1, f1-Score-train: 0.6594885598923285
Fold: 1, f1-Score-valid: 0.6606822262118491
Fold: 2, f1-Score-train: 0.6589595375722542
Fold: 2, f1-Score-valid: 0.6689419795221843
Fold: 3, f1-Score-train: 0.6631944444444444
Fold: 3, f1-Score-valid: 0.6261180679785331


[32m[I 2021-12-14 09:25:37,591][0m Trial 4 finished with value: 0.008106441456344093 and parameters: {'alpha': 0.0037575421589616264, 'hidden_layer_sizes': 9, 'max_iter': 35}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.661693725318122
Fold: 4, f1-Score-valid: 0.6654929577464789
Fold: 0, f1-Score-train: 0.6713286713286714
Fold: 0, f1-Score-valid: 0.6428571428571428
Fold: 1, f1-Score-train: 0.6532438478747203
Fold: 1, f1-Score-valid: 0.6750902527075813
Fold: 2, f1-Score-train: 0.6557085739671258
Fold: 2, f1-Score-valid: 0.6666666666666667
Fold: 3, f1-Score-train: 0.6767111493758071
Fold: 3, f1-Score-valid: 0.63986013986014


[32m[I 2021-12-14 09:25:44,490][0m Trial 5 finished with value: 0.007712692356729445 and parameters: {'alpha': 0.6007113972305819, 'hidden_layer_sizes': 10, 'max_iter': 56}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.6619593998234775
Fold: 4, f1-Score-valid: 0.6559139784946237
Fold: 0, f1-Score-train: 0.6929936305732484
Fold: 0, f1-Score-valid: 0.6599664991624791
Fold: 1, f1-Score-train: 0.6701525054466231
Fold: 1, f1-Score-valid: 0.6956521739130433
Fold: 2, f1-Score-train: 0.6716483516483517
Fold: 2, f1-Score-valid: 0.6858108108108109
Fold: 3, f1-Score-train: 0.6840707964601771
Fold: 3, f1-Score-valid: 0.6181818181818182


[32m[I 2021-12-14 09:25:53,392][0m Trial 6 finished with value: 0.013470914844944226 and parameters: {'alpha': 0.6896823101923782, 'hidden_layer_sizes': 18, 'max_iter': 70}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.6851299531316575
Fold: 4, f1-Score-valid: 0.6770293609671848
Fold: 0, f1-Score-train: 0.6770186335403726
Fold: 0, f1-Score-valid: 0.6620450606585788
Fold: 1, f1-Score-train: 0.6768160069595477
Fold: 1, f1-Score-valid: 0.6901408450704225
Fold: 2, f1-Score-train: 0.678540399652476
Fold: 2, f1-Score-valid: 0.6933333333333332
Fold: 3, f1-Score-train: 0.6807174887892378
Fold: 3, f1-Score-valid: 0.5908256880733945


[32m[I 2021-12-14 09:25:59,379][0m Trial 7 finished with value: 0.015217652229312928 and parameters: {'alpha': 0.021208151887414133, 'hidden_layer_sizes': 20, 'max_iter': 41}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.6707692307692308
Fold: 4, f1-Score-valid: 0.6714285714285715
Fold: 0, f1-Score-train: 0.6696388765046812
Fold: 0, f1-Score-valid: 0.6397188049209139
Fold: 1, f1-Score-train: 0.6646234676007006
Fold: 1, f1-Score-valid: 0.6982456140350878
Fold: 2, f1-Score-train: 0.6823027718550106
Fold: 2, f1-Score-valid: 0.6918032786885245
Fold: 3, f1-Score-train: 0.6782608695652174
Fold: 3, f1-Score-valid: 0.6278659611992945


[32m[I 2021-12-14 09:26:04,096][0m Trial 8 finished with value: 0.009597334269128899 and parameters: {'alpha': 0.8550184024585615, 'hidden_layer_sizes': 17, 'max_iter': 34}. Best is trial 3 with value: 0.006437743241638927.[0m


Fold: 4, f1-Score-train: 0.6642952423299243
Fold: 4, f1-Score-valid: 0.6535008976660682
Fold: 0, f1-Score-train: 0.6524390243902439
Fold: 0, f1-Score-valid: 0.651085141903172
Fold: 1, f1-Score-train: 0.6445993031358885
Fold: 1, f1-Score-valid: 0.650994575045208
Fold: 2, f1-Score-train: 0.6483420593368238
Fold: 2, f1-Score-valid: 0.6733668341708543
Fold: 3, f1-Score-train: 0.6617710583153348
Fold: 3, f1-Score-valid: 0.6123893805309735


[32m[I 2021-12-14 09:26:07,829][0m Trial 9 finished with value: 0.004917391425939499 and parameters: {'alpha': 0.09631013728513668, 'hidden_layer_sizes': 7, 'max_iter': 30}. Best is trial 9 with value: 0.004917391425939499.[0m


Fold: 4, f1-Score-train: 0.6434176111595465
Fold: 4, f1-Score-valid: 0.6381461675579323


In [None]:
scores_train = []
scores_valid = []
final_test_predictions = []
for fold in range(5):
  xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
  xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
  xtest = df_test.copy()

  ytrain = xtrain.Revenue
  yvalid = xvalid.Revenue
  
  
  xtrain = xtrain[useful_cols]
  xvalid = xvalid[useful_cols]
  xtest = xtest[useful_cols]

  xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
  xvalid[numerical] = scaler.transform(xvalid[numerical])
  xtest[numerical] = scaler.transform(xtest[numerical])

  params ={'alpha': 0.09631013728513668, 'hidden_layer_sizes': 7, 'max_iter': 30}

  model= MLPClassifier(**params,random_state=7,tol=1e-4)
  model.fit(xtrain,ytrain) 

  preds_valid = model.predict(xvalid)
  preds_train = model.predict(xtrain)
  test_preds = model.predict(xtest)  
  final_test_predictions.append(test_preds)
  f1_score_valid = metrics.f1_score(yvalid, preds_valid)
  f1_score_train = metrics.f1_score(ytrain, preds_train)
  print(f'Fold {fold} f1-score-train: ', f1_score_train)
  print(f'Fold {fold} f1-score-Valid: ', f1_score_valid)
  scores_train.append(f1_score_train)
  scores_valid.append(f1_score_valid)
    
print(np.mean(scores_train), np.std(scores_train))
print(np.mean(scores_valid), np.std(scores_valid))

Fold 0 f1-score-train:  0.6524390243902439
Fold 0 f1-score-Valid:  0.651085141903172
Fold 1 f1-score-train:  0.6445993031358885
Fold 1 f1-score-Valid:  0.650994575045208
Fold 2 f1-score-train:  0.6483420593368238
Fold 2 f1-score-Valid:  0.6733668341708543
Fold 3 f1-score-train:  0.6617710583153348
Fold 3 f1-score-Valid:  0.6123893805309735
Fold 4 f1-score-train:  0.6434176111595465
Fold 4 f1-score-Valid:  0.6381461675579323
0.6501138112675675 0.006628514547372634
0.645196419841628 0.01993928674931871


### **Model-3: LGBM**

In [None]:
def objective(trial):
    scores_valid = []
    scores_train = [] 
    for fold in range(5):
      xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
      xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

      ytrain = xtrain.Revenue
      yvalid = xvalid.Revenue
      
      xtrain = xtrain[useful_cols]
      xvalid = xvalid[useful_cols]

      xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
      xvalid[numerical] = scaler.transform(xvalid[numerical])
 
      param = {"objective": trial.suggest_categorical("objective", ['binary']),
                    "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt']),
                    "num_leaves": trial.suggest_int("num_leaves", 5, 100),
                    "max_depth": trial.suggest_int("max_depth", 5, 100),
                    "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, step=0.01),
                    "n_estimators": trial.suggest_int("n_estimators", 20,2000),        
                    "reg_alpha": trial.suggest_float("reg_alpha", 0.001, 40.0),
                    "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 10.0),
                    # "random_state": trial.suggest_categorical("random_state", [24]),
                    # "bagging_seed": trial.suggest_categorical("bagging_seed", [24]),
                    # "feature_fraction_seed": trial.suggest_categorical("feature_fraction_seed", [24]), 
                    # "n_jobs": trial.suggest_categorical("n_jobs", [4]), 
                    "subsample": trial.suggest_float("subsample",0.01, 0.5, step=0.01),
                    "subsample_freq": trial.suggest_int("subsample_freq", 3, 20),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 0.9, step=0.01),
                    # "device_type": trial.suggest_categorical("device_type", ["GPU"]),
                    'min_child_samples': trial.suggest_int('min_child_samples', 5, 70),
                    'min_child_weight': trial.suggest_int('min_child_weight', 5,70)}

      model = LGBMClassifier(**param)
      model.fit(xtrain, ytrain,
              eval_set=[(xvalid, yvalid)],
              eval_metric="f1",
              early_stopping_rounds=50,
              verbose=False)


      # Predictions and Evaluation
      preds_train = model.predict(xtrain) 
      preds_valid = model.predict(xvalid)
      f1_score_train = metrics.f1_score(ytrain, preds_train)
      f1_score_valid = metrics.f1_score(yvalid, preds_valid)               
      print(f"Fold: {fold}, f1-Score-train: {f1_score_train}")
      print(f"Fold: {fold}, f1-Score-valid: {f1_score_valid}")
      scores_valid.append(f1_score_valid)
      scores_train.append(f1_score_train)
    return np.mean(scores_train) - np.mean(scores_valid)
study = optuna.create_study(direction='minimize')                        
study.optimize(objective, n_trials=10)                                      


[32m[I 2021-12-14 09:30:02,631][0m A new study created in memory with name: no-name-7f6f39c6-2a33-401c-8cad-99a4f83b04f7[0m


Fold: 0, f1-Score-train: 0.6838365896980462
Fold: 0, f1-Score-valid: 0.647985989492119
Fold: 1, f1-Score-train: 0.6848847139197267
Fold: 1, f1-Score-valid: 0.6821428571428572
Fold: 2, f1-Score-train: 0.6718954248366014
Fold: 2, f1-Score-valid: 0.687813021702838
Fold: 3, f1-Score-train: 0.6910814304179232
Fold: 3, f1-Score-valid: 0.6192170818505338


[32m[I 2021-12-14 09:30:04,255][0m Trial 0 finished with value: 0.021397373381658924 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 8, 'max_depth': 59, 'learning_rate': 0.321, 'n_estimators': 389, 'reg_alpha': 2.18756112559954, 'reg_lambda': 3.787879005151599, 'subsample': 0.32, 'subsample_freq': 9, 'colsample_bytree': 0.5, 'min_child_samples': 41, 'min_child_weight': 48}. Best is trial 0 with value: 0.021397373381658924.[0m


Fold: 4, f1-Score-train: 0.6907085277895629
Fold: 4, f1-Score-valid: 0.6782608695652174
Fold: 0, f1-Score-train: 0.0
Fold: 0, f1-Score-valid: 0.0
Fold: 1, f1-Score-train: 0.0
Fold: 1, f1-Score-valid: 0.0
Fold: 2, f1-Score-train: 0.0
Fold: 2, f1-Score-valid: 0.0
Fold: 3, f1-Score-train: 0.0
Fold: 3, f1-Score-valid: 0.0


[32m[I 2021-12-14 09:30:05,165][0m Trial 1 finished with value: 0.0 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 87, 'max_depth': 77, 'learning_rate': 0.081, 'n_estimators': 269, 'reg_alpha': 35.97250316999857, 'reg_lambda': 1.9455331881532867, 'subsample': 0.06999999999999999, 'subsample_freq': 4, 'colsample_bytree': 0.6900000000000001, 'min_child_samples': 66, 'min_child_weight': 36}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.0
Fold: 4, f1-Score-valid: 0.0
Fold: 0, f1-Score-train: 0.6583961010190518
Fold: 0, f1-Score-valid: 0.6460481099656357
Fold: 1, f1-Score-train: 0.6474245115452931
Fold: 1, f1-Score-valid: 0.6581352833638026
Fold: 2, f1-Score-train: 0.6443322625057366
Fold: 2, f1-Score-valid: 0.6689895470383275
Fold: 3, f1-Score-train: 0.6632434745400085
Fold: 3, f1-Score-valid: 0.6216696269982238


[32m[I 2021-12-14 09:30:06,428][0m Trial 2 finished with value: 0.0026237678208478332 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 47, 'max_depth': 17, 'learning_rate': 0.331, 'n_estimators': 469, 'reg_alpha': 30.654408167803027, 'reg_lambda': 8.742258358130245, 'subsample': 0.45, 'subsample_freq': 9, 'colsample_bytree': 0.49, 'min_child_samples': 26, 'min_child_weight': 32}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.6504209127159947
Fold: 4, f1-Score-valid: 0.6558558558558559
Fold: 0, f1-Score-train: 0.6575342465753424
Fold: 0, f1-Score-valid: 0.6426116838487973
Fold: 1, f1-Score-train: 0.6492505353319058
Fold: 1, f1-Score-valid: 0.6573426573426573
Fold: 2, f1-Score-train: 0.6564952048823016
Fold: 2, f1-Score-valid: 0.6688741721854304
Fold: 3, f1-Score-train: 0.667520273154076
Fold: 3, f1-Score-valid: 0.6161971830985916


[32m[I 2021-12-14 09:30:07,742][0m Trial 3 finished with value: 0.009417468929340478 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 8, 'max_depth': 29, 'learning_rate': 0.301, 'n_estimators': 1027, 'reg_alpha': 36.0216439809581, 'reg_lambda': 5.087814051161674, 'subsample': 0.48000000000000004, 'subsample_freq': 20, 'colsample_bytree': 0.74, 'min_child_samples': 16, 'min_child_weight': 62}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.6496644295302014
Fold: 4, f1-Score-valid: 0.6483516483516485
Fold: 0, f1-Score-train: 0.6606170598911071
Fold: 0, f1-Score-valid: 0.6231884057971014
Fold: 1, f1-Score-train: 0.6332453825857519
Fold: 1, f1-Score-valid: 0.643510054844607
Fold: 2, f1-Score-train: 0.6453048509123276
Fold: 2, f1-Score-valid: 0.6531986531986531
Fold: 3, f1-Score-train: 0.6579177602799651
Fold: 3, f1-Score-valid: 0.6181818181818182


[32m[I 2021-12-14 09:30:09,434][0m Trial 4 finished with value: 0.014042603639739193 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 74, 'max_depth': 21, 'learning_rate': 0.221, 'n_estimators': 373, 'reg_alpha': 23.24714584974088, 'reg_lambda': 4.54950214801149, 'subsample': 0.37, 'subsample_freq': 18, 'colsample_bytree': 0.21000000000000002, 'min_child_samples': 64, 'min_child_weight': 16}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.6663793103448277
Fold: 4, f1-Score-valid: 0.6551724137931035
Fold: 0, f1-Score-train: 0.0
Fold: 0, f1-Score-valid: 0.0
Fold: 1, f1-Score-train: 0.0
Fold: 1, f1-Score-valid: 0.0
Fold: 2, f1-Score-train: 0.0
Fold: 2, f1-Score-valid: 0.0


[32m[I 2021-12-14 09:30:10,218][0m Trial 5 finished with value: 0.0 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 12, 'max_depth': 75, 'learning_rate': 0.291, 'n_estimators': 389, 'reg_alpha': 29.51110709016299, 'reg_lambda': 8.82667339789244, 'subsample': 0.03, 'subsample_freq': 13, 'colsample_bytree': 0.41000000000000003, 'min_child_samples': 34, 'min_child_weight': 68}. Best is trial 1 with value: 0.0.[0m


Fold: 3, f1-Score-train: 0.0
Fold: 3, f1-Score-valid: 0.0
Fold: 4, f1-Score-train: 0.0
Fold: 4, f1-Score-valid: 0.0
Fold: 0, f1-Score-train: 0.5669144981412639
Fold: 0, f1-Score-valid: 0.5816876122082586
Fold: 1, f1-Score-train: 0.5645086166744294
Fold: 1, f1-Score-valid: 0.5380116959064327
Fold: 2, f1-Score-train: 0.5657959586002956
Fold: 2, f1-Score-valid: 0.576271186440678
Fold: 3, f1-Score-train: 0.5654405474764755
Fold: 3, f1-Score-valid: 0.5448154657293497


[32m[I 2021-12-14 09:30:11,007][0m Trial 6 finished with value: 0.001047979763173279 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 26, 'max_depth': 72, 'learning_rate': 0.341, 'n_estimators': 29, 'reg_alpha': 26.0082644485212, 'reg_lambda': 9.788490162559917, 'subsample': 0.23, 'subsample_freq': 18, 'colsample_bytree': 0.37, 'min_child_samples': 5, 'min_child_weight': 7}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.5542617605961806
Fold: 4, f1-Score-valid: 0.5708955223880596
Fold: 0, f1-Score-train: 0.6754684838160137
Fold: 0, f1-Score-valid: 0.6644628099173553
Fold: 1, f1-Score-train: 0.6669538991813874
Fold: 1, f1-Score-valid: 0.6654867256637168
Fold: 2, f1-Score-train: 0.664069264069264
Fold: 2, f1-Score-valid: 0.6808510638297872
Fold: 3, f1-Score-train: 0.6681280140289347
Fold: 3, f1-Score-valid: 0.624087591240876


[32m[I 2021-12-14 09:30:12,313][0m Trial 7 finished with value: 0.009738477979066684 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 14, 'max_depth': 76, 'learning_rate': 0.451, 'n_estimators': 325, 'reg_alpha': 15.765100896455968, 'reg_lambda': 5.3296625153379225, 'subsample': 0.31, 'subsample_freq': 11, 'colsample_bytree': 0.39, 'min_child_samples': 17, 'min_child_weight': 39}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.6479500891265597
Fold: 4, f1-Score-valid: 0.6389891696750902
Fold: 0, f1-Score-train: 0.0
Fold: 0, f1-Score-valid: 0.0
Fold: 1, f1-Score-train: 0.0
Fold: 1, f1-Score-valid: 0.0
Fold: 2, f1-Score-train: 0.0
Fold: 2, f1-Score-valid: 0.0


[32m[I 2021-12-14 09:30:13,103][0m Trial 8 finished with value: 0.0 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 73, 'max_depth': 8, 'learning_rate': 0.251, 'n_estimators': 1532, 'reg_alpha': 22.564704638747262, 'reg_lambda': 1.1438857198508958, 'subsample': 0.04, 'subsample_freq': 20, 'colsample_bytree': 0.8, 'min_child_samples': 66, 'min_child_weight': 47}. Best is trial 1 with value: 0.0.[0m


Fold: 3, f1-Score-train: 0.0
Fold: 3, f1-Score-valid: 0.0
Fold: 4, f1-Score-train: 0.0
Fold: 4, f1-Score-valid: 0.0
Fold: 0, f1-Score-train: 0.6732605729877216
Fold: 0, f1-Score-valid: 0.6729475100942127
Fold: 1, f1-Score-train: 0.6775732788002726
Fold: 1, f1-Score-valid: 0.6759388038942977
Fold: 2, f1-Score-train: 0.6748549982940977
Fold: 2, f1-Score-valid: 0.6763540290620872
Fold: 3, f1-Score-train: 0.6809232858112695
Fold: 3, f1-Score-valid: 0.6501377410468319


[32m[I 2021-12-14 09:30:15,406][0m Trial 9 finished with value: 0.002569261142603696 and parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 5, 'max_depth': 99, 'learning_rate': 0.261, 'n_estimators': 667, 'reg_alpha': 22.921097059670963, 'reg_lambda': 3.125899064451485, 'subsample': 0.43, 'subsample_freq': 16, 'colsample_bytree': 0.060000000000000005, 'min_child_samples': 47, 'min_child_weight': 27}. Best is trial 1 with value: 0.0.[0m


Fold: 4, f1-Score-train: 0.6711726934775262
Fold: 4, f1-Score-valid: 0.6895604395604396


In [None]:
scores_train = []
scores_valid = []
final_test_predictions = []
for fold in range(5):
  xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
  xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
  xtest = df_test.copy()

  ytrain = xtrain.Revenue
  yvalid = xvalid.Revenue
  
  
  xtrain = xtrain[useful_cols]
  xvalid = xvalid[useful_cols]
  xtest = xtest[useful_cols]

  xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
  xvalid[numerical] = scaler.transform(xvalid[numerical])
  xtest[numerical] = scaler.transform(xtest[numerical])

  params = {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 47, 'max_depth': 17, 'learning_rate': 0.331,
            'n_estimators': 469, 'reg_alpha': 30.654408167803027, 'reg_lambda': 8.742258358130245,
            'subsample': 0.45, 'subsample_freq': 9, 'colsample_bytree': 0.49, 'min_child_samples': 26, 'min_child_weight': 32}

  model = LGBMClassifier(**params)
  model.fit(xtrain, ytrain,
          eval_set=[(xvalid, yvalid)],
          # eval_metric="f1",
          early_stopping_rounds=50,
          verbose=False)

  preds_valid = model.predict(xvalid)
  preds_train = model.predict(xtrain)
  test_preds = model.predict(xtest)  
  final_test_predictions.append(test_preds)
  f1_score_valid = metrics.f1_score(yvalid, preds_valid)
  f1_score_train = metrics.f1_score(ytrain, preds_train)
  print(f'Fold {fold} f1-score-train: ', f1_score_train)
  print(f'Fold {fold} f1-score-Valid: ', f1_score_valid)
  scores_train.append(f1_score_train)
  scores_valid.append(f1_score_valid)
    
print(np.mean(scores_train), np.std(scores_train))
print(np.mean(scores_valid), np.std(scores_valid))

Fold 0 f1-score-train:  0.646354733405876
Fold 0 f1-score-Valid:  0.6341463414634146
Fold 1 f1-score-train:  0.6523143164693219
Fold 1 f1-score-Valid:  0.6455981941309256
Fold 2 f1-score-train:  0.6307007786429365
Fold 2 f1-score-Valid:  0.6476578411405296
Fold 3 f1-score-train:  0.6627721720658525
Fold 3 f1-score-Valid:  0.6285714285714286
Fold 4 f1-score-train:  0.666322846828262
Fold 4 f1-score-Valid:  0.6581740976645436
0.6516929694824498 0.012699109840988555
0.6428295805941684 0.010441504353759687


### **Model-4: BaggingClassifier**

In [None]:
# %%time
# X_train, X_test, y_train, y_test = train_test_split(X_res,y_res, test_size = 0.2, random_state=22)
def objective(trial):
    scores_valid = [] 
    scores_train = []
    for fold in range(5):
      xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
      xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
      
      valid_ids = xvalid.id.values.tolist()

      ytrain = xtrain.Revenue
      yvalid = xvalid.Revenue
      
      xtrain = xtrain[useful_cols]
      xvalid = xvalid[useful_cols]

      xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
      xvalid[numerical] = scaler.transform(xvalid[numerical])
      params = {'n_estimators': trial.suggest_int('n_estimators', 500, 1000),
        'max_samples' : trial.suggest_int('max_samples',30, 50)}
      
      model = BaggingClassifier(**params, random_state=7)
      model.fit(xtrain,ytrain)                                                      

      # Predictions and Evaluation
      preds_train = model.predict(xtrain) 
      preds_valid = model.predict(xvalid)                              
      f1_score_train = metrics.f1_score(ytrain, preds_train)
      f1_score_valid = metrics.f1_score(yvalid, preds_valid)                    
      print(f"Fold: {fold}, f1-Score-train: {f1_score_train}")
      print(f"Fold: {fold}, f1-Score-valid: {f1_score_valid}")
      scores_valid.append(f1_score_valid)
      scores_train.append(f1_score_train)
    return np.mean(scores_train) - np.mean(scores_valid)
study_rf = optuna.create_study(direction='minimize')                             
study_rf.optimize(objective, n_trials=10)                                        


[32m[I 2021-12-14 09:33:17,747][0m A new study created in memory with name: no-name-cca09199-1500-4e6c-b032-c3b84bdd6d46[0m


Fold: 0, f1-Score-train: 0.6386554621848739
Fold: 0, f1-Score-valid: 0.6254295532646047
Fold: 1, f1-Score-train: 0.6411238825031929
Fold: 1, f1-Score-valid: 0.6537785588752196
Fold: 2, f1-Score-train: 0.6424242424242426
Fold: 2, f1-Score-valid: 0.6533996683250415
Fold: 3, f1-Score-train: 0.652265988987717
Fold: 3, f1-Score-valid: 0.6172839506172839


[32m[I 2021-12-14 09:33:35,635][0m Trial 0 finished with value: 0.003588766098660656 and parameters: {'n_estimators': 660, 'max_samples': 37}. Best is trial 0 with value: 0.003588766098660656.[0m


Fold: 4, f1-Score-train: 0.6425806451612903
Fold: 4, f1-Score-valid: 0.6492146596858639
Fold: 0, f1-Score-train: 0.637136465324385
Fold: 0, f1-Score-valid: 0.6180555555555556
Fold: 1, f1-Score-train: 0.638533674339301
Fold: 1, f1-Score-valid: 0.6537102473498233
Fold: 2, f1-Score-train: 0.6437473141383756
Fold: 2, f1-Score-valid: 0.65016501650165
Fold: 3, f1-Score-train: 0.6534066864155734
Fold: 3, f1-Score-valid: 0.6197183098591549


[32m[I 2021-12-14 09:33:51,744][0m Trial 1 finished with value: 0.0048564795183294596 and parameters: {'n_estimators': 569, 'max_samples': 38}. Best is trial 0 with value: 0.003588766098660656.[0m


Fold: 4, f1-Score-train: 0.6444348202685146
Fold: 4, f1-Score-valid: 0.6513274336283186
Fold: 0, f1-Score-train: 0.6375
Fold: 0, f1-Score-valid: 0.615916955017301
Fold: 1, f1-Score-train: 0.6429184549356224
Fold: 1, f1-Score-valid: 0.6571936056838367
Fold: 2, f1-Score-train: 0.6473840918757975
Fold: 2, f1-Score-valid: 0.659016393442623
Fold: 3, f1-Score-train: 0.6530785562632696
Fold: 3, f1-Score-valid: 0.618374558303887


[32m[I 2021-12-14 09:34:08,226][0m Trial 2 finished with value: 0.00374674446513934 and parameters: {'n_estimators': 562, 'max_samples': 46}. Best is trial 0 with value: 0.003588766098660656.[0m


Fold: 4, f1-Score-train: 0.646804835924007
Fold: 4, f1-Score-valid: 0.658450704225352
Fold: 0, f1-Score-train: 0.6388642413487133
Fold: 0, f1-Score-valid: 0.6230636833046471
Fold: 1, f1-Score-train: 0.6385904598195102
Fold: 1, f1-Score-valid: 0.6524822695035463
Fold: 2, f1-Score-train: 0.6464300983326208
Fold: 2, f1-Score-valid: 0.6534653465346535
Fold: 3, f1-Score-train: 0.6494277236116999
Fold: 3, f1-Score-valid: 0.6134751773049644


[32m[I 2021-12-14 09:34:23,488][0m Trial 3 finished with value: 0.004079980817153772 and parameters: {'n_estimators': 547, 'max_samples': 37}. Best is trial 0 with value: 0.003588766098660656.[0m


Fold: 4, f1-Score-train: 0.645877829987185
Fold: 4, f1-Score-valid: 0.6563039723661487
Fold: 0, f1-Score-train: 0.6389380530973451
Fold: 0, f1-Score-valid: 0.62778730703259
Fold: 1, f1-Score-train: 0.6241134751773049
Fold: 1, f1-Score-valid: 0.656934306569343
Fold: 2, f1-Score-train: 0.6384648931530746
Fold: 2, f1-Score-valid: 0.6533333333333333
Fold: 3, f1-Score-train: 0.6543624161073825
Fold: 3, f1-Score-valid: 0.6223776223776225


[32m[I 2021-12-14 09:34:38,019][0m Trial 4 finished with value: -0.0017015677981319088 and parameters: {'n_estimators': 533, 'max_samples': 32}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6407599309153714
Fold: 4, f1-Score-valid: 0.6447140381282496
Fold: 0, f1-Score-train: 0.6494178525226392
Fold: 0, f1-Score-valid: 0.6375838926174496
Fold: 1, f1-Score-train: 0.6407263294422827
Fold: 1, f1-Score-valid: 0.657142857142857
Fold: 2, f1-Score-train: 0.6443478260869565
Fold: 2, f1-Score-valid: 0.6566164154103853
Fold: 3, f1-Score-train: 0.6551433389544687
Fold: 3, f1-Score-valid: 0.618629173989455


[32m[I 2021-12-14 09:35:05,642][0m Trial 5 finished with value: 0.0034866978096215195 and parameters: {'n_estimators': 981, 'max_samples': 43}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6420182688125272
Fold: 4, f1-Score-valid: 0.6442477876106195
Fold: 0, f1-Score-train: 0.6340160284951024
Fold: 0, f1-Score-valid: 0.6252158894645942
Fold: 1, f1-Score-train: 0.6261558784676354
Fold: 1, f1-Score-valid: 0.6498194945848376
Fold: 2, f1-Score-train: 0.6388888888888888
Fold: 2, f1-Score-valid: 0.6478405315614618
Fold: 3, f1-Score-train: 0.6517328825021133
Fold: 3, f1-Score-valid: 0.6123893805309735


[32m[I 2021-12-14 09:35:22,309][0m Trial 6 finished with value: 0.003321168254306728 and parameters: {'n_estimators': 584, 'max_samples': 31}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6300527240773286
Fold: 4, f1-Score-valid: 0.6289752650176679
Fold: 0, f1-Score-train: 0.6417445482866044
Fold: 0, f1-Score-valid: 0.6193771626297577
Fold: 1, f1-Score-train: 0.6470838654746701
Fold: 1, f1-Score-valid: 0.6655052264808362
Fold: 2, f1-Score-train: 0.6472851646002565
Fold: 2, f1-Score-valid: 0.6600985221674878
Fold: 3, f1-Score-train: 0.6542530681337283
Fold: 3, f1-Score-valid: 0.6232394366197183


[32m[I 2021-12-14 09:35:40,302][0m Trial 7 finished with value: 0.002531123916723499 and parameters: {'n_estimators': 631, 'max_samples': 48}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6454388984509466
Fold: 4, f1-Score-valid: 0.6549295774647887
Fold: 0, f1-Score-train: 0.6435164835164835
Fold: 0, f1-Score-valid: 0.6247877758913413
Fold: 1, f1-Score-train: 0.6414770287677115
Fold: 1, f1-Score-valid: 0.658450704225352
Fold: 2, f1-Score-train: 0.6509635974304068
Fold: 2, f1-Score-valid: 0.6567656765676567
Fold: 3, f1-Score-train: 0.6544836379090523
Fold: 3, f1-Score-valid: 0.6172839506172839


[32m[I 2021-12-14 09:36:02,408][0m Trial 8 finished with value: 0.002991477285005817 and parameters: {'n_estimators': 741, 'max_samples': 45}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6378331148973351
Fold: 4, f1-Score-valid: 0.6560283687943262
Fold: 0, f1-Score-train: 0.6364444444444444
Fold: 0, f1-Score-valid: 0.6313993174061434
Fold: 1, f1-Score-train: 0.6366782006920415
Fold: 1, f1-Score-valid: 0.658273381294964
Fold: 2, f1-Score-train: 0.642146257031588
Fold: 2, f1-Score-valid: 0.6534653465346535
Fold: 3, f1-Score-train: 0.6536502546689305
Fold: 3, f1-Score-valid: 0.61101243339254


[32m[I 2021-12-14 09:36:24,567][0m Trial 9 finished with value: 0.001656237087323098 and parameters: {'n_estimators': 788, 'max_samples': 34}. Best is trial 4 with value: -0.0017015677981319088.[0m


Fold: 4, f1-Score-train: 0.6340819022457067
Fold: 4, f1-Score-valid: 0.6405693950177936


In [None]:
scores_train = []
scores_valid = []
for fold in range(5):
  xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
  xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
  
  valid_ids = xvalid.id.values.tolist()

  ytrain = xtrain.Revenue
  yvalid = xvalid.Revenue
  
  xtrain = xtrain[useful_cols]
  xvalid = xvalid[useful_cols]


  xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
  xvalid[numerical] = scaler.transform(xvalid[numerical])
  xtest[numerical] = scaler.transform(xtest[numerical])

  params = {'n_estimators': 533, 'max_samples': 32}

  model = BaggingClassifier(**params, random_state=7)
  model.fit(xtrain,ytrain)

  preds_valid = model.predict(xvalid)
  preds_train = model.predict(xtrain) 
  f1_score_valid = metrics.f1_score(yvalid, preds_valid)
  f1_score_train = metrics.f1_score(ytrain, preds_train)
  print(f'Fold {fold} f1-score-train: ', f1_score_train)
  print(f'Fold {fold} f1-score-Valid: ', f1_score_valid)
  scores_train.append(f1_score_train)
  scores_valid.append(f1_score_valid)
    
print(np.mean(scores_train), np.std(scores_train))
print(np.mean(scores_valid), np.std(scores_valid))

Fold 0 f1-score-train:  0.6389380530973451
Fold 0 f1-score-Valid:  0.62778730703259
Fold 1 f1-score-train:  0.6241134751773049
Fold 1 f1-score-Valid:  0.656934306569343
Fold 2 f1-score-train:  0.6384648931530746
Fold 2 f1-score-Valid:  0.6533333333333333
Fold 3 f1-score-train:  0.6543624161073825
Fold 3 f1-score-Valid:  0.6223776223776225
Fold 4 f1-score-train:  0.6407599309153714
Fold 4 f1-score-Valid:  0.6447140381282496
0.6393277536900958 0.009596487598381852
0.6410293214882277 0.013719848550097944


#### **Setting Models and Parameters for Voting Classifier**

In [None]:
params_bg = {'n_estimators': 533, 'max_samples': 32}
model_bg = BaggingClassifier(**params_bg, random_state=7)

params_lgb = {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 47, 'max_depth': 17, 'learning_rate': 0.331,
            'n_estimators': 469, 'reg_alpha': 30.654408167803027, 'reg_lambda': 8.742258358130245,
            'subsample': 0.45, 'subsample_freq': 9, 'colsample_bytree': 0.49, 'min_child_samples': 26, 'min_child_weight': 32}
model_lgb = LGBMClassifier(**params_lgb, random_state=6)


params_mlp = {'alpha': 0.09631013728513668, 'hidden_layer_sizes': 7, 'max_iter': 30}
model_mlp= MLPClassifier(**params_mlp,random_state=17,tol=1e-4)


params_dt = {'max_leaf_nodes': 6, 'max_depth': 254, 'criterion': 'entropy', 'class_weight': 'balanced'}

model_dt = DecisionTreeClassifier(**params_dt, random_state=42)


### **Final Model: Voting Classifier**

In [None]:
scores_train = []
scores_valid = []
for fold in range(5):
  xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
  xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
  xtest = df_test.copy()

  ytrain = xtrain.Revenue
  yvalid = xvalid.Revenue
  
  
  xtrain = xtrain[useful_cols]
  xvalid = xvalid[useful_cols]
  xtest = xtest[useful_cols]

  xtrain[numerical] = scaler.fit_transform(xtrain[numerical])
  xvalid[numerical] = scaler.transform(xvalid[numerical])
  xtest[numerical] = scaler.transform(xtest[numerical])

  model_vclf = VotingClassifier(estimators=[ ('BaggingClassifier', model_bg),      
                                              ('LightGBM', model_lgb), 
                                              ('MLPClassifier', model_mlp),
                                              ('DecisionTree', model_dt)
                                              ],
                                                voting='hard') 
  model_vclf.fit(xtrain,ytrain)

  preds_valid = model_vclf.predict(xvalid)
  preds_train = model_vclf.predict(xtrain)
  f1_score_valid = metrics.f1_score(yvalid, preds_valid)
  f1_score_train = metrics.f1_score(ytrain, preds_train)
  print(f"Training Acc for fold: {fold}: {model_vclf.score(xtrain,ytrain)}")
  print(f"Validation Acc for fold: {fold}: {model_vclf.score(xvalid,yvalid)}")
  print(f'Fold {fold} f1-score-train: ', f1_score_train)
  print(f'Fold {fold} f1-score-Valid: ', f1_score_valid)
  scores_train.append(f1_score_train)
  scores_valid.append(f1_score_valid)
print(np.mean(scores_train), np.std(scores_train))   
print(np.mean(scores_valid), np.std(scores_valid))

Training Acc for fold: 0: 0.9016601191230516
Validation Acc for fold: 0: 0.8991383679675621
Fold 0 f1-score-train:  0.6672384219554032
Fold 0 f1-score-Valid:  0.6666666666666665
Training Acc for fold: 1: 0.8996324927132175
Validation Acc for fold: 1: 0.9016725798276736
Fold 1 f1-score-train:  0.6541484716157204
Fold 1 f1-score-Valid:  0.6523297491039427
Training Acc for fold: 2: 0.8987454061589152
Validation Acc for fold: 2: 0.8966041561074506
Fold 2 f1-score-train:  0.6575225032147451
Fold 2 f1-score-Valid:  0.6709677419354839
Training Acc for fold: 3: 0.9000126726650615
Validation Acc for fold: 3: 0.8920425747592499
Fold 3 f1-score-train:  0.6697362913352868
Fold 3 f1-score-Valid:  0.6243386243386243
Training Acc for fold: 4: 0.8992650785605677
Validation Acc for fold: 4: 0.902129817444219
Fold 4 f1-score-train:  0.6550976138828633
Fold 4 f1-score-Valid:  0.6608084358523726
0.6607486604008038 0.006462188851865252
0.655022243579418 0.01656541906558489


## **We can use the final model i.e. ``voting classifier`` for deployment as it has decent mean_f1-score and satisfactory standard deviation.**

In [None]:
"""Save the Bagging CLF model"""
output_file = f'model_bg.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump(model_bg, f_out)

print(f'the model is saved to {output_file}')



"""Save the LGB model"""
output_file = f'model_lgb.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump(model_lgb, f_out)


print(f'the model is saved to {output_file}')



"""Save the DT model"""
output_file = f'model_dt.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump(model_dt, f_out)
print(f'the model is saved to {output_file}')



"""Save the MLP model"""
output_file = f'model_mlp.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump(model_mlp, f_out)
print(f'the model is saved to {output_file}')


"""Save the Voting CLF model"""
output_file = f'model_vclf.bin'
with open(output_file, 'wb') as f_out:
    pickle.dump(model_vclf, f_out)
print(f'the model is saved to {output_file}')

the model is saved to model_bg.bin
the model is saved to model_lgb.bin
the model is saved to model_dt.bin
the model is saved to model_mlp.bin
the model is saved to model_vclf.bin


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn import preprocessing
import pickle

In [None]:
model_file = 'model_bg.bin'

with open(model_file, 'rb') as f_in:
    model_bg = pickle.load(f_in)


model_file = 'model_lgb.bin'
with open(model_file, 'rb') as f_in:
    model_lgb = pickle.load(f_in)


model_file = 'model_dt.bin'
with open(model_file, 'rb') as f_in:
    model_dt = pickle.load(f_in)


model_file = 'model_mlp.bin'
with open(model_file, 'rb') as f_in:
    model_mlp = pickle.load(f_in)


model_file = 'model_vclf.bin'
with open(model_file, 'rb') as f_in:
    model_vclf = pickle.load(f_in)

In [None]:
def preprocess_train(df_train, y_train):
  useful_cols = [col for col in df_train.columns if col not in ['id', 'Revenue', 'kfold']]
  categorical = [col for col in useful_cols if df_train[col].dtype in ['object', 'bool']]
  numerical = [col for col in useful_cols if col not in categorical]

  # Label-Encoding boolean variable:
  le = preprocessing.LabelEncoder()
  df_train.Weekend = le.fit_transform(df_train.Weekend)

  # OHE
  dicts = df_train.to_dict(orient='records')
  dv = DictVectorizer(sparse=False)
  df_train = pd.DataFrame(dv.fit_transform(dicts),columns=list(dv.get_feature_names_out()))
  columns = list(dv.get_feature_names_out())

  # PT
  pt = PowerTransformer()
  pt_num_tr = pd.DataFrame(pt.fit_transform(df_train[columns]),columns=columns)
  df_train = pd.concat([df_train.drop(columns, axis = 1),pt_num_tr],axis=1)

  # Scaling
  scaler= preprocessing.RobustScaler()
  df_train = scaler.fit_transform(df_train)

  # Models
  params_bg = {'n_estimators': 533, 'max_samples': 32}
  model_bg = BaggingClassifier(**params_bg, random_state=7)

  params_lgb = {'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 47,
                'max_depth': 17, 'learning_rate': 0.331,
              'n_estimators': 469, 'reg_alpha': 30.654408167803027, 'reg_lambda': 8.742258358130245,
              'subsample': 0.45, 'subsample_freq': 9,
                'colsample_bytree': 0.49, 'min_child_samples': 26, 'min_child_weight': 32}
  model_lgb = LGBMClassifier(**params_lgb, random_state=6)


  params_mlp = {'alpha': 0.09631013728513668, 'hidden_layer_sizes': 7, 'max_iter': 30}
  model_mlp= MLPClassifier(**params_mlp,random_state=17,tol=1e-4)


  params_dt = {'max_leaf_nodes': 6, 'max_depth': 254, 'criterion': 'entropy', 'class_weight': 'balanced'}
  model_dt = DecisionTreeClassifier(**params_dt, random_state=42)

  # Final Model 
  model = VotingClassifier(estimators=[ ('BaggingClassifier', model_bg),      
                                              ('LightGBM', model_lgb), 
                                              ('MLPClassifier', model_mlp),
                                              ('DecisionTree', model_dt)],
                                                voting='hard') 
  model.fit(df_train,y_train)

  return le, dv, pt, scaler,model


def predict(df,pt,scaler,dv, model):
    df.Weekend = le.transform(df.Weekend)
    dicts = df.to_dict(orient='records')
    X = dv.transform(dicts)
    X = pt.transform(X)
    X = scaler.transform(X)
    y_pred = model.predict(X)

    return y_pred

In [None]:
"""Trying out Model-Prediction on first 20 rows"""
xtrain = df.drop('Revenue', axis = 1).copy()
ytrain = df.Revenue.copy()
le, dv, pt, scaler,model= preprocess_train(xtrain, ytrain)

df_q = df.head(20)
xtrain = df_q.drop(['Revenue', 'kfold'],axis = 1)
ytrain = df_q.Revenue
preds_train = predict(xtrain,pt,scaler, dv, model)
print([(i,j) for i, j in zip(ytrain, preds_train)][:20])

[(0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (1, 1)]
