In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datatable as dt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from datatable.models import Ftrl
import pickle
import xgboost as xgb
from sklearn.metrics import make_scorer, matthews_corrcoef
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df = pd.read_csv('final_df_train_3.0.csv')
df.fillna(0,inplace=True)
df.drop(['Unnamed: 0','kurtosis','max_num_value','skewness','skewness_imp_st','mean_num_value','sum_num_value_imp_st','sum_num_value','mean_num_value_imp_st'],axis=1,inplace=True)

In [2]:
df_test = pd.read_csv('final_df_test_3.0.csv')
df_test.fillna(0,inplace=True)
df_test.drop(['Unnamed: 0','kurtosis','max_num_value','skewness','skewness_imp_st','mean_num_value','sum_num_value_imp_st','sum_num_value','mean_num_value_imp_st'],axis=1,inplace=True)

In [4]:
Y = df['Response']
X= df.drop(['Id','Response'],axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=42, shuffle=True)

# Base Model

In [6]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
y_pred = clf.predict(X_test)

In [8]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
matthews_corrcoef(y_test,y_pred)

-0.00027405881426615394

# Custom Ensemble

In [6]:
X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train,y_train,test_size=0.5, random_state=42, shuffle=True)

In [6]:
model_xgb=xgb.XGBClassifier(booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=3.2, learning_rate=0.5,
       max_delta_step=0, max_depth=3, min_child_weight=12, missing=1,
       n_estimators=65, n_jobs=1,random_state=0,
       objective='binary:logistic', reg_alpha=1.6,
       reg_lambda=0.2, scale_pos_weight=1, seed=None, silent=True,
       subsample=1,use_label_encoder=False)

model_dt=DecisionTreeClassifier(splitter='best',min_weight_fraction_leaf=0.1,min_samples_leaf=1,max_leaf_nodes=30,max_features='sqrt',max_depth=9)

model_lgb = lgb.LGBMClassifier(colsample_bytree= 0.43455527965836527,
 min_child_samples= 198,
 min_child_weight= 10.0,
 num_leaves= 12,
 reg_alpha= 0.1,
 reg_lambda= 0.1,
 subsample= 0.36181919064442924)

model_adb = AdaBoostClassifier(n_estimators=7,learning_rate=0.97,algorithm='SAMME.R')

model_gbdt = GradientBoostingClassifier(n_estimators=16,min_samples_split=0.5,min_samples_leaf=0.2,max_features=7,max_depth=21.0,learning_rate=1)

model_rf=RandomForestClassifier(n_estimators=1800,min_samples_split=10,min_samples_leaf=2,max_features='sqrt',max_depth=10,bootstrap=True)

#model_svm = SVC(kernel = 'rbf', gamma = 0.0001, C = 10, probability = True)

In [24]:
models_to_train = [model_xgb,model_dt,model_lgb,model_adb,model_gbdt,model_rf]

In [26]:
from copy import copy
df1 = copy(X_train1)
df1['Response'] = y_train1

In [27]:
def custom_ensemble(model_list,dateframe,x2,y2,xtest,ytest,k):
    n = len(dateframe)//k
    sample_df = dateframe.sample(n)
    
    x1 = sample_df.drop('Response',axis=1)
    y1 = sample_df['Response']
    
    predict_df = pd.DataFrame()
    predict_test_df = pd.DataFrame()
    predict_df_kaggle = pd.DataFrame()
    
    predict_prob_df = pd.DataFrame()
    predict_test_prob_df = pd.DataFrame()
    predict_prob_df_kaggle = pd.DataFrame()
    
    for i in tqdm(range(k)):
        model = model_list[i]
        model.fit(x1,y1)
        
        y_pred = model.predict(x2)
        y_pred_prob = model.predict_proba(x2)
        
        predict_df[str(i+1)+'_prediction'] = y_pred
        predict_prob_df[str(i+1)+'_prediction_prob 0'] = y_pred_prob[:,0]
        predict_prob_df[str(i+1)+'_prediction_prob 1'] = y_pred_prob[:,1]
        
        
        y_pred_test = model.predict(xtest)
        y_pred_prob_test = model.predict_proba(xtest)
        
        predict_test_df[str(i+1)+'_prediction'] = y_pred_test
        predict_test_prob_df[str(i+1)+'_prediction_prob 0'] = y_pred_prob_test[:,0]
        predict_test_prob_df[str(i+1)+'_prediction_prob 1'] = y_pred_prob_test[:,1]
        
        '''y_pred_kaggle = model.predict(df_test.drop('Id',axis=1))
        y_pred_prob_kaggle = model.predict_proba(df_test.drop('Id',axis=1))
        
        predict_df_kaggle[str(i+1)+'_prediction'] = y_pred_kaggle
        predict_prob_df_kaggle[str(i+1)+'_prediction_prob 0'] = y_pred_prob_kaggle[:,0]
        predict_prob_df_kaggle[str(i+1)+'_prediction_prob 1'] = y_pred_prob_kaggle[:,1]'''
        
        print(f"After {i+1}th model the final shape of train data is {predict_df.shape} and {predict_prob_df.shape}")
        print(f"After {i+1}th model the final shape of test data is {predict_test_df.shape} and {predict_test_prob_df.shape}")
        
    return predict_df, predict_prob_df, predict_test_df, predict_test_prob_df

In [29]:
import warnings
warnings.filterwarnings('ignore')
df_train, df_train_prob_df, df_test, df_test_prob_df = custom_ensemble(models_to_train,df1,X_train2,y_train2,X_test,y_test,len(models_to_train))

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




 17%|██████████████                                                                      | 1/6 [00:23<01:56, 23.40s/it]

After 1th model the final shape of train data is (473499, 1) and (473499, 2)
After 1th model the final shape of test data is (236750, 1) and (236750, 2)


 33%|████████████████████████████                                                        | 2/6 [00:27<00:48, 12.10s/it]

After 2th model the final shape of train data is (473499, 2) and (473499, 4)
After 2th model the final shape of test data is (236750, 2) and (236750, 4)


 50%|██████████████████████████████████████████                                          | 3/6 [00:49<00:49, 16.50s/it]

After 3th model the final shape of train data is (473499, 3) and (473499, 6)
After 3th model the final shape of test data is (236750, 3) and (236750, 6)


 67%|████████████████████████████████████████████████████████                            | 4/6 [01:04<00:31, 16.00s/it]

After 4th model the final shape of train data is (473499, 4) and (473499, 8)
After 4th model the final shape of test data is (236750, 4) and (236750, 8)


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [01:09<00:12, 12.06s/it]

After 5th model the final shape of train data is (473499, 5) and (473499, 10)
After 5th model the final shape of test data is (236750, 5) and (236750, 10)


100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [10:27<00:00, 104.56s/it]

After 6th model the final shape of train data is (473499, 6) and (473499, 12)
After 6th model the final shape of test data is (236750, 6) and (236750, 12)





In [40]:
metamodel = xgb.XGBClassifier(booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=3.2, learning_rate=0.5,
       max_delta_step=0, max_depth=3, min_child_weight=12, missing=1,
       n_estimators=65, n_jobs=1,random_state=0,
       objective='binary:logistic', reg_alpha=1.6,
       reg_lambda=0.2, scale_pos_weight=1, seed=None, silent=True,
       subsample=1,use_label_encoder=False)

In [33]:
metamodel.fit(df_train,y_train2)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=3.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=3,
              min_child_weight=12, missing=1, monotone_constraints='()',
              n_estimators=65, n_jobs=1, num_parallel_tree=1, random_state=0,
              reg_alpha=1.6, reg_lambda=0.2, scale_pos_weight=1, seed=0,
              silent=True, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [37]:
X_test.shape

(236750, 282)

In [39]:
y_pred = metamodel.predict(df_test)
matthews_corrcoef(y_pred,y_test)

0.27024204458214207

In [41]:
metamodel.fit(df_train_prob_df,y_train2)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=3.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=3,
              min_child_weight=12, missing=1, monotone_constraints='()',
              n_estimators=65, n_jobs=1, num_parallel_tree=1, random_state=0,
              reg_alpha=1.6, reg_lambda=0.2, scale_pos_weight=1, seed=0,
              silent=True, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [42]:
y_pred = metamodel.predict(df_test_prob_df)
matthews_corrcoef(y_pred,y_test)

0.2882907990820504

# Voting Classifier

In [7]:
estimator = []
estimator.append(('xgb', model_xgb))
estimator.append(('dt', model_dt))
estimator.append(('lgm', model_lgb))
estimator.append(('adb',model_adb))
estimator.append(('gbdt',model_gbdt))
estimator.append(('rf',model_rf))

In [8]:
from sklearn.ensemble import VotingClassifier
vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
vot_soft.fit(X_train, y_train)
y_soft = vot_soft.predict(X_test)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [9]:
matthews_corrcoef(y_test,y_soft)

0.1334819530614064

In [10]:
y_pred_test = vot_soft.predict(df_test.drop('Id',axis=1))

In [11]:
c=0
for i in y_pred_test:
    if i == 1:
        c+=1
c

155

## Summarizing the result using Python Pretty Table 

In [1]:
from prettytable import PrettyTable

myTable = PrettyTable(["Model Name", "Train Result", "Test/Validation Result", "Kaggle Private Result","Kaggle Public Result"])

myTable.add_row(["XGBoost", "0.325", "0.303", "0.307","0.298"])
myTable.add_row(["Random Forest", "0.237", "0.198", "----","----"])
myTable.add_row(["Light GBM", "0.321", "0.303", "0.303","0.305"])
myTable.add_row(["Ada Boost", "0.221", "0.206", "0.214","0.218"])
myTable.add_row(["Custom Ensemble", "----", "0.288", "----","----"])
myTable.add_row(["Voting Classifier", "----", "0.133", "----","----"])

print(myTable)

+-------------------+--------------+------------------------+-----------------------+----------------------+
|     Model Name    | Train Result | Test/Validation Result | Kaggle Private Result | Kaggle Public Result |
+-------------------+--------------+------------------------+-----------------------+----------------------+
|      XGBoost      |    0.325     |         0.303          |         0.307         |        0.298         |
|   Random Forest   |    0.237     |         0.198          |          ----         |         ----         |
|     Light GBM     |    0.321     |         0.303          |         0.303         |        0.305         |
|     Ada Boost     |    0.221     |         0.206          |         0.214         |        0.218         |
|  Custom Ensemble  |     ----     |         0.288          |          ----         |         ----         |
| Voting Classifier |     ----     |         0.133          |          ----         |         ----         |
+------------------

# At the end I conclude that XGBoost with some hyperparameter tuning giving me the best Result 