In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import ADS

In [None]:
# Load the training ADS and discard the index dump and title dummy columns in one chain
input_ads_pre = (
    pd.read_csv('../input/titanic/train_data.csv')
    .drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4'])
)

# Quick sanity check: dimensions first, then the leading rows as the cell output
print(input_ads_pre.shape)
input_ads_pre.head()

# Null Check

In [None]:
# One-row frame holding the number of missing values in each column
input_ads_pre.isna().sum().to_frame().T

# Description of Target Variable

In [None]:
# Class balance of the target: number of rows per value of 'Survived' in the training data
input_ads_pre.loc[:, 'Survived'].value_counts()

# Shuffling the data

In [None]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed, then renumber the index from 0
input_ads = shuffle(input_ads_pre, random_state=100).reset_index(drop=True)

print(input_ads.shape)
input_ads.head(3)

# Train-Test manipulation of data

In [None]:
target = 'Survived' #To predict

#--------------------------------------------------------------------------------
#Splitting the training data into features (X) and label (y).
#Drop exactly the target column: a substring test on the column names would also
#discard any feature whose name merely contains the target name.
X = input_ads.drop(columns=[target])
y = input_ads[target]

#--------------------------------------------------------------------------------
#Since test data is already placed in the input folder separately, we will just import it
test_ads_pre = (
    pd.read_csv('../input/titanic/test_data.csv')
    .drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4']) #Dropping un-necessary columns
)
test_ads = shuffle(test_ads_pre,random_state=100).reset_index(drop=True)

#Splitting the test data into features and label.
#The features are selected in the training column order because the models below
#consume plain numpy arrays, where columns are matched by position only.
X_test = test_ads[X.columns]
y_test = test_ads[target]

print('Train % of total data:',100 * X.shape[0]/(X.shape[0] + X_test.shape[0]))
#--------------------------------------------------------------------------------
#Manipulation of datasets for convenience and consistency
X_arr = np.array(X)
X_test_arr = np.array(X_test)

#Labels are kept as (n_samples, 1) column vectors
y_arr = np.array(y).reshape(X_arr.shape[0],1)
y_test_arr = np.array(y_test).reshape(X_test_arr.shape[0],1)

#--------------------------------------------------------------------------------
#Basic Summary
print(X_arr.shape)
print(X_test_arr.shape)
print(y_arr.shape)

# Stacking Wrapper Logic from scratch

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Importing all the necessary models for base model purpose

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

## UDF for Stacking Ensemble Training (Clf)

In [None]:
def stacking_ensemble_clf_training(data_x,data_y,estimator_tray,meta_estimator,passthrough=True):
    """Fit a two-level stacking ensemble of classifiers.

    Every level 1 (base) estimator is fitted on the full training data and then
    asked to predict that same data. The predictions, one column per base
    estimator, become the features of the level 2 (meta) estimator.

    NOTE: the meta-features are in-sample predictions of the base estimators,
    so the meta estimator can overfit to base models that memorise the training
    data. sklearn's StackingClassifier uses cross-validated predictions instead.

    Parameters
    ----------
    data_x : 2-D array of shape (n_samples, n_features)
        Training features.
    data_y : array of shape (n_samples,) or (n_samples, 1)
        Training labels. A single-column 2-D array is flattened to 1-D before
        being handed to the estimators.
    estimator_tray : iterable of estimators
        Unfitted base estimators exposing ``fit`` and ``predict``. They are
        fitted in place (the same objects are returned).
    meta_estimator : estimator
        Level 2 estimator exposing ``fit``. Fitted in place.
    passthrough : bool, default True
        If truthy, the meta estimator is trained on the original features
        followed by the base predictions; otherwise on the base predictions only.

    Returns
    -------
    fit_level_1 : list
        The fitted base estimators, in tray order.
    meta_estimator
        The fitted meta estimator.
    """
    n_samples = len(data_x)

    #Estimators generally expect 1-D labels, so flatten an (n_samples, 1) column vector
    y_fit = np.asarray(data_y)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    fit_level_1 = [] #Fitted level 1 models
    #Zero-width float block: keeps the stacked predictions at (at least) float dtype
    #and gives a well-formed (n_samples, 0) array when the tray is empty
    level_1_preds = [np.empty((n_samples,0))]

    for estimator in estimator_tray: #Creating level 1 models

        estimator.fit(data_x,y_fit)
        level_1_preds.append(np.asarray(estimator.predict(data_x)).reshape(n_samples,1))
        fit_level_1.append(estimator)

    #Single concatenation instead of re-copying the whole array once per estimator
    train_pred_arr = np.concatenate(level_1_preds,axis=-1)

    if passthrough: #Training data is passed to level 2 alongside the base predictions
        meta_features = np.append(data_x,train_pred_arr,axis=-1)
    else: #Level 2 sees the base predictions only
        meta_features = train_pred_arr

    meta_estimator.fit(meta_features,y_fit)

    return fit_level_1,meta_estimator


## UDF for Stacking Ensemble Prediction

In [None]:
def stacking_ensemble_clf_predict(data_x_test,data_y_test,fit_level_1_tray,meta_estimator,passthrough=True):
    """Predict with a fitted two-level stacking ensemble.

    Each fitted level 1 estimator predicts ``data_x_test``; the predictions,
    one column per estimator, are fed to the fitted meta estimator.

    Parameters
    ----------
    data_x_test : 2-D array of shape (n_samples, n_features)
        Features to predict on.
    data_y_test : any
        Never read by this function; kept in the signature so existing callers
        keep working.
    fit_level_1_tray : iterable of fitted estimators
        Fitted base estimators exposing ``predict``, in the order used for training.
    meta_estimator : fitted estimator
        Fitted level 2 estimator exposing ``predict``.
    passthrough : bool, default True
        Must match the value used for training. If truthy, the meta estimator
        receives the original features followed by the base predictions;
        otherwise the base predictions only.

    Returns
    -------
    The output of ``meta_estimator.predict`` on the assembled meta-features.
    """
    n_samples = len(data_x_test)

    #Zero-width float block: keeps the stacked predictions at (at least) float dtype
    #and gives a well-formed (n_samples, 0) array when the tray is empty
    level_1_preds = [np.empty((n_samples,0))]

    for estimator in fit_level_1_tray: #Predicting with each base model
        level_1_preds.append(np.asarray(estimator.predict(data_x_test)).reshape(n_samples,1))

    #Single concatenation instead of re-copying the whole array once per estimator
    test_pred_arr = np.concatenate(level_1_preds,axis=-1)

    if passthrough: #Should match with training
        meta_features = np.append(data_x_test,test_pred_arr,axis=-1)
    else: #Should match with training
        meta_features = test_pred_arr

    return meta_estimator.predict(meta_features)


## Invoking UDF for Stacking Ensemble Training (Clf)

In [None]:
#Level 2 (meta) classifier
xgb_ = xgb.XGBClassifier(random_state=100)

#Level 1 (base) classifiers
log_reg = LogisticRegression(solver='sag', random_state=100)
sgd_clf = SGDClassifier(random_state=100)
knn = KNeighborsClassifier(n_neighbors=3)
dt_clf = DecisionTreeClassifier(random_state=100)
rf_clf = RandomForestClassifier(random_state=100)
svc = SVC(random_state=100)

level_1 = [log_reg, sgd_clf, knn, dt_clf, rf_clf, svc]

#Fit the base models, then the meta model on the features plus the base predictions
fit_level_1, meta_estimator = stacking_ensemble_clf_training(
    data_x=X_arr,
    data_y=y_arr,
    estimator_tray=level_1,
    meta_estimator=xgb_,
    passthrough=True,
)


## Invoking prediction UDF for Stacking Ensemble

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

#Predictions of the manual ensemble on the test set (passthrough matches training)
final_preds = stacking_ensemble_clf_predict(
    data_x_test=X_test_arr,
    data_y_test=y_test_arr,
    fit_level_1_tray=fit_level_1,
    meta_estimator=meta_estimator,
    passthrough=True,
)

#Evaluation of the manual ensemble: each metric compares the true test labels with the predictions
for label, metric in (('ROC AUC of test set :', roc_auc_score),
                      ('Accuracy of test set :', accuracy_score)):
    print(label, metric(y_test_arr, final_preds))

# Sklearn Benchmark

In [None]:
#Base estimators as (name, estimator) pairs, built from a name -> estimator mapping
estimators_list = list({
    'log_reg': LogisticRegression(solver='sag', random_state=100),
    'sgd_clf': SGDClassifier(random_state=100),
    'knn': KNeighborsClassifier(n_neighbors=3),
    'dt_clf': DecisionTreeClassifier(random_state=100),
    'rf_clf': RandomForestClassifier(random_state=100),
    'svc': SVC(random_state=100),
}.items())

#Meta classifier
xgb_ = xgb.XGBClassifier(random_state=100)

## Sklearn implementation of Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

#Reference implementation configured with the same estimators and passthrough setting
stacking = StackingClassifier(
    estimators=estimators_list,
    final_estimator=xgb_,
    stack_method='predict',
    passthrough=True,
    n_jobs=-1,
)

#fit returns the fitted estimator, so training and test prediction are chained
stacking_pred = stacking.fit(X_arr, y_arr).predict(X_test_arr)

#Evaluating the sklearn ensemble with the same two metrics
for label, metric in (('ROC AUC of test set :', roc_auc_score),
                      ('Accuracy of test set :', accuracy_score)):
    print(label, metric(y_test_arr, stacking_pred))

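### Insights: The accuracy and ROC-AUC scores of the sklearn and manual implementations are the same, which indicates that the manual stacking logic is implemented correctly. Note that sklearn's `StackingClassifier` trains its final estimator on cross-validated predictions of the base models, whereas the manual version uses in-sample predictions, so the two are not guaranteed to match on other datasets.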

# END