In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print the paths one per line (same order as os.walk yields them).
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train_data.csv
/kaggle/input/titanic/test_data.csv


# Input of ADS

In [2]:
# Read the training file and drop the columns that are not needed, in one chained
# expression (no in-place mutation, so re-running the cell always gives the same frame).
input_ads_pre = (
    pd.read_csv('../input/titanic/train_data.csv')
      .drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4'])
)

# Quick sanity check: dimensions first, then the first few rows
print(input_ads_pre.shape)
input_ads_pre.head()

(792, 12)


Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Family_size,Emb_1,Emb_2,Emb_3
0,1,0,1,0.275,0.014151,0,0,1,0.1,0,0,1
1,2,1,0,0.475,0.139136,1,0,0,0.1,1,0,0
2,3,1,0,0.325,0.015469,0,0,1,0.0,0,0,1
3,4,1,0,0.4375,0.103644,1,0,0,0.1,0,0,1
4,5,0,1,0.4375,0.015713,0,0,1,0.0,0,0,1


# Null Check

In [3]:
# Missing-value count per column, laid out as a single wide row
input_ads_pre.isnull().sum().to_frame().T

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Family_size,Emb_1,Emb_2,Emb_3
0,0,0,0,0,0,0,0,0,0,0,0,0


# Description of target variable

In [4]:
# Class balance of the target: number of training rows per value of 'Survived'
survived_counts = input_ads_pre['Survived'].value_counts()
survived_counts

0    486
1    306
Name: Survived, dtype: int64

# Shuffling the ADS

In [5]:
from sklearn.utils import shuffle
#np.random.seed(100)

# Shuffle the rows with a fixed seed and renumber the index from 0 in one step
input_ads = shuffle(input_ads_pre, random_state=100).reset_index(drop=True)

print(input_ads.shape)
input_ads.head(3)

(792, 12)


Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Family_size,Emb_1,Emb_2,Emb_3
0,371,1,1,0.3125,0.108215,1,0,0,0.1,1,0,0
1,556,0,1,0.775,0.051822,1,0,0,0.0,0,0,1
2,624,0,1,0.2625,0.01533,0,0,1,0.0,0,0,1


# Manipulation of data into train and test

In [6]:
target = 'Survived' #To predict

#--------------------------------------------------------------------------------
#Splitting into X & Y datasets (supervised training)
#The target is excluded by exact name comparison; a substring test (`target not in cols`)
#would also silently discard any feature whose name merely contains the target name.
#NOTE(review): PassengerId is still part of X (hence the 11 feature columns). It looks like a
#row identifier on a much larger scale than the other, 0-1 ranged, columns and is presumably
#why the logistic regression further down predicts a single class - consider dropping it.
X = input_ads[[cols for cols in input_ads.columns if cols != target]]
y = input_ads[target]

#--------------------------------------------------------------------------------
#Since test data is already placed in the input folder separately, we will just import it
#(same un-necessary columns dropped as for the training data, without in-place mutation)
test_ads_pre = (
    pd.read_csv('../input/titanic/test_data.csv')
      .drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4'])
)
test_ads = shuffle(test_ads_pre,random_state=100).reset_index(drop=True)

#Splitting into X & Y datasets (supervised training)
#Select the test features by the training column list: the frames are converted to bare
#numpy arrays below, so the column order must match the training data exactly
#(a missing column raises a KeyError here instead of silently misaligning features).
X_test = test_ads[list(X.columns)]
y_test = test_ads[target]

print('Train % of total data:',100 * X.shape[0]/(X.shape[0] + X_test.shape[0]))
#--------------------------------------------------------------------------------
#Manipulation of datasets for convenience and consistency
X_arr = np.array(X)
X_test_arr = np.array(X_test)

#Targets are kept as column vectors of shape (n_rows, 1)
y_arr = np.array(y).reshape(X_arr.shape[0],1)
y_test_arr = np.array(y_test).reshape(X_test_arr.shape[0],1)

#--------------------------------------------------------------------------------
#Basic Summary
print(X_arr.shape)
print(X_test_arr.shape)
print(y_arr.shape)

Train % of total data: 88.78923766816143
(792, 11)
(100, 11)
(792, 1)


# Independent Logistic Regression & Decision Tree model

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import tree
#max_depth=2,min_samples_split=7,min_samples_leaf=8,

#sklearn estimators expect a 1-D label vector of shape (n_samples,). y_arr is stored as a
#column vector (n_samples, 1), so a flattened view is passed to fit() instead of relying on
#sklearn's implicit conversion (whose warning is hidden by the global warnings filter).
y_train_1d = y_arr.ravel()

#Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=100)
dt_clf.fit(X_arr,y_train_1d)
sklearn_preds_dt = dt_clf.predict(X_test_arr)

#---------------------------------------------------------------------------
#Logistic Regression
lr_clf = LogisticRegression(solver='sag',random_state=100)
lr_clf.fit(X_arr,y_train_1d)
sklearn_preds_lr = lr_clf.predict(X_test_arr)

#Evaluation of the predictions
#Both metrics are computed on the hard class predictions (not on predicted probabilities)
print('#- Decision Tree ---------------------------------------------------#')
print('ROC AUC of test set :',roc_auc_score(y_test_arr,sklearn_preds_dt))
print('Accuracy of test set :',accuracy_score(y_test_arr,sklearn_preds_dt),'\n')

print('#- Logistic Regression ---------------------------------------------#')
print('ROC AUC of test set :',roc_auc_score(y_test_arr,sklearn_preds_lr))
print('Accuracy of test set :',accuracy_score(y_test_arr,sklearn_preds_lr),'\n')

#- Decision Tree ---------------------------------------------------#
ROC AUC of test set : 0.7986111111111112
Accuracy of test set : 0.82 

#- Logistic Regression ---------------------------------------------#
ROC AUC of test set : 0.5
Accuracy of test set : 0.64 



# Bagging wrapper logic from scratch

## UDF for bootstrap sampling

In [8]:
#UDF for bootstrapping sampling logic
def bootstrapped_sample(arr,random_state):
    """Draw a bootstrap sample from `arr`: rows picked at random with replacement.

    Parameters
    ----------
    arr : array-like
        Data to resample along its first axis. Anything `np.asarray` accepts.
    random_state : int
        Seed for the draw; the same seed always yields the same sample.

    Returns
    -------
    numpy.ndarray
        Array with the same number of rows as `arr`, each row taken from `arr`
        (rows may repeat, others may be missing).
    """
    arr = np.asarray(arr)
    n_rows = len(arr)

    # A private RandomState produces the same draws as seeding numpy's global generator
    # with `random_state`, but leaves the caller's global random state untouched.
    rng = np.random.RandomState(random_state)

    boot_sample_idx = rng.choice(n_rows,size=n_rows,replace=True)
    boot_sample = arr[boot_sample_idx]

    return boot_sample

## UDF for Bagging (works very similarly for regression as well)

In [9]:
def bagging_ensemble_clf(estimator,X_arr_,y_arr_,test,n_iters,threshold=0.5):
    """Bagging by hand: fit `estimator` on bootstrap samples and vote on `test`.

    For every round `n` in `range(n_iters)` a bootstrap sample of the joined
    (features, target) rows is drawn with seed `n`, the estimator is refit on it and
    its predictions for `test` are stored. The stored predictions are averaged per
    test row and a row is labelled 1 when that average is strictly greater than
    `threshold`, otherwise 0. Averaging predictions only makes sense for numeric
    labels; the 0/1 output assumes a binary 0/1 target.

    Parameters
    ----------
    estimator : object with `fit(X, y)` and `predict(X)`
        Refit in place on every round, so after the call it holds the last fit.
    X_arr_ : array of shape (n_rows, n_features)
        Training features.
    y_arr_ : array of shape (n_rows, 1) or (n_rows,)
        Training target.
    test : array of shape (n_test, n_features)
        Rows to predict.
    n_iters : int
        Number of bootstrap rounds; must be at least 1.
    threshold : float, default 0.5
        Cut-off applied to the averaged predictions.

    Returns
    -------
    numpy.ndarray of shape (n_test,)
        Integer 0/1 prediction per test row.

    Raises
    ------
    ValueError
        If `n_iters` is smaller than 1 (there would be nothing to average).
    """
    # Without this guard zero rounds end in 0/0 = NaN and an all-zero "prediction"
    if n_iters < 1:
        raise ValueError('n_iters must be at least 1, got {}'.format(n_iters))

    X_arr_ = np.asarray(X_arr_)
    # Accept the target either as a flat vector or as a column vector
    y_column = np.asarray(y_arr_).reshape(len(X_arr_),1)

    # Features and target are joined so that one bootstrap draw resamples both together
    joint_arr = np.append(X_arr_,y_column,axis=-1)

    # One column of predictions per round, allocated once instead of re-appended every round
    n_test = len(test)
    pred_array = np.empty((n_test,n_iters),dtype=float)

    #Bootstrapping and model building iteratively
    for n in range(n_iters):

        sample = bootstrapped_sample(arr=joint_arr,random_state=n)

        X_sample = sample[:,0:-1]
        y_sample = sample[:,-1]

        estimator.fit(X_sample,y_sample)
        pred_array[:,n] = np.asarray(estimator.predict(test)).reshape(n_test)

    #--------------------------------------------------------------------------------------------------------
    #Aggregation
    # Share of rounds that predicted 1 for each test row, then the cut-off
    pred = np.sum(pred_array,axis=1)/n_iters
    pred = (pred>threshold).astype(int)
    print('Unique preds :',np.unique(pred))

    return pred

## Invoking Bagging UDF with Decision Tree model

In [10]:
# Base learner for the hand-written bagging ensemble
decision_tree = DecisionTreeClassifier(random_state=100)

# 500 bootstrap rounds; the result is the thresholded vote for every test row
preds_dt = bagging_ensemble_clf(
    estimator=decision_tree,
    X_arr_=X_arr,
    y_arr_=y_arr,
    test=X_test_arr,
    n_iters=500,
)
print(preds_dt.shape)

# Score the ensemble vote against the test labels
print('#- Manual Bagging w/ Decision Tree ---------------------------------------------------#')
manual_dt_auc = roc_auc_score(y_test_arr, preds_dt)
print('ROC AUC of test set :', manual_dt_auc)
manual_dt_acc = accuracy_score(y_test_arr, preds_dt)
print('Accuracy of test set :', manual_dt_acc, '\n')

Unique preds : [0 1]
(100,)
#- Manual Bagging w/ Decision Tree ---------------------------------------------------#
ROC AUC of test set : 0.8081597222222222
Accuracy of test set : 0.84 



## Invoking Bagging UDF with Logistic Regression model

In [11]:
# Base learner for the hand-written bagging ensemble.
# NOTE(review): the recorded run of this cell prints 'Unique preds : [0]', i.e. the ensemble
# predicts a single class for every test row (ROC AUC 0.5). Presumably the solver does not
# converge on the features as prepared - verify with the warnings filter switched off. Also
# note the standalone model earlier uses solver='sag' while this one uses 'saga'.
log_reg = LogisticRegression(solver='saga',random_state=100)

preds_lr = bagging_ensemble_clf(estimator=log_reg,X_arr_=X_arr,y_arr_=y_arr,test=X_test_arr,n_iters=500)
print(preds_lr.shape)

# Banner corrected: the base learner is a logistic regression, not a tree
print('#- Manual Bagging w/ Logistic Regression ---------------------------------------------------#')
print('ROC AUC of test set :',roc_auc_score(y_test_arr,preds_lr))
print('Accuracy of test set :',accuracy_score(y_test_arr,preds_lr),'\n')

Unique preds : [0]
(100,)
#- Manual Bagging w/ Logistic Regression Tree ---------------------------------------------------#
ROC AUC of test set : 0.5
Accuracy of test set : 0.64 



# Random Forest Classifier (Very simple after this point)

In [12]:
# Random forest = the same bagging loop, but each tree considers only a random subset of
# the features at every split (max_features='sqrt'); only a decision tree can play this role.
decision_tree = DecisionTreeClassifier(max_features='sqrt', random_state=100)

preds_rf = bagging_ensemble_clf(
    estimator=decision_tree,
    X_arr_=X_arr,
    y_arr_=y_arr,
    test=X_test_arr,
    n_iters=500,
)
print(preds_rf.shape)

# Score the forest's vote against the test labels
print('#- Manual Random Forest ---------------------------------------------------#')
manual_rf_auc = roc_auc_score(y_test_arr, preds_rf)
print('ROC AUC of test set :', manual_rf_auc)
manual_rf_acc = accuracy_score(y_test_arr, preds_rf)
print('Accuracy of test set :', manual_rf_acc, '\n')

Unique preds : [0 1]
(100,)
#- Manual Random Forest ---------------------------------------------------#
ROC AUC of test set : 0.8142361111111112
Accuracy of test set : 0.84 



# Sklearn Benchmarking

# Bagging with DT

In [13]:
from sklearn.ensemble import BaggingClassifier

decision_tree = DecisionTreeClassifier(random_state=100)

# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in newer scikit-learn releases, while the first positional slot is the
# base learner in both, so this call works on old and new versions alike.
bagging_skl = BaggingClassifier(decision_tree,
                                n_estimators=500,
                                max_features=1.0,
                                bootstrap=True,
                                random_state=100,
                                n_jobs=-1)
# sklearn expects 1-D labels; y_arr is stored as a column vector (n_samples, 1)
bagging_skl.fit(X_arr,y_arr.ravel())
bagging_skl_pred = bagging_skl.predict(X_test_arr)

#--------------------------------------------------------------------------------------------------------
print('#- Sklearn Bagging Classifier ---------------------------------------------------#')
print('ROC AUC of test set :',roc_auc_score(y_test_arr,bagging_skl_pred))
print('Accuracy of test set :',accuracy_score(y_test_arr,bagging_skl_pred),'\n')

#- Sklearn Bagging Classifier ---------------------------------------------------#
ROC AUC of test set : 0.8081597222222222
Accuracy of test set : 0.84 



# Sklearn Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Library random forest with the same number of trees, feature-subset rule and seed
# as the hand-written version, used as the benchmark.
rf_skl = RandomForestClassifier(n_estimators=500,
                                max_features='sqrt',
                                bootstrap=True,
                                random_state=100,
                                n_jobs=-1)
# sklearn expects 1-D labels; y_arr is stored as a column vector (n_samples, 1)
rf_skl.fit(X_arr,y_arr.ravel())
rf_skl_pred = rf_skl.predict(X_test_arr)

#--------------------------------------------------------------------------------------------------------
# Banner corrected: these are the random forest results, not the bagging classifier's
print('#- Sklearn Random Forest Classifier ---------------------------------------------------#')
print('ROC AUC of test set :',roc_auc_score(y_test_arr,rf_skl_pred))
print('Accuracy of test set :',accuracy_score(y_test_arr,rf_skl_pred),'\n')

#- Sklearn Bagging Classifier ---------------------------------------------------#
ROC AUC of test set : 0.828125
Accuracy of test set : 0.85 



## Insights: The manual bagging and random forest implementations give prediction quality almost identical to their sklearn counterparts

# END