# Stacking

### Importing Libraries

In [0]:
#importing necessary libraries
import numpy as np
import pandas as pd

### Importing Dataset

In [0]:
# Load the dataset from a CSV file located in the notebook's working directory
data = pd.read_csv('data_cleaned.csv')

# Preview the first few rows
data.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [0]:
# Dimensions of the data as (rows, columns)
data.shape

(891, 25)

In [0]:
# Number of missing values in each column
data.isnull().sum()

Survived      0
Age           0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Sex_female    0
Sex_male      0
SibSp_0       0
SibSp_1       0
SibSp_2       0
SibSp_3       0
SibSp_4       0
SibSp_5       0
SibSp_8       0
Parch_0       0
Parch_1       0
Parch_2       0
Parch_3       0
Parch_4       0
Parch_5       0
Parch_6       0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

### Separating Dependent and Independent Variables

In [0]:
# Independent variables (features): every column except the target
x = data.drop(["Survived"], axis = 1)

# Dependent variable (target): the 'Survived' column
y = data['Survived']

# Sanity check: x and y should have the same number of rows
x.shape, y.shape

((891, 24), (891,))

### Making test and training set

In [0]:
from sklearn.model_selection import train_test_split as tts

# random_state is fixed so the split is repeatable; stratify=y requests a split
# stratified on the target. No test_size is given, so the library default applies.
train_x, test_x, train_y, test_y = tts (x, y, random_state = 9 , stratify = y)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((668, 24), (223, 24), (668,), (223,))

## Base models 

In [0]:
#importing predictive models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

<img src="stacking image.png" alt="Drawing" style="width: 400px;"/>

### Model Training and Predictions 

In [0]:
def model_predictions(model, train_x, train_y, test_x):
    """Fit ``model`` on the training data and return its predictions.

    The accuracy-style score on the training data is printed as a side effect.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``score`` and ``predict``
    train_x, train_y : training features and labels used for fitting
    test_x : features to predict after fitting

    Returns
    -------
    tuple
        ``(pred_train, pred_test)``. ``pred_train`` is in-sample: the model
        predicts the very rows it was fitted on.
    """
    model.fit(train_x, train_y)

    # Report how well the fitted model reproduces its own training labels
    print('Training Score:', model.score(train_x, train_y))

    # Train predictions first, then test predictions (same order as returned)
    return model.predict(train_x), model.predict(test_x)

In [0]:
# Model 1 - Logistic Regression
# M1_train holds predictions on the same rows the model was fitted on (in-sample);
# M1_test holds predictions on the held-out test rows.
LR=LogisticRegression()
M1_train, M1_test = model_predictions(LR, train_x, train_y, test_x)

Training Score: 0.808383233533


In [0]:
# Model 2 - Decision Tree (default hyperparameters)
# M2_train holds in-sample predictions on the training rows;
# M2_test holds predictions on the held-out test rows.
DT=DecisionTreeClassifier()
M2_train, M2_test = model_predictions(DT, train_x, train_y, test_x)

Training Score: 0.986526946108


In [0]:
# Feature scaling, done before the kNN model in the next cell
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to both sets
scaler = StandardScaler()
scaler.fit(X=train_x)

# NOTE: train_x / test_x are overwritten with their scaled versions here, so
# cells below see scaled data until these names are reassigned.
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [0]:
# Model 3 - k Nearest Neighbours, trained on the scaled features
# M3_train holds in-sample predictions; M3_test holds test-set predictions.
knn=KNeighborsClassifier()
M3_train, M3_test = model_predictions(knn, train_x, train_y, test_x)

Training Score: 0.847305389222


## Stacking Model

<img src="stacking image 2.png" alt="Drawing" style="width: 500px;"/>

In [0]:
# Build the stacker's training features: one column per base model,
# holding that model's predictions on the training rows
train_prediction = {
              'LR': M1_train,
              'DT': M2_train,
              'knn': M3_train
              }
train_predictions = pd.DataFrame(train_prediction)
train_predictions.head()

Unnamed: 0,LR,DT,knn
0,1,1,0
1,1,1,1
2,1,1,0
3,1,0,1
4,0,0,0


In [0]:
# Build the stacker's test features: one column per base model,
# holding that model's predictions on the test rows
test_prediction = {
              'LR': M1_test,
              'DT': M2_test,
              'knn': M3_test
              }
test_predictions = pd.DataFrame(test_prediction)
test_predictions.head()

Unnamed: 0,LR,DT,knn
0,0,0,0
1,1,1,1
2,0,0,0
3,0,1,0
4,0,0,0


In [0]:
# Stacker (meta) model: a kNN classifier fitted on the base models' training
# predictions and scored on their test predictions
model = KNeighborsClassifier()
model.fit(train_predictions, train_y)
model.score(test_predictions,test_y)

0.73542600896860988

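## Variants of Stacking

In the basic approach above, each base model predicts on the same rows it was trained on, so the training features given to the stacker are in-sample predictions. The variant below instead builds those features from K-fold (out-of-fold) predictions: every training row is predicted by a model that did not see that row during fitting.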

In [0]:
from sklearn.model_selection import train_test_split as tts

# Re-create the split from the original x and y: train_x / test_x were overwritten
# with scaled values earlier in the notebook. The random_state and stratify
# arguments are the same as in the first split.
train_x, test_x, train_y, test_y = tts (x, y, random_state = 9 , stratify = y)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((668, 24), (223, 24), (668,), (223,))

### Function that gives kfold predictions

<img src="stacking image 3.png" alt="Drawing" style="width: 300px;"/>

In [0]:
from sklearn.model_selection import KFold

def kfold_predictions(model, n_splits, train_x, train_y, test_x):
    """Generate stacking features for one base model using K-fold splitting.

    Each training row is predicted by a copy of the fit that did not include
    that row (out-of-fold prediction). The model is then refitted on the full
    training data to predict the test set.

    Parameters
    ----------
    model : estimator exposing ``fit(X=..., y=...)`` and ``predict``
        Refitted repeatedly; on return it is fitted on all of ``train_x``.
    n_splits : int
        Number of folds passed to ``KFold``.
    train_x, train_y : pandas objects or array-likes
        Training features and labels; rows are selected by position.
    test_x : array-like
        Features to predict after the final refit.

    Returns
    -------
    tuple
        ``(train_pred, test_pred)``. ``train_pred`` has one entry per training
        row, aligned with the row order of ``train_x``; ``test_pred`` is
        whatever ``model.predict(test_x)`` returns.
    """
    def _rows(data, idx):
        # Positional row selection that works for pandas objects and plain arrays
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    # No random_state: the folds are not shuffled, so a seed has no effect, and
    # recent scikit-learn raises ValueError when random_state is set with shuffle=False.
    kfold = KFold(n_splits=n_splits)

    holdout_positions = []
    holdout_preds = []

    # For every train/hold-out split produced by KFold
    for fit_idx, holdout_idx in kfold.split(train_x, train_y):
        # Fit on the rows outside the held-out fold
        model.fit(X=_rows(train_x, fit_idx), y=_rows(train_y, fit_idx))

        # Predict the held-out fold and remember which rows it covers
        holdout_preds.append(np.asarray(model.predict(_rows(train_x, holdout_idx))))
        holdout_positions.append(holdout_idx)

    # Write every prediction to the position of the row it belongs to, so the
    # result lines up with train_y regardless of the order folds were visited in.
    stacked = np.concatenate(holdout_preds)
    train_pred = np.empty_like(stacked)
    train_pred[np.concatenate(holdout_positions)] = stacked

    # Train on the complete training data and generate predictions on the test set
    model.fit(train_x, train_y)
    test_pred = model.predict(test_x)

    return train_pred, test_pred

In [0]:
# Base Logistic Regression model
from sklearn.linear_model import LogisticRegression

# 10-fold predictions for the training rows; test predictions come from the
# model refitted on all training rows inside kfold_predictions
LR = LogisticRegression()
M1_train, M1_test = kfold_predictions( LR, 10, train_x, train_y, test_x) 

In [0]:
# Base Decision Tree model: 10-fold predictions for the training rows plus
# test predictions from the model refitted on all training rows
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
M2_train, M2_test = kfold_predictions(DT, 10, train_x, train_y, test_x) 

In [0]:
from sklearn.preprocessing import StandardScaler

# Scale the features before the kNN base model: fit on the training features
# only, then transform both sets.
scaler = StandardScaler()
scaler.fit(X=train_x)

# NOTE: train_x / test_x are overwritten with their scaled versions here
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [0]:
# Wrap the scaled data in DataFrames again: kfold_predictions selects rows with .iloc
train_x = pd.DataFrame(train_x)
test_x = pd.DataFrame(test_x)

In [0]:
# Base K-Neighbors model, run on the scaled features: 10-fold predictions for
# the training rows plus test predictions from the model refitted on all training rows
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
M3_train, M3_test= kfold_predictions( knn, 10, train_x, train_y, test_x) 

In [0]:
# Arrange the K-fold training predictions into a frame:
# one column per base model, one row per training sample
train_prediction = {
              'LR': M1_train,
              'KNN': M3_train,
              'DT': M2_train
              }
train_predictions = pd.DataFrame(train_prediction)
train_predictions.head()

Unnamed: 0,LR,KNN,DT
0,1,0,1
1,1,1,1
2,1,0,1
3,1,1,1
4,0,0,0


In [0]:
# Arrange the base models' test predictions into a frame with the same
# columns as the training frame
test_prediction = {
              'LR': M1_test,
              'KNN': M3_test,
              'DT': M2_test
              }
test_predictions = pd.DataFrame(test_prediction)
test_predictions.head()

Unnamed: 0,LR,KNN,DT
0,0,0,0
1,1,1,1
2,0,0,0
3,0,0,1
4,0,0,0


<img src="stacking image 4.png" alt="Drawing" style="width: 500px;"/>

In [0]:
# Final (meta) model: a kNN classifier fitted on the K-fold training predictions,
# then used to predict and score on the base models' test predictions
model= KNeighborsClassifier()
model.fit(train_predictions, train_y)
final = model.predict(test_predictions)
model.score( test_predictions ,test_y)

0.7847533632286996