#  Stacking (Stacked Generalization)

<img src="images/stacking.png" width="700">

In [1]:
#  loading the dataset
import pandas as pd
data = pd.read_csv("files/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Getting describe of data
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [3]:
# Separating features and labels
X = data.drop(columns = 'Outcome')
y = data['Outcome']

# Splitting train and Test
from sklearn.model_selection import train_test_split
train,val_train,test,val_test = train_test_split(X,y,test_size=0.5, random_state= 355)

In [4]:
# let's split the training set again into training and test dataset 
x_train,x_test,y_train,y_test =  train_test_split(train,test,test_size=0.2, random_state= 355)

####  We will use KNN and SVM algorithm as our base models.

####  Let's fit both of the models first on the x_train and y_train data.

In [5]:
#  fitting KNN classifier over data
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)

# getting knn score
print ('Knn score -> ', knn.score(x_test,y_test))

Knn score ->  0.7402597402597403


In [6]:
#  fitting random forest classifier
from sklearn.ensemble import RandomForestClassifier
rand_clf = RandomForestClassifier()
rand_clf.fit(x_train,y_train)

# getting rf score
print ('rf score -> ', rand_clf.score(x_test,y_test))

rf score ->  0.8181818181818182


In [7]:
#  fitting SVM Classifier
from sklearn.svm import SVC
from sklearn import tree
svm = SVC()
svm.fit(x_train,y_train)


# getting rf score
print ('svm score -> ', svm.score(x_test,y_test))

svm score ->  0.7402597402597403


In [8]:
predict_val1 = knn.predict(val_train)
predict_val2 = svm.predict(val_train)
predict_val3 = rand_clf.predict(val_train)

###  Lets stack the prediciton values for validation set together as  " predict_val "

In [12]:
import numpy as np

# Stacking all predictions together
predict_val = np.column_stack((predict_val1, predict_val2, predict_val3))
predict_val

array([[0, 0, 0],
       [0, 0, 0],
       [1, 1, 0],
       ...,
       [1, 0, 1],
       [0, 0, 0],
       [1, 0, 0]], dtype=int64)

###  Lets get the prediction for all the base models on the test set    =>    x_test

In [13]:
predict_test1 = knn.predict(x_test)
predict_test2 = svm.predict(x_test)
predict_test3 = rand_clf.predict(x_test)

In [14]:
predict_test = np.column_stack((predict_test1,predict_test2, predict_test3))
predict_test

array([[1, 0, 1],
       [0, 0, 0],
       [1, 1, 0],
       [1, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [1, 1, 1],
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 0, 1],
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0,

In [16]:
svm = SVC()
svm.fit(predict_val,val_test)
svm.score(predict_test,y_test)

0.7922077922077922

In [17]:
rand_clf = RandomForestClassifier()

rand_clf.fit(predict_val,val_test)

rand_clf.score(predict_test,y_test)

0.7922077922077922

In [20]:
# we are tuning three hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    "n_estimators" : [90,100,115],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf' : [1,2,3,4,5],
    'min_samples_split': [4,5,6,7,8],
    'max_features' : ['auto','log2']
}

# Importing gridsearchCV
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=rand_clf,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [21]:
#  Tune the model
grid_search.fit(predict_val,val_test)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  3.4min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [4, 5, 6, 7, 8],
                         'n_estimators': [90, 100, 115]},
             verbose=3)

In [22]:
#  Getting the Best parameters
grid_search.best_params_

{'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 100}

In [23]:
#  Training again the model with best parameters
rand_clf = RandomForestClassifier( criterion='gini',max_features = 'auto',min_samples_leaf =1,min_samples_split= 4,
                                  n_estimators =90)

In [24]:
rand_clf.fit(predict_val,val_test)

RandomForestClassifier(min_samples_split=4, n_estimators=90)

In [25]:
rand_clf.score(predict_test,y_test)

0.8051948051948052