# Voting Methods

**The code is included here as an example for you to refer to for your future projects.**

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import warnings

# Silence every warning category for the rest of the notebook so the cell
# outputs stay compact. Note that this also hides any warnings raised by the
# estimators fitted below.
warnings.filterwarnings(action="ignore")

# Single seed passed to every seeded estimator and data split in this notebook.
RSEED = 42

### Reading the CSV and splitting the data

In [3]:
# Load the diabetes data. `header=None` tells pandas the file has no header
# row, so the columns are labelled with integers.
df = pd.read_csv(
    'data/pima-native-americans-diabetes.csv',
    header=None,
)
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [4]:
# Column 8 is the target; all remaining columns are used as features.
X = df.drop(columns=8)
y = df[8]

# Stratify on the target so both partitions keep the same class ratio, and
# seed the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RSEED,
)

In [5]:
# Show how many samples ended up in the train and test targets.
y_train.shape, y_test.shape

((576,), (192,))

In [6]:
# Class counts per partition, to check the effect of the stratified split.
y_train.value_counts(), y_test.value_counts()

(8
 0    375
 1    201
 Name: count, dtype: int64,
 8
 0    125
 1     67
 Name: count, dtype: int64)

## Max Voting 

In [10]:
from sklearn.ensemble import VotingClassifier

# Three different base learners; the two that accept a seed share RSEED.
model1 = LogisticRegression(random_state=RSEED)
model2 = KNeighborsClassifier()
model3 = DecisionTreeClassifier(random_state=RSEED)

# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base learners.
model = VotingClassifier(
    estimators=[
        ('lr', model1),
        ('knn', model2),
        ('dt', model3),
    ],
    voting='hard',
)

# `fit` returns the fitted ensemble, so the test accuracy can be chained.
model.fit(X_train, y_train).score(X_test, y_test)

0.75

In [17]:
# Fit the decision tree on its own and score it on the data it was trained on
# (training accuracy, not test accuracy).
model3.fit(X_train, y_train)
model3.score(X_train, y_train)

1.0

## Averaging

In [23]:
# Averaging (soft voting by hand): fit each base learner, average their
# predicted class probabilities and predict the most probable class.
model1 = LogisticRegression(random_state=RSEED)
model2 = KNeighborsClassifier()
model3 = DecisionTreeClassifier(random_state=RSEED)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# Each array has one row per test sample and one column per class.
pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)

avg_proba = (pred1 + pred2 + pred3) / 3

# Take the argmax of the raw averaged probabilities. Rounding them first would
# zero out every column whenever no class exceeds 0.5 (possible with more than
# two classes) and silently fall back to class 0. The column index is mapped
# back through `classes_` so the result holds real labels, not positions.
finalpred = model1.classes_[np.argmax(avg_proba, axis=1)]

# Test accuracy of the averaged ensemble.
accuracy_score(y_test, finalpred)

0.765625

## Weighted Average

In [26]:
# Weighted averaging: like plain averaging, but each model's probabilities are
# weighted by how well that model performs.
from sklearn.model_selection import cross_val_score

model1 = LogisticRegression(random_state=RSEED)
model2 = KNeighborsClassifier()
model3 = DecisionTreeClassifier(random_state=RSEED)

# Estimate each model's accuracy with cross-validation on the TRAINING data.
# Deriving the weights from test-set accuracy would leak test information into
# the ensemble and make the final test score optimistic.
acc1 = cross_val_score(model1, X_train, y_train, cv=5, scoring='accuracy').mean()
acc2 = cross_val_score(model2, X_train, y_train, cv=5, scoring='accuracy').mean()
acc3 = cross_val_score(model3, X_train, y_train, cv=5, scoring='accuracy').mean()

# Normalise the accuracies so the three weights sum to 1.
acc_sum = acc1 + acc2 + acc3

weight1 = acc1/acc_sum
weight2 = acc2/acc_sum
weight3 = acc3/acc_sum

# Fit on the full training set before predicting on the test set.
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)

weighted_proba = pred1*weight1 + pred2*weight2 + pred3*weight3

# Argmax of the raw weighted probabilities (no rounding, which would collapse
# to class 0 whenever no class exceeds 0.5), mapped back to class labels.
finalpred = model1.classes_[np.argmax(weighted_proba, axis=1)]

# Test accuracy of the weighted ensemble.
accuracy_score(y_test, finalpred)


0.7604166666666666


In [27]:
# Same accuracy as the cell above, this time computed by scikit-learn's helper.
print(accuracy_score(y_true=y_test, y_pred=finalpred))

0.7604166666666666


## Stacking 

In [51]:
# Implementation of Stacking in Scikit-Learn
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_curve, roc_auc_score

# Level-zero (base) learners whose predictions feed the final estimator.
estimators = [
    ('dt', DecisionTreeClassifier(random_state = RSEED)),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier(random_state = RSEED))
]

# Level-one (final) learner. It is seeded like every other estimator in this
# notebook because the 'liblinear' and 'saga' solvers searched below shuffle
# the data, so an unseeded model would make the search non-repeatable.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state = RSEED)
)

# Construct a pipeline with StackingClassifier
pipe = Pipeline([
    ('stacking_clf', stacking_clf)
])

# Define hyperparameters only for LogisticRegression()
# (parameter names follow <pipeline step>__<attribute>__<parameter>).
hyperparams = {
    'stacking_clf__final_estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'stacking_clf__final_estimator__penalty': ['l1', 'l2'],
    'stacking_clf__final_estimator__solver': ['liblinear', 'saga']
}

# Stand-alone stacking model; it is not part of the grid search below.
meta = StackingClassifier(
    estimators = estimators,
    final_estimator = LogisticRegression(random_state = RSEED)
)

# Create GridSearchCV object: 5-fold CV over every parameter combination,
# scored by accuracy and run on all available cores.
grid_model = GridSearchCV(pipe, param_grid=hyperparams, cv=5, scoring='accuracy', verbose=5, n_jobs=-1)

grid_model.fit(X_train, y_train)

# Pipeline refitted on the whole training set with the best parameters found.
best_model = grid_model.best_estimator_
print('Best params:', grid_model.best_params_)
print("--------"*10)


Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best params: {'stacking_clf__final_estimator__C': 1, 'stacking_clf__final_estimator__penalty': 'l1', 'stacking_clf__final_estimator__solver': 'liblinear'}
--------------------------------------------------------------------------------


In [52]:
# Score the tuned stacking pipeline on the held-out test set.
y_pred_test = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)

print(f"Accuracy: {test_accuracy}")
print(f"Recall: {test_recall}")
print(f"Precision: {test_precision}")


Accuracy: 0.7447916666666666
Recall: 0.5223880597014925
Precision: 0.6730769230769231


In [29]:
# Display the tuned stacking pipeline. The name `clf` previously used here is
# not defined anywhere in this notebook and raises a NameError on a fresh run.
best_model

In order to simplify the example above, the stacking model we have created has only two levels. The **DecisionTree, KNN and RandomForest** models are built at **level zero**, while a **LogisticRegression** model is built at **level one**.

 

## Stacking Classifier explanation

### Stacking without `StackingClassifier`

We take three base classifiers and one final estimator, and initialize an object for each of them separately.

In [30]:
# Level-one model: it is trained on the base models' predicted probabilities.
final_est = LogisticRegression()

# Level-zero models: the two that accept a seed share RSEED.
dt, knn, rf = (
    DecisionTreeClassifier(random_state=RSEED),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=RSEED),
)


For stacking classification we divide our training dataset into two parts:
1. With the first part, we train our base estimators.  
2. With the second part, we predict probabilities from the base estimators and train the final estimator on those probabilities.  

In [31]:
# Split the training data again: part 1 trains the base estimators, part 2 is
# held back to build the training set of the final estimator. No test_size is
# given, so scikit-learn's default proportions apply.
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=RSEED,
)

In [32]:
# Train every base estimator on the first part of the training data only.
# `fit` returns the estimator itself, so each *_model name refers to the same
# object as the matching dt / knn / rf.
dt_model, knn_model, rf_model = (
    base_est.fit(X_train_1, y_train_1) for base_est in (dt, knn, rf)
)

# Predict on the second, unseen part and keep column 1 of `predict_proba`,
# i.e. the probability of class 1.
dt_probab, knn_probab, rf_probab = (
    fitted.predict_proba(X_train_2)[:, 1]
    for fitted in (dt_model, knn_model, rf_model)
)

In [33]:
# Put the three probability vectors side by side: one column per base
# estimator, one row per sample. This frame is the training input of the final
# estimator. Each single-column frame carries the default column label 0.
lr_X = pd.concat(
    [pd.DataFrame(proba) for proba in (dt_probab, knn_probab, rf_probab)],
    axis=1,
)

In [34]:
# Inspect the meta-features: one column of class-1 probabilities per base estimator.
lr_X

Unnamed: 0,0,0.1,0.2
0,1.0,0.6,0.70
1,1.0,0.8,0.76
2,1.0,0.0,0.20
3,1.0,0.8,0.70
4,0.0,0.4,0.11
...,...,...,...
139,0.0,0.0,0.35
140,0.0,0.2,0.10
141,0.0,0.0,0.20
142,0.0,0.0,0.07


In [35]:
# Train the final estimator: the inputs are the base estimators' probabilities
# and the targets are the true labels of the second training part.
final_est.fit(X=lr_X, y=y_train_2)

For the test dataset, we follow the same steps as during training:
1. Predict probabilities from the base estimators on the test dataset.
2. Combine the probabilities from the base estimators to form a test dataset for the final estimator.
3. Predict with the final estimator on the test data (probabilities).

In [36]:
# Class-1 probabilities of every fitted base estimator on the test set.
dt_pred, knn_pred, rf_pred = (
    fitted.predict_proba(X_test)[:, 1]
    for fitted in (dt_model, knn_model, rf_model)
)

# Arrange them in the same column order used to train the final estimator
# (decision tree, KNN, random forest).
comb_pred = pd.concat(
    [pd.DataFrame(proba) for proba in (dt_pred, knn_pred, rf_pred)],
    axis=1,
)

# The final estimator turns the stacked probabilities into class predictions.
pred_final = final_est.predict(comb_pred)

In [37]:
from sklearn.metrics import f1_score, recall_score

# Test accuracy of the hand-built stacking ensemble.
print(accuracy_score(y_true=y_test, y_pred=pred_final))


0.765625
