# Objective

Examine how voting classifiers work using artificial data

# Setup

In [None]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Voting classifiers

In [None]:
# Generate a synthetic two-moons dataset and split it into train / test sets
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Dataset knobs gathered in one place: the exercises further down in this
# notebook ask the reader to vary the sample size and the noise level.
N_SAMPLES = 50000
NOISE = 10  # forwarded unchanged to make_moons(noise=...)
RANDOM_STATE = 42  # shared by data generation and the train/test split
# NOTE(review): the exercise text later in this notebook quotes
# n_samples=1000, noise=0.50 for the make_moons line -- confirm which values
# are intended. The accuracies recorded in the outputs below are only
# slightly above 0.5 with the values used here.

# generate artificial data
X, y = make_moons(n_samples=N_SAMPLES, noise=NOISE, random_state=RANDOM_STATE)

# No test_size is passed, so train_test_split uses its default split proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Is our data balanced?

In [None]:
import pandas as pd

# Tabulate how many samples carry each class label
label_counts = pd.crosstab(index=y, columns="count")
label_counts

col_0,count
row_0,Unnamed: 1_level_1
0,25000
1,25000


**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebook keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. In your own code, you can simply rely on the latest default values instead.

# Let's define 3 models and perform Ensemble Learning - Voting

In [None]:
# Ensemble wrapper
from sklearn.ensemble import VotingClassifier

# Base learners
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three base classifiers built with the library's default hyperparameters:
# no solver / n_estimators / gamma / random_state is passed in this cell.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Combine the three models into one ensemble configured with voting='hard'
# (majority vote over predicted class labels, per scikit-learn's documentation).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)

In [None]:
# Train the ensemble on the training split; the fitted estimator is the cell's output
voting_clf.fit(X=X_train, y=y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [None]:
from sklearn.metrics import accuracy_score

# Fit each model in turn (the three base learners, then the ensemble) and
# print its accuracy on the training split and on the held-out test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    model_name = clf.__class__.__name__
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    test_accuracy = accuracy_score(y_test, y_pred)

    print("train:",model_name, train_accuracy)
    print("test:",model_name, test_accuracy)
    print("==================================================================")


train: LogisticRegression 0.5241866666666667
test: LogisticRegression 0.52736
train: RandomForestClassifier 1.0
test: RandomForestClassifier 0.50392
train: SVC 0.5256
test: SVC 0.52768
train: VotingClassifier 0.5563466666666667
test: VotingClassifier 0.52632


## Before moving forward, experiment with the sample size. How does increasing or decreasing the training-set size affect the score? How does Voting compare to the individual models?

##### Go to the line below (in the data-generation cell):
`X, y = make_moons(n_samples=1000, noise=0.50, random_state=42)`

##### and use n_samples=50, 200, 300, 500, 1000, 5000, 10000


### What if you keep the sample size constant and vary the noise level instead?

Discuss your findings with each other.



---

What if you change 


```
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

```
to 
```
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42,max_depth=3)
```



In [None]:
# Rebuild the three base models, this time with explicit hyperparameters and
# a fixed random_state on each of them.
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
# probability=True is enabled on the SVC; per scikit-learn's documentation,
# soft voting averages predicted class probabilities.
svm_clf = SVC(gamma="auto", probability=True, random_state=42)

# Soft-voting ensemble over the three models, fitted on the training split
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,...
                                        

In [None]:
from sklearn.metrics import accuracy_score

# Refit every model and print its accuracy on the held-out test split
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(clf.__class__.__name__, test_accuracy)

LogisticRegression 0.52736
RandomForestClassifier 0.50456
SVC 0.51496
VotingClassifier 0.51328


# Question: What is the impact of Voting?