In [1]:
import pandas as pd
import numpy as np

#### We are using our Spam/Ham dataset. This time we fit several different models and combine their predictions with a VotingClassifier.

In [2]:
# Read the spambase CSV directly from its raw GitHub URL into a DataFrame.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/spambase.csv"
SpamData = pd.read_csv(filepath_or_buffer=url)
# Not the last expression of the cell, so this preview is evaluated but not displayed.
SpamData.head()
# Number of rows in the loaded frame; this is the value the cell displays.
SpamData.shape[0]

4601

In [3]:
# Names of every column in the frame (the target column included).
ListOfAllVariables = SpamData.columns.values
# Target: the 'is_spam' column.
y = SpamData['is_spam']
# Features: a copy of all columns with the target column removed.
X = SpamData[ListOfAllVariables].drop('is_spam', axis=1)

In [4]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.
# sklearn.model_selection exposes the same cross_val_score, so it is aliased under
# the old name; the original import is kept as a fallback for older releases.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier  #VotingClassifier is part of sklearn.ensemble

In [5]:
# Base learners: each one is scored on its own and also used as a member of the
# voting ensemble built from clf1..clf5.
clf1 = LogisticRegression()
# The forest is the only stochastic learner here; a fixed seed makes its
# cross-validation scores (and the ensemble's) repeatable across runs.
clf2 = RandomForestClassifier(max_depth = 5, n_estimators = 1000, random_state = 42)
clf3 = BernoulliNB()
clf4 = MultinomialNB()
clf5 = GaussianNB()

In [6]:
# Ensemble of the five base classifiers, combined with 'hard' voting.
eclf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('bnb', clf3),
        ('mnb', clf4),
        ('gnb', clf5),
    ],
)

#### When voting is set to 'hard', the outcome is simply the majority vote, i.e., if 3 classifiers predict class 1 and 2 classifiers predict class 2, the majority vote is class 1.

In [7]:
# Print the mean 5-fold cross-validated accuracy of each base classifier,
# followed by that of the voting ensemble (same order as the tuple below).
for clf in (clf1, clf2, clf3, clf4, clf5, eclf):
    scores = cross_validation.cross_val_score(
        estimator=clf,
        X=X,
        y=y,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    print(scores.mean())


0.912391909726
0.918255574499
0.880428497671
0.781969260361
0.825647242022
0.924554967574


In [8]:
# Rebuild the ensemble over the same five classifiers, this time with 'soft' voting.
eclf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('bnb', clf3),
        ('mnb', clf4),
        ('gnb', clf5),
    ],
)

# Print the mean 5-fold cross-validated accuracy of each base classifier,
# followed by that of the soft-voting ensemble.
for clf in (clf1, clf2, clf3, clf4, clf5, eclf):
    scores = cross_validation.cross_val_score(
        estimator=clf,
        X=X,
        y=y,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    print(scores.mean())

0.912391909726
0.918255574499
0.880428497671
0.781969260361
0.825647242022
0.916506517612


#### When voting is set to 'soft', the predicted class is the one with the highest average predicted probability across the classifiers. For instance, imagine classifiers 1 and 2 each predict 49% for class 1 and 51% for class 2, and classifier 3 predicts 90% for class 1 and 10% for class 2. With 'soft' voting the prediction is class 1, because its average probability is (0.49 + 0.49 + 0.90) / 3 ≈ 0.63. With 'hard' voting the prediction would be class 2, because two of the three classifiers vote for it.

# Using the VotingClassifier with GridSearch

In [9]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.grid_search module was deprecated then and removed in 0.20. The fallback
# keeps this cell working on older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV

# Fresh estimators for the tuned ensemble (these rebind clf1..clf3 and eclf).
clf1 = LogisticRegression()
# Fixed seed so the search returns the same result on every run.
clf2 = RandomForestClassifier(random_state = 42)
clf3 = BernoulliNB()
eclf = VotingClassifier(estimators=[('lr', clf1), 
                                    ('rf', clf2), 
                                    ('bnb', clf3)],
                                    voting = 'hard')

# Keys are '<estimator name>__<parameter>', where the name is the label given
# to that estimator in the VotingClassifier above.
params = {'lr__C': [0.1, 1, 10],
          'rf__n_estimators': [1000],
          'rf__max_depth': [2, 5, 10],
          'bnb__alpha': [0.1, 0.5, 1]}

# Exhaustive search over the grid with 5-fold cross-validation on all cores.
grid = GridSearchCV(estimator = eclf, param_grid = params, cv = 5, n_jobs = -1)
gridfit = grid.fit(X, y)

In [10]:
# Print the hyper-parameter combination selected by the fitted grid search.
print(gridfit.best_params_)

{'bnb__alpha': 0.1, 'rf__max_depth': 10, 'rf__n_estimators': 1000, 'lr__C': 10}


In [11]:
# Print the score the grid search reports for its best parameter combination.
print(gridfit.best_score_)

0.924364268637
