#### Use VotingClassifier( ) of sklearn library to create an ensemble model using Logistic regression, Gaussian NB and Random Forest classifiers. For VotingClassifier( ) show the use of voting parameter set to ‘soft’ and ‘hard’. Also make use of weight parameter. Compare and contrast your results.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
X = data.drop(labels='Survived', axis=1)
y = data['Survived']

In [5]:
y.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

In [7]:
# Custom Estimator

class Custom_TrimDataset(BaseEstimator, TransformerMixin):
    """
    A custom class used to pre-process titanic dataset.

    Methods
    -------
    fit(X, y=None)
        Effectively nothing performed in fitting
    transform(X)
        Transforms the given dataframe in to dataframe where:
            - 'AgeGroup' feature(categorical) is derived out 'Age'(continous).
            - Combine 'SibSp' and 'Parch' into one feature -> 'kins'
            - 'Pclass' and 'kins' are converted to str(object) type.
            - Undesired features are dropped.
    """
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        """
        This method transforms the given dataframe.
            - 'AgeGroup' feauture(categorical) is derived out 'Age'(continous).
            - Combine 'SibSp' and 'Parch' into one feature -> 'kins'
            - 'Pclass' and 'kins' are converted to str(object) type.
            - Undesired features are dropped.

        Parameters
        ----------
        X : pandas.DataFrame
            The dataset on which column type change, trimming and from which 
            derived feature are to be made.

        Returns
        -------
        pandas.DataFrame
            a processed DataFrame.
        """
        Xdata = X.copy()
        Xdata.loc[:, 'Pclass'] = Xdata['Pclass'].astype(str)
        Xdata['AgeGroup'] = pd.cut(Xdata['Age'], [0, 5, 15, 60, 120],
                                   labels=['child', 'young', 'adult', 'elderly'])
        Xdata['kins'] = (Xdata['SibSp'] + Xdata['Parch']).astype(str)
        Xdata.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 
                    'Cabin', 'Age', 'SibSp', 'Parch'], axis=1, inplace=True)
        return Xdata

In [8]:
preproc_pipe = Pipeline(steps=[('trimmer', Custom_TrimDataset()),
                            ('i_median', SimpleImputer(strategy='most_frequent')),
                            ('targencoder', TargetEncoder())])

In [9]:
warnings.filterwarnings('ignore') # To Supress FutureWarning

X_train_proc = preproc_pipe.fit_transform(X_train, y_train)
X_test_proc = preproc_pipe.transform(X_test)

In [10]:
lr = LogisticRegression(random_state=42, n_jobs=-1)
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
gnb = GaussianNB()

In [11]:
estimators = [('lr', lr),
        ('rf', rf),
        ('gnb', gnb)]

votclf = VotingClassifier(estimators=estimators, n_jobs=-1)

In [12]:
params = {'rf__max_features': np.arange(3,5),
          'rf__n_estimators': np.arange(100, 500, 100),
          'gnb__var_smoothing': 10.0 ** -np.arange(13, 3, -2),
          'voting': ['hard', 'soft'],
          'weights': [[1,1,1],[1,1.5,1],[1,1.5,1.2]]}

gscv_clf = GridSearchCV(votclf, params, n_jobs=-1, scoring='accuracy')

In [13]:
gscv_clf.fit(X_train_proc, y_train)
print('Best Estimator :: {0}\n\nScore :: {1}'.format(gscv_clf.best_estimator_,
                                                   gscv_clf.best_score_))

Best Estimator :: VotingClassifier(estimators=[('lr',
                              LogisticRegression(n_jobs=-1, random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_features=3,
                                                     n_estimators=300,
                                                     n_jobs=-1,
                                                     random_state=42)),
                             ('gnb', GaussianNB(var_smoothing=1e-13))],
                 n_jobs=-1, weights=[1, 1, 1])

Score :: 0.81609376538954


In [14]:
clfs = [lr, rf, gnb, votclf]

In [15]:
for clf in clfs:
    print('-'*15)
    clf.fit(X_train_proc, y_train)
    y_pred = clf.predict(X_test_proc)
    print('Accuracy Score : {1:.2f} -- {0}'.format(type(clf).__name__,
                                             accuracy_score(y_test, y_pred)))
print('-'*15)

---------------
Accuracy Score : 0.64 -- LogisticRegression
---------------
Accuracy Score : 0.80 -- RandomForestClassifier
---------------
Accuracy Score : 0.77 -- GaussianNB
---------------
Accuracy Score : 0.79 -- VotingClassifier
---------------
