# Titanic using voting classifier

- Environment: google colab
- Last updated: 2021-11-05



In [46]:
#read libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
#read data

train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
submission = pd.read_csv('/content/gender_submission.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
# number of null values each column (training set)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# number of null values each column (test set)
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Cabin, Age columns have fairly large amount of null values. Also, fare and embarked columns have few null values too. We need to handle these null values.

In [10]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

We can fill two null values in embarked column with the most frequent value 'S'


In [11]:
train['Embarked'].fillna('S', inplace=True)

For cabin column, we need to drop the feature since it has about 75% of null values both in train and test set. We'll just fill null value with 'N' then drop the feature afterwards.

In [12]:
train['Cabin'].fillna('N', inplace=True)
test['Cabin'].fillna('N', inplace=True)

Instead of filling null value of fare column with test set's mean,  we'll look at which PClass the person was at, then get the mean of fare of that PClass. It would be not ideal to fill PClass 3 person (person who likely had less money to spend on fare than person who was in PClasss 1) with total mean and vice versa. 

In [13]:
#print out passenger who had null fare value
test[test['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,N,S


In [14]:
#group out mean fare value of test set by Pclass
test.groupby(['Pclass'])['Fare'].mean()


Pclass
1    94.280297
2    22.202104
3    12.459678
Name: Fare, dtype: float64

Compared to 35.62 (can be found from test.describe), which is total mean of test set, mean fare of Pclass of 3 is only 12.45. It is more ideal to fill null value with 12.45

In [15]:
test['Fare'].fillna('12.45', inplace=True)

Lastly, we need to fill in null value in age column. Filling age with one mean value also doesn't seem ideal (for example, filing 2 year old baby or 79 year old senior with total mean 30), we need to find a better way to fill age null value.

If you see the name column, you can see acronymn before Name (Dr, Mr, Mrs) this can divide people into few groups with various ages and sex. 

Source: https://www.kaggle.com/ash316/eda-to-prediction-dietanic

In [16]:
train['Initial']=0
test['Initial']=0
for i in train, test:
    train['Initial']=train.Name.str.extract('([A-Za-z]+)\.')
    test['Initial']=test.Name.str.extract('([A-Za-z]+)\.')

In [17]:
train['Initial'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Capt          1
Don           1
Ms            1
Mme           1
Lady          1
Sir           1
Countess      1
Jonkheer      1
Name: Initial, dtype: int64

In [18]:
test['Initial'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dona        1
Dr          1
Name: Initial, dtype: int64

In [19]:
train['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)
test['Initial'].replace(['Dona','Dr','Rev','Col','Ms'],['Mrs','Mr','Other','Other','Miss'],inplace=True)

In [20]:
train.groupby('Initial')['Age'].mean()

Initial
Master     4.574167
Miss      21.860000
Mr        32.739609
Mrs       35.981818
Other     45.888889
Name: Age, dtype: float64

In [21]:
test.groupby('Initial')['Age'].mean()

Initial
Master     7.406471
Miss      21.774844
Mr        32.114130
Mrs       38.904762
Other     42.750000
Name: Age, dtype: float64

In [22]:
train.loc[(train.Age.isnull())&(train.Initial=='Mr'),'Age']=33
train.loc[(train.Age.isnull())&(train.Initial=='Mrs'),'Age']=36
train.loc[(train.Age.isnull())&(train.Initial=='Master'),'Age']=5
train.loc[(train.Age.isnull())&(train.Initial=='Miss'),'Age']=22
train.loc[(train.Age.isnull())&(train.Initial=='Other'),'Age']=46

test.loc[(test.Age.isnull())&(test.Initial=='Mr'),'Age']=32
test.loc[(test.Age.isnull())&(test.Initial=='Mrs'),'Age']=39
test.loc[(test.Age.isnull())&(test.Initial=='Master'),'Age']=7
test.loc[(test.Age.isnull())&(test.Initial=='Miss'),'Age']=22
test.loc[(test.Age.isnull())&(test.Initial=='Other'),'Age']=43

In [23]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Initial        0
dtype: int64

In [24]:
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Initial        0
dtype: int64

Null values are all filled in

In [25]:
#new feature 'Fam' showing family size
train['Fam'] = train['SibSp'] + train['Parch']
test['Fam'] = test['SibSp'] + test['Parch']

In [26]:
#encode non-numeric features using LabelEncoder

def encode(df):
    features = ['Sex', 'Embarked']
    for f in features:
        le = LabelEncoder()
        le = le.fit(df[f])
        df[f] = le.transform(df[f])
        
    return df

train = encode(train)
test = encode(test)

In [27]:
#drop non-needed features

d = ['Name', 'PassengerId', 'Cabin', 'SibSp', 'Parch', 'Ticket', 'Initial']


In [28]:
train.drop(d,axis=1, inplace=True)
test.drop(d,axis=1, inplace=True)

In [29]:
y = train['Survived']
X = train.drop(['Survived'], axis=1)

X_train, X_val, y_train, y_val = tts(X, y, test_size=0.2, random_state=25)

In [30]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
rf_pred = rf_classifier.predict(X_val)
print('Random Forest Accuracy:', accuracy_score(rf_pred, y_val))

Random Forest Accuracy: 0.7821229050279329


In [31]:
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_pred = rf_classifier.predict(X_val)
print('Logistic Regression Accuracy:', accuracy_score(lr_pred, y_val))

Logistic Regression Accuracy: 0.7821229050279329


In [32]:
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
knn_pred = knn_classifier.predict(X_val)
print('K-Neareset Neighbors Accuracy:', accuracy_score(knn_pred, y_val))

K-Neareset Neighbors Accuracy: 0.7486033519553073


In [33]:
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_pred = dt_classifier.predict(X_val)
print('Decision Tree Accuracy:', accuracy_score(dt_pred, y_val))

Decision Tree Accuracy: 0.7597765363128491


In [34]:
svc_classifier = SVC(probability=True)
svc_classifier.fit(X_train, y_train)
svc_pred = svc_classifier.predict(X_val)
print('Support Vector Machine Accuracy:', accuracy_score(svc_pred, y_val))

Support Vector Machine Accuracy: 0.6871508379888268


In [35]:
#Hyperparameter tuning for each classifier (GridsearchCV)
params = {
    'max_depth' : [2, 4, 6, 8],
    'min_samples_split' : [2, 4, 6, 8],
    'n_estimators' : [20, 40, 60, 80, 100]
}

gsrf_classifier = GridSearchCV(rf_classifier,param_grid = params, cv=5, scoring="accuracy", verbose = 1)
gsrf_classifier.fit(X_train,y_train)

gsrf_best = gsrf_classifier.best_estimator_

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   40.6s finished


In [36]:
print('Random Forest gridsearchCV parameters, score',gsrf_classifier.best_params_, gsrf_classifier.best_score_)

Random Forest gridsearchCV parameters, score {'max_depth': 8, 'min_samples_split': 6, 'n_estimators': 100} 0.8343445287107258


In [37]:
params = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.00001, 0.001, 0.01, 0.1]
}

gslr_classifier = GridSearchCV(lr_classifier,param_grid = params, cv=5, scoring="accuracy", verbose = 1)
gslr_classifier.fit(X_train,y_train)

gslr_best = gslr_classifier.best_estimator_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, g

In [38]:
print('Logistic regression gridsearchCV best parameters, score',gslr_classifier.best_params_, gslr_classifier.best_score_)

Logistic regression gridsearchCV best parameters, score {'C': 0.1, 'penalty': 'l2'} 0.7977445090121147


In [39]:
params = {
    'n_neighbors' : [3, 5, 7, 9, 15],
    'weights' : ['uniform', 'distance'],
    'leaf_size': [5, 10, 20, 30, 40, 50]
}

gsknn_classifier = GridSearchCV(knn_classifier,param_grid = params, cv=5, scoring="accuracy", verbose = 1)
gsknn_classifier.fit(X_train,y_train)

gsknn_best = gsknn_classifier.best_estimator_

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.2s finished


In [40]:
print('K-nearest neighbrs gridsearchCV best parameters, score',gsknn_classifier.best_params_, gsknn_classifier.best_score_)

K-nearest neighbrs gridsearchCV best parameters, score {'leaf_size': 10, 'n_neighbors': 5, 'weights': 'uniform'} 0.7177385994287402


In [41]:
params = {
   'max_depth': [2, 4, 6, 8, 10],
   'min_samples_leaf': [3, 6, 9, 12, 15],
   'min_samples_split': [2, 4, 6, 8]
}

gsdt_classifier = GridSearchCV(dt_classifier,param_grid = params, cv=5, scoring="accuracy", verbose = 1)
gsdt_classifier.fit(X_train,y_train)

gsdt_best = gsdt_classifier.best_estimator_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.0s finished


In [42]:
print('Decision tree gridsearchCV best parameters, score',gsdt_classifier.best_params_, gsdt_classifier.best_score_)

Decision tree gridsearchCV best parameters, score {'max_depth': 8, 'min_samples_leaf': 6, 'min_samples_split': 8} 0.8314980793854033


In [43]:
params = {
   'C' : [0.001, 0.01, 0.1],
   'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}

gssvc_classifier = GridSearchCV(svc_classifier,param_grid = params, cv=4, scoring="accuracy", verbose = 1)
gssvc_classifier.fit(X_train,y_train)

gssvc_best = gssvc_classifier.best_estimator_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    2.9s finished


In [44]:
print('Support Vector Machine gridsearchCV best parameters, score',gssvc_classifier.best_params_, gssvc_classifier.best_score_)

Support Vector Machine gridsearchCV best parameters, score {'C': 0.1, 'kernel': 'linear'} 0.7851123595505618


In [47]:
#Voting

estimator = [('rf', gsrf_best), ('lr', gslr_best), ('svm', gssvc_best)]

vote = VotingClassifier(
    estimators=estimator, voting='soft'
)
vote.fit(X_train, y_train)
vote_pred = vote.predict(X_val)
print('Voting Classifier Valdiation Set Accuracy:', accuracy_score(y_val,vote_pred))

Voting Classifier Valdiation Set Accuracy: 0.8156424581005587


In [50]:
vote_pred = vote.predict(test)
submission['Survived'] = vote_pred
submission.to_csv('/content/submission_voting', index=False)

#0.78708