## Developing a Voting Model

In [320]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

train = pd.read_csv("features/train_features.csv")
test = pd.read_csv("features/test_features.csv")

In [321]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Company,Fare,Cabin,Embarked,Title
0,1,0,3,male,Young Adult,Family,2.110213,M,S,Mr
1,2,1,1,female,Adult,Family,4.280593,Other,C,Mrs
2,3,1,3,female,Young Adult,Alone,2.188856,M,S,Miss
3,4,1,1,female,Adult,Family,3.990834,Other,S,Mrs
4,5,0,3,male,Adult,Alone,2.202765,M,S,Mr


In [322]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Company,Fare,Cabin,Embarked,Title
0,892,3,male,Adult,Alone,2.178064,M,Q,Mr
1,893,3,female,Adult,Family,2.079442,M,S,Mrs
2,894,2,male,Adult,Alone,2.369075,M,Q,Mr
3,895,3,male,Young Adult,Alone,2.268252,M,S,Mr
4,896,3,female,Young Adult,Family,2.586824,M,S,Mrs


In [323]:
def createDummies(dataframe):
    return pd.get_dummies(
        dataframe, 
        drop_first = True, 
        columns = ['Sex', 'Age', 'Title', 'Company', 'Cabin', 'Embarked']
    )

y_train = list(train['Survived'])
X_train = createDummies(train).drop(['Survived', 'PassengerId'], axis = 1)
X_train_scaled = pd.DataFrame(StandardScaler().fit_transform(X_train.values), columns = X_train.columns)

IDs = test[['PassengerId']]
X_test = createDummies(test).drop(['PassengerId'], axis = 1)
X_test_scaled = pd.DataFrame(StandardScaler().fit_transform(X_test.values), columns = X_test.columns)

## Tuning Random Forest

In [324]:
n_estimators = [250, 500, 750, 1000]
max_features = ['log2', 'sqrt', None]
max_depth = [30, 60, 90, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

rf_param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf }

rf = RandomForestClassifier()
rf_grid = GridSearchCV(estimator = rf, param_grid = rf_param_grid, cv = 5, n_jobs = -1)

# Fit the random search model
rf_grid.fit(inputs, target)

# Print best parameter after tuning
print(rf_grid.best_params_)
print(rf_grid.best_score_)

{'max_depth': 90, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 250}
0.8316552633230808


## Tuning SVC

In [325]:
svc_param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']
             } 

svc = SVC()
svc_grid = GridSearchCV(svc, svc_param_grid, refit = True, verbose = 3)
  
# Fit the random search model
svc_grid.fit(inputs, target)

# Print best parameter after tuning
print(svc_grid.best_params_)
print(svc_grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.810 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.809 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.820 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.764 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.803 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.816 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.809 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.753 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.676 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

## *Using Default XGBoost

## Voting System

In [326]:
rf_best = RandomForestClassifier(
    n_estimators = 1000,
    min_samples_split = 2,
    min_samples_leaf = 4,
    max_features =  None,
    max_depth =  80
)

svc_best = SVC(
    C = 100,
    gamma = 0.01,
    kernel = 'rbf'
)

xgboost_best = GradientBoostingClassifier()

vc = VotingClassifier([
        ('rf', rf_best),
        ('svc', svc_best), 
        ('xgb', xgboost_best)
    ],
    voting = 'hard', 
    n_jobs = -1)

vc.fit(inputs, target)
predictions = pd.DataFrame(vc.predict(X_test), columns = ['Survived'])
results = pd.concat([IDs, predictions], axis = 1, join = 'inner')

results.to_csv('results.csv', index = False)