In [20]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing #to transform the feature labels
from sklearn.feature_extraction import DictVectorizer #to turn categorial variables into numeric arrays
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_set = pd.read_csv("Lab5_train.csv")
test_set = pd.read_csv("Lab5_test.csv")
train_set.head()

Unnamed: 0,PassengerID,Survived,Pclass,Name,Sex,Age,SibSp,ParCh,Ticket,Fare,Cabin,Embarked
0,1302,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
1,369,0,2,"Chapman, Mr. John Henry",male,37.0,1,0,SC/AH 29037,26.0,,S
2,1128,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
3,491,0,2,"Mack, Mrs. (Mary)",female,57.0,0,0,S.O./P.P. 3,10.5,E77,S
4,1087,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S


Checking majority in train set

In [3]:
len(train_set[train_set["Survived"] == 1])/len(train_set) * 100

38.68852459016394

In [4]:
len(train_set[train_set["Survived"] == 0])/len(train_set) * 100

61.31147540983607

So, approx 61% didnt survive and 39% survived.

#### Feature Extraction
* Using Pclass, Sex, Age, SubSp, parCh to train model
* Adding additional features Fare, Embarked and Ticket reduced the accuracy
* Finally, combining SibSp and ParCh to FamilySize increased the accuracy somewhat

In [5]:
train_set['FamilySize'] = train_set['SibSp'] + train_set['ParCh']
test_set['FamilySize'] = test_set['SibSp'] + test_set['ParCh']

In [6]:
target = train_set.Survived

Encoding categorical features

In [7]:
vec = DictVectorizer()
train_sex_non_class = vec.fit_transform(train_set[['Sex']].to_dict('records')).toarray()
test_sex_non_class = vec.fit_transform(test_set[['Sex']].to_dict('records')).toarray()

Replace blank values of age

In [8]:
train_set['Age'].fillna(train_set['Age'].mean(), inplace=True)
test_set['Age'].fillna(test_set['Age'].mean(), inplace=True)

Scale feature values

In [9]:
min_max_scaler = preprocessing.MinMaxScaler()
train_set_age_scaled = min_max_scaler.fit_transform(train_set.Age)
test_set_age_scaled = min_max_scaler.fit_transform(test_set.Age)
min_max_scaler2 = preprocessing.MinMaxScaler()
train_set_pclass_scaled = min_max_scaler2.fit_transform(train_set.Pclass)
test_set_pclass_scaled = min_max_scaler2.fit_transform(test_set.Pclass)
min_max_scaler3 = preprocessing.MinMaxScaler()
train_set_familysize_scaled = min_max_scaler3.fit_transform(train_set.FamilySize)
test_set_familysize_scaled = min_max_scaler3.fit_transform(test_set.FamilySize)



In [11]:
train_set_features = np.concatenate((train_sex_non_class, np.reshape(train_set_age_scaled,(915,1)), np.reshape(train_set_pclass_scaled,(915,1)), np.reshape(train_set_familysize_scaled,(915,1))), axis=1)
test_set_features = np.concatenate((test_sex_non_class, np.reshape(test_set_age_scaled,(394,1)), np.reshape(test_set_pclass_scaled,(394,1)), np.reshape(test_set_familysize_scaled,(394,1))), axis=1)
train_set_features                                  

array([[ 0.        ,  1.        ,  0.56694579,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.46025127,  0.5       ,  0.1       ],
       [ 0.        ,  1.        ,  0.23430994,  1.        ,  0.        ],
       ..., 
       [ 0.        ,  1.        ,  0.19665306,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.02092091,  0.5       ,  0.2       ],
       [ 0.        ,  1.        ,  0.00523054,  1.        ,  0.2       ]])

#### Train machine learning classifier

#### Hyperparameter search for SVM

In [12]:
# Use Grid Search and cross validation to find the best hyperparameters (coarse grained search)
tuned_parameters = {'kernel': ['rbf'], 'C': [2**i for i in range(-5,16)], 'gamma': [2**i for i in range(-15,4)]}
clf1 = svm.SVC()
clf_coarse = GridSearchCV(clf1, tuned_parameters, cv=5)
clf_coarse.fit(train_set_features, target)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf'], 'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], 'gamma': [3.0517578125e-05, 6.103515625e-05, 0.0001220703125, 0.000244140625, 0.00048828125, 0.0009765625, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
best_c = math.log(clf_coarse.best_params_['C'], 2)
best_gamma = math.log(clf_coarse.best_params_['gamma'], 2)
print(best_c, best_gamma, clf_coarse.cv_results_['mean_test_score'].max())

14.0 -3.0 0.812021857923


In [14]:
# Use Grid Search and cross validation to find the best hyperparameters (fine grained search)
tuned_parameters = {'kernel': ['rbf'], 'C': [2**i for i in np.arange(best_c-2, best_c+2, 0.25)], 'gamma': [2**i for i in np.arange(best_gamma-2, best_gamma+2, 0.25)]}
clf1 = svm.SVC()
clf_fine = GridSearchCV(clf1, tuned_parameters, cv=5)
clf_fine.fit(train_set_features, target)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf'], 'C': [4096.0, 4870.9923430511453, 5792.6187514801977, 6888.6234337584292, 8192.0, 9741.9846861022907, 11585.237502960395, 13777.246867516858, 16384.0, 19483.969372204581, 23170.475005920791, 27554.493735033717, 32768.0, 38967.938744409163, 46340.950011841582, 55108.987...29663689, 0.21022410381342863, 0.25, 0.29730177875068026, 0.35355339059327379, 0.42044820762685725]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
best_c = math.log(clf_fine.best_params_['C'], 2)
best_gamma = math.log(clf_fine.best_params_['gamma'], 2)
print(best_c, best_gamma, clf_fine.cv_results_['mean_test_score'].max())

13.25 -2.75 0.813114754098


#### Hyperparameter search for NN

In [16]:
# Use Grid Search and cross validation to find the best hyperparameters
tuned_parameters = {'activation': ['logistic', 'tanh', 'relu'], 'solver': ['lbfgs','sgd', 'adam'], 'learning_rate' : ['constant', 'invscaling', 'adaptive'], 'hidden_layer_sizes' : [val for val in zip(range(5,100,5),)]}
clf2 = MLPClassifier()
clf_coarse1 = GridSearchCV(clf2, tuned_parameters, cv=5)
clf_coarse1.fit(train_set_features, target)



GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'hidden_layer_sizes': [(5,), (10,), (15,), (20,), (25,), (30,), (35,), (40,), (45,), (50,), (55,), (60,), (65,), (70,), (75,), (80,), (85,), (90,), (95,)], 'solver': ['lbfgs', 'sgd', 'adam'], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'activation': ['logistic', 'tanh', 'relu']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
best_activation = clf_coarse1.best_params_['activation']
best_solver = clf_coarse1.best_params_['solver']
best_learning_rate = clf_coarse1.best_params_['learning_rate']
best_hidden_layer_size = clf_coarse1.best_params_['hidden_layer_sizes']
print(best_activation, best_solver, best_learning_rate, best_hidden_layer_size, clf_coarse1.cv_results_['mean_test_score'].max())

relu lbfgs adaptive (25,) 0.817486338798


#### Random Forest feature importance

In [18]:
clf3 = RandomForestClassifier(criterion='gini', n_estimators=700, min_samples_split=10, min_samples_leaf=1, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
clf3.fit(train_set_features, target)
clf3.feature_importances_

array([ 0.23269874,  0.22538529,  0.28994907,  0.16175049,  0.09021642])

#### Ensemble methods

#### Voting classfier

In [53]:
clf1 = svm.SVC(kernel='rbf', C=2**best_c, gamma=2**best_gamma)
clf2 = MLPClassifier(activation=best_activation, solver=best_solver, learning_rate=best_learning_rate, hidden_layer_sizes=best_hidden_layer_size)
clf3 = RandomForestClassifier(criterion='gini', n_estimators=700, min_samples_split=10, min_samples_leaf=1, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
eclf = VotingClassifier(estimators=[('svc', clf1), ('mlp', clf2), ('dc', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['SVM', 'Neural Network', 'Random Forest', 'Ensemble']):
   scores = cross_val_score(clf, train_set_features, target, cv=5, scoring='accuracy')
   print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.8131 (+/- 0.0137) [SVM]
Accuracy: 0.8142 (+/- 0.0134) [Neural Network]
Accuracy: 0.8120 (+/- 0.0103) [Random Forest]
Accuracy: 0.8175 (+/- 0.0175) [Ensemble]


#### Bagging

In [54]:
bagging = BaggingClassifier(n_estimators=100, max_samples=0.5, max_features=0.5)
scores = cross_val_score(bagging, train_set_features, target)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

Accuracy: 0.8066 (+/- 0.0214)


#### Adaboost

In [38]:
aclf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(aclf, train_set_features, target)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

Accuracy: 0.7934 (+/- 0.0161)


#### Stochastic Gradient Boosting

In [47]:
gclf = GradientBoostingClassifier(n_estimators=700, min_samples_split=10, min_samples_leaf=1, max_features='auto', random_state=1)
scores = cross_val_score(gclf, train_set_features, target)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

Accuracy: 0.7945 (+/- 0.0204)


#### Predict

In [41]:
eclf.fit(train_set_features, target)
target_predicted = eclf.predict(test_set_features)

#### Write to csv

In [42]:
output = pd.DataFrame({"PassengerId": test_set.PassengerID.values, 
              "Survived": target_predicted})
output.to_csv("submission_voting_classifier_normalized_f.csv", index=False)