In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Cleaning and organising data

In [None]:
# Load the passenger table; 'titanic.csv' is resolved relative to the kernel's
# current working directory.
titanic = pd.read_csv('titanic.csv')

In [None]:
# Impute missing ages with the mean age. The filled Series is assigned back to
# the column instead of calling fillna(inplace=True) on titanic['Age']: the
# in-place call operates on an intermediate object (chained assignment), which
# pandas warns about and which no longer updates `titanic` under copy-on-write.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

In [None]:
# catplot is a figure-level seaborn function: every call builds its own figure,
# so no plt.figure(...) call is needed first (it only left an empty figure).
# Point plot of Survived for each Embarked value.
sns.catplot(x='Embarked', y='Survived', data=titanic, kind='point', aspect=2)
# Point plot of Fare for each Embarked value; the trailing ';' keeps the
# FacetGrid repr out of the cell output.
sns.catplot(x='Embarked', y='Fare', data=titanic, kind='point', aspect=2);

In [None]:
# Combine the SibSp and Parch counts into a single family-size feature.
titanic['fam_count'] = titanic['SibSp'].add(titanic['Parch'])

In [None]:
# Survival rate split by whether the Cabin value is missing. 'Survived' is
# selected before aggregating so only that column is averaged: the frame still
# holds text columns at this point, and GroupBy.mean() raises on those in
# pandas >= 2.0.
titanic.groupby(titanic['Cabin'].isnull())['Survived'].mean()

In [None]:
# Binary indicator: 1 when a Cabin value is recorded, 0 when it is missing.
titanic['Cabin_ind'] = titanic['Cabin'].notnull().astype(int)

In [None]:
# Encode the two Sex labels as integers; any label missing from the lookup
# raises KeyError rather than being silently passed through.
gen_num = dict(male=0, female=1)
titanic['Sex'] = titanic['Sex'].apply(lambda label: gen_num[label])

In [None]:
# Remove the columns that are not fed to the models. SibSp/Parch and Cabin were
# already condensed into fam_count and Cabin_ind in earlier cells.
titanic.drop(
    columns=[
        'PassengerId',
        'Name',
        'SibSp',
        'Parch',
        'Ticket',
        'Embarked',
        'Cabin',
    ],
    inplace=True,
)

In [None]:
# Separate the feature matrix from the target column.
X = titanic.drop(columns='Survived')
y = titanic['Survived']

## Splitting dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# First split: 60 % of the rows go to training, 40 % are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)
# Second split: the held-out 40 % is halved into test and validation sets.
# X_test / y_test are rebound here to the first half of the hold-out.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=1)

In [None]:
# Fraction of all rows in the train, test and validation sets (in that order).
tuple(len(part) / len(X) for part in (X_train, X_test, X_val))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
# Silence FutureWarning and DeprecationWarning for the rest of the session so
# they do not clutter the model-fitting output; other warning categories are
# unaffected.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Search space: inverse regularisation strength C over seven powers of ten.
parameters = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
lr = LogisticRegression()

In [None]:
# 3-fold cross-validated grid search over `parameters`; the fitted search
# object is the cell's displayed value.
models = GridSearchCV(estimator=lr, param_grid=parameters, cv=3)
models.fit(X_train, y_train.to_numpy().ravel())

In [None]:
def print_results(results):
    """Print the winning parameter set and the CV score of every candidate.

    Args:
        results: a fitted search object exposing ``best_params_`` and a
            ``cv_results_`` mapping with ``mean_test_score``,
            ``std_test_score`` and ``params`` entries.

    One line is printed per candidate: the mean test score and twice its
    standard deviation, both rounded to three decimals, followed by the
    candidate's parameters.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    cv = results.cv_results_
    rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, candidate in rows:
        line = '{} (+/-{}) for {}'.format(
            round(avg, 3), round(spread * 2, 3), candidate)
        print(line)

In [None]:
# Report the logistic-regression grid search: best parameters, then the mean
# CV score of every C value tried.
print_results(models)

In [None]:
# Keep a reference to the best logistic-regression estimator now: `models` is
# reassigned by the next grid search further down.
LR_model = models.best_estimator_

In [None]:
import joblib
# Persist the best logistic-regression estimator to 'LR_model.pkl' in the
# working directory.
joblib.dump(models.best_estimator_, 'LR_model.pkl')

## Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Search space: three values of C crossed with the linear and RBF kernels.
parameters = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])
sv = SVC()

In [None]:
# 3-fold cross-validated grid search for the SVC; this rebinds `models`, so the
# previous search object is no longer reachable under that name.
models = GridSearchCV(estimator=sv, param_grid=parameters, cv=3)
models.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Report the SVC grid search: best parameters, then the mean CV score of every
# C / kernel combination.
print_results(models)

In [None]:
# Keep a reference to the best SVC before `models` is reassigned by the next
# grid search.
SVC_model=models.best_estimator_

In [None]:
# Persist the best SVC to 'SVM_model.pkl' in the working directory.
joblib.dump(models.best_estimator_, 'SVM_model.pkl')

## Multilayer Perceptron

In [142]:
from sklearn.neural_network import MLPClassifier

In [143]:
# MLP training is stochastic (random weight initialisation and batch
# shuffling). Fixing random_state makes the grid-search scores and the selected
# best parameters reproducible after a kernel restart.
mlp = MLPClassifier(max_iter=1000, random_state=0)
# Search space: one hidden layer of three widths x three activations x three
# learning-rate schedules (27 candidates).
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

In [144]:
# 5-fold cross-validated grid search for the MLP; rebinds `models`.
models = GridSearchCV(estimator=mlp, param_grid=parameters, cv=5)
models.fit(X_train, y_train.to_numpy().ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=1000, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_stat...fle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'activa

In [145]:
# Report the MLP grid search: best parameters, then the mean CV score of every
# candidate configuration.
print_results(models)

BEST PARAMS: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}

0.8 (+/-0.063) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.785 (+/-0.046) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.79 (+/-0.034) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.773 (+/-0.053) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.787 (+/-0.071) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.783 (+/-0.057) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.779 (+/-0.05) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.781 (+/-0.073) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.792 (+/-0.059) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learnin

In [146]:
# Keep a reference to the best MLP before `models` is reassigned by the next
# grid search.
MLP_model=models.best_estimator_

In [147]:
# Persist the best MLP to 'MLP_model.pkl' in the working directory.
joblib.dump(models.best_estimator_,'MLP_model.pkl')

['MLP_model.pkl']

## Random Forest

In [152]:
from sklearn.ensemble import RandomForestClassifier

In [153]:
# RandomForestClassifier(),RandomForestRegressor()

In [154]:
# Random forests bootstrap rows and sample features at random. Fixing
# random_state makes the grid-search scores and the selected best parameters
# reproducible after a kernel restart.
rf = RandomForestClassifier(random_state=0)
# Search space: number of trees crossed with maximum tree depth (15 candidates).
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 6, 8, 10],
}

In [155]:
# 5-fold cross-validated grid search for the random forest; rebinds `models`.
models = GridSearchCV(estimator=rf, param_grid=parameters, cv=5)
models.fit(X_train, y_train.to_numpy().ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [156]:
# Report the random-forest grid search: best parameters, then the mean CV score
# of every max_depth / n_estimators combination.
print_results(models)

BEST PARAMS: {'max_depth': 6, 'n_estimators': 50}

0.787 (+/-0.079) for {'max_depth': 2, 'n_estimators': 5}
0.777 (+/-0.047) for {'max_depth': 2, 'n_estimators': 50}
0.772 (+/-0.023) for {'max_depth': 2, 'n_estimators': 250}
0.798 (+/-0.071) for {'max_depth': 4, 'n_estimators': 5}
0.805 (+/-0.059) for {'max_depth': 4, 'n_estimators': 50}
0.811 (+/-0.05) for {'max_depth': 4, 'n_estimators': 250}
0.803 (+/-0.063) for {'max_depth': 6, 'n_estimators': 5}
0.822 (+/-0.051) for {'max_depth': 6, 'n_estimators': 50}
0.822 (+/-0.056) for {'max_depth': 6, 'n_estimators': 250}
0.805 (+/-0.023) for {'max_depth': 8, 'n_estimators': 5}
0.818 (+/-0.048) for {'max_depth': 8, 'n_estimators': 50}
0.815 (+/-0.052) for {'max_depth': 8, 'n_estimators': 250}
0.794 (+/-0.04) for {'max_depth': 10, 'n_estimators': 5}
0.807 (+/-0.048) for {'max_depth': 10, 'n_estimators': 50}
0.815 (+/-0.041) for {'max_depth': 10, 'n_estimators': 250}


In [157]:
# Keep a reference to the best random forest before `models` is reassigned by
# the next grid search.
RF_model=models.best_estimator_

In [None]:
# Persist only the refit best estimator, as is done for every other model in
# this notebook. Dumping `models` itself would pickle the whole GridSearchCV
# wrapper, so 'RF_model.pkl' would hold a different kind of object than the
# other *_model.pkl files.
joblib.dump(models.best_estimator_, 'RF_model.pkl')

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor

In [None]:
# Build one default classifier and one default regressor just to display them
# as the cell output; nothing is assigned or reused later.
GradientBoostingClassifier(),GradientBoostingRegressor()

In [None]:
# Fix random_state so repeated runs of the search give the same scores and the
# same selected parameters.
gb = GradientBoostingClassifier(random_state=0)
# Search space: learning rate x number of boosting stages x tree depth
# (5 * 4 * 5 = 100 candidates, each fitted once per CV fold).
parameters = {
    'learning_rate': [0.01, 0.1, 1, 10, 100],
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
}

In [None]:
# 5-fold cross-validated grid search for gradient boosting; rebinds `models`.
models = GridSearchCV(estimator=gb, param_grid=parameters, cv=5)
models.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Report the gradient-boosting grid search: best parameters, then the mean CV
# score of every candidate configuration.
print_results(models)

In [None]:
# Keep a reference to the best gradient-boosting estimator for the evaluation
# section.
GB_model=models.best_estimator_

In [None]:
# Persist the best gradient-boosting estimator to 'GB_model.pkl' in the working
# directory.
joblib.dump(models.best_estimator_,'GB_model.pkl')

## Evaluating Models

In [148]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
from time import time

In [149]:
def evaluate_model(name, model, X_val, y_val):
    """Print accuracy, precision and recall of ``model`` on the given data.

    Args:
        name: label printed in front of the scores.
        model: fitted estimator exposing ``predict``.
        X_val: feature rows to predict on.
        y_val: true labels corresponding to ``X_val``.

    Nothing is returned; the three unrounded scores are written to stdout on a
    single line.
    """
    predicted = model.predict(X_val)
    # Same metric order as the output line: accuracy, precision, recall.
    scores = [metric(y_val, predicted)
              for metric in (accuracy_score, precision_score, recall_score)]
    print("{}: Accuracy:{} Precision:{} Recall:{}".format(name, *scores))

In [150]:
# Score every tuned model on the validation split, one output line per model.
for _label, _fitted in (('LR', LR_model), ('SVM', SVC_model), ('MLP', MLP_model),
                        ('RF', RF_model), ('GB', GB_model)):
    evaluate_model(_label, _fitted, X_val, y_val)

LR: Accuracy:0.8100558659217877 Precision:0.859375 Recall:0.6875
SVM: Accuracy:0.7821229050279329 Precision:0.847457627118644 Recall:0.625
MLP: Accuracy:0.7932960893854749 Precision:0.8412698412698413 Recall:0.6625
RF: Accuracy:0.8100558659217877 Precision:0.9107142857142857 Recall:0.6375
GB: Accuracy:0.8268156424581006 Precision:0.9152542372881356 Recall:0.675


In [151]:
# Score every tuned model on the test split, one output line per model.
for _label, _fitted in (('LR', LR_model), ('SVM', SVC_model), ('MLP', MLP_model),
                        ('RF', RF_model), ('GB', GB_model)):
    evaluate_model(_label, _fitted, X_test, y_test)

LR: Accuracy:0.8033707865168539 Precision:0.6666666666666666 Recall:0.75
SVM: Accuracy:0.7752808988764045 Precision:0.6212121212121212 Recall:0.7321428571428571
MLP: Accuracy:0.7752808988764045 Precision:0.625 Recall:0.7142857142857143
RF: Accuracy:0.8089887640449438 Precision:0.6896551724137931 Recall:0.7142857142857143
GB: Accuracy:0.8033707865168539 Precision:0.6842105263157895 Recall:0.6964285714285714
