In [20]:
#Creating a model for predictinf if a passenger will survive on titanic

In [1]:
import os
import numpy as np
import pandas as pd

from statsmodels.formula.api import ols

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import seaborn as sns

%matplotlib inline

In [2]:
#Dataset ready to use for predicting 
titanic_preproc = pd.read_csv(r'C:\Users\sofia\Desktop\Python\titanic_preproc_P25.csv', 
                              index_col=0)

In [3]:
y = sns.load_dataset('titanic').survived.values

In [4]:
#Deviding data into test and train

X_train, X_test, y_train, y_test = train_test_split(
    titanic_preproc.values, y, test_size=0.2, random_state=6)

In [5]:
titanic_preproc.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'alone_True',
       'who_man', 'who_woman'],
      dtype='object')

In [6]:
#Use the StandardScaler and its method .fit_transform() to quickly standardize
#each variable (column) in the training and test sets.
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.fit_transform(X_test)

In [8]:
X_test_std[:,0]

array([-1.63610168,  0.80441666,  0.80441666, -0.41584251, -1.63610168,
        0.80441666,  0.80441666,  0.80441666,  0.80441666,  0.80441666,
       -0.41584251,  0.80441666, -0.41584251,  0.80441666, -1.63610168,
        0.80441666,  0.80441666,  0.80441666, -0.41584251,  0.80441666,
        0.80441666,  0.80441666, -1.63610168, -1.63610168, -1.63610168,
       -0.41584251,  0.80441666, -0.41584251, -0.41584251,  0.80441666,
       -1.63610168, -0.41584251, -0.41584251,  0.80441666,  0.80441666,
        0.80441666,  0.80441666, -0.41584251,  0.80441666,  0.80441666,
       -1.63610168,  0.80441666,  0.80441666,  0.80441666,  0.80441666,
        0.80441666,  0.80441666,  0.80441666, -0.41584251, -0.41584251,
       -1.63610168,  0.80441666,  0.80441666,  0.80441666,  0.80441666,
        0.80441666, -0.41584251,  0.80441666, -0.41584251, -1.63610168,
        0.80441666,  0.80441666,  0.80441666,  0.80441666, -1.63610168,
       -0.41584251, -1.63610168,  0.80441666,  0.80441666,  0.80

In [9]:
#We adjust the logistic regression to the standardised data:
logistic_model = LogisticRegression()
logistic_model.fit(X_train_std, y_train)

LogisticRegression()

In [10]:
# we test for correctness on test data
predictions = logistic_model.predict(X_test_std)

In [11]:
(predictions == y_test).mean()

0.8603351955307262

In [12]:
#In the case of the titanic data, we will not achieve a higher correctness than 81.7%
#by logistic regression, therefore we will try a different classifier - the support vector machine (SVM).

from sklearn.svm import SVC
d= {}
for gm in [0.001, 0.1, 0.5, 0.75, 0.9, 1, 1.5, 2, 3, 5, 10, 25]:
    for c in [0.5,1,1.5,2,2.5, 3,5,10,25,50,100,500]:
        svc = SVC(gamma = gm, C = c)
        svc.fit(X_train_std, y_train) # using train data
        d[(gm,c)] = (svc.predict(X_test_std) == y_test).mean() # using test data

svc = SVC(gamma=max(d, key=d.get)[0], C = max(d, key=d.get)[1]) # best accuracy for these parameters
svc.fit(X_train_std, y_train) # using train data
np.mean(svc.predict(X_test_std) == y_test) # using test data

0.8770949720670391

In [13]:
svc.gamma

0.1

In [14]:
svc.C

1.5

In [15]:
max(d.values()), max(d, key=d.get)

(0.8770949720670391, (0.1, 1.5))

In [16]:
# Or we can use an automatic selection of hyperparameters

parameters = {'gamma': [0.01, 0.1, 1, 2, 5], 'C': [0.1, 1, 10, 100, 1000]}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(SVC(), parameters, cv=5)
clf.fit(X_train_std, y_train)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [0.01, 0.1, 1, 2, 5]})

In [17]:
#Cheching the best parameters
clf.best_params_

{'C': 10, 'gamma': 0.01}

In [18]:
np.mean(clf.predict(X_test_std) == y_test)

0.8659217877094972

In [19]:
#Although we used a more complicated classifier, we achieved only a slightly higher correctness
#than with relatively simple logistic regression. Sometimes the dependencies present
#in the data are not so complex that it is necessary to use complex algorithms.
