# Analyse des données du titanic

Dans ce notebook, nous analysons les données du titanic pour construire un modèle prédictif. L'objectif est de prédire la survie des passagers.

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC

In [7]:
titanic_train=pd.read_csv("titanic_train.csv")

In [8]:
titanic_train.drop("Cabin",axis=1,inplace=True)

In [9]:
titanic_train.drop("Ticket",axis=1,inplace=True)

In [10]:
titanic_train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
titanic_train.drop("Embarked",axis=1,inplace=True)

Gestion des données manquantes

In [12]:
titanic_train=titanic_train.dropna()

In [13]:
titanic_train.shape

(714, 9)

Construire x_train et y_train

In [14]:
y_train=titanic_train["Survived"]

In [15]:
titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare'],
      dtype='object')

In [16]:
titanic_train["Sex_bin"]=np.where(titanic_train["Sex"]=="female",1,0)

In [17]:
x_train=titanic_train[['Pclass','Age', 'SibSp','Parch', 'Fare',"Sex_bin"]]

Ajuster un modèle SVM

In [18]:
from sklearn.svm import SVC

In [19]:
modele_svm1=SVC()

In [20]:
modele_svm1.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Calculer des indices de qualité sur la base titanic_train divisé en 2 parties

In [21]:
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.metrics import roc_auc_score,precision_score,accuracy_score

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
x_train_train,x_train_test, y_train_train,y_train_test = train_test_split(x_train,
                                                                         y_train,
                                                                         test_size=0.2)

In [24]:
modele_svm_valid=SVC(C=1)
modele_svm_valid.fit(x_train_train,y_train_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
y_modele_pred=modele_svm_valid.predict(x_train_test)

In [26]:
confusion_matrix(y_train_test,y_modele_pred)

array([[66, 21],
       [20, 36]], dtype=int64)

In [27]:
y_train_test.value_counts()

0    87
1    56
Name: Survived, dtype: int64

In [28]:
accuracy_score(y_train_test,y_modele_pred)

0.71328671328671334

In [29]:
recall_score(y_train_test,y_modele_pred)

0.6428571428571429

In [30]:
roc_auc_score(y_train_test,y_modele_pred)

0.70073891625615758

In [31]:
precision_score(y_train_test,y_modele_pred)

0.63157894736842102

Essayez de faire varier les hyperparamètres du modèle

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
dico_param={"kernel":["linear","rbf",], "C":[1,5]}

In [34]:
modele_svm_grid=GridSearchCV(SVC(),dico_param,scoring='accuracy',cv=5)

In [35]:
modele_svm_grid.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ['linear', 'rbf'], 'C': [1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [41]:
modele_svm_grid.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Appliquez le modèle trouvé sur les données de test

In [36]:
titanic_test=pd.read_csv("titanic_test.csv")

In [37]:
titanic_test=titanic_test.dropna()

In [38]:
titanic_test["Sex_bin"]=np.where(titanic_test["Sex"]=="female",1,0)

In [39]:
x_test=titanic_test[['Pclass','Age', 'SibSp','Parch', 'Fare',"Sex_bin"]]

In [40]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_bin
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,448.582633,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514,0.365546
std,259.119524,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893,0.481921
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.25,0.0,1.0,20.125,0.0,0.0,8.05,0.0
50%,445.0,0.0,2.0,28.0,0.0,0.0,15.7417,0.0
75%,677.75,1.0,3.0,38.0,1.0,1.0,33.375,1.0
max,891.0,1.0,3.0,80.0,5.0,6.0,512.3292,1.0


In [43]:
titanic_test["Surv_pred"]=modele_svm_grid.best_estimator_.predict(x_test)

In [45]:
titanic_test.Surv_pred.value_counts()

1    44
0    43
Name: Surv_pred, dtype: int64