# Analyse des données du Titanic

![image](./images/titanic.jpg)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Importer les données :

In [2]:
titanic = pd.read_csv("./data/titanic_train.csv")

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [5]:
titanic.groupby("Pclass")["Fare"].mean()

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64

In [6]:
titanic["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

Importer les méthodes à utiliser pour prédire

In [7]:
# importation des méthodes statistiques
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [8]:
# on crée un objet à partir de la classe
modele_logit = LogisticRegression()
modele_rf = RandomForestClassifier()

In [9]:
# transformation de la colonne embarked pour l'intégrer dans le modèle
titanic_embarked = pd.get_dummies(titanic["Embarked"])

In [10]:
titanic = pd.concat([titanic,titanic_embarked],axis=1)

In [11]:
titanic.shape

(891, 15)

In [12]:
# on construit X et y
y = titanic["Survived"]
# on séléctinne les colonnes numériques
x = titanic.select_dtypes(np.number).drop(['PassengerId', 'Survived'],
                                         axis=1)

In [13]:
# comme on a des données manquantes pour l'âge, 
# on remplace les manquants par l'âge médian
x["Age"]=x["Age"].fillna(x["Age"].median())

On sépare les données en échantillon apprentissage / test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test,y_train,y_test = train_test_split(x,y)

On ajuste le modèle en utilisant les données

In [16]:
modele_logit.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
modele_rf.fit(x_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
modele_logit.coef_

array([[-0.84793047, -0.03232591, -0.20897783,  0.24589285,  0.0069688 ,
         0.51514387,  0.80928491, -0.12021609]])

On veut prédire sur les données de test

In [19]:
from sklearn.metrics import accuracy_score, auc, confusion_matrix

In [20]:
# on calcule le pourcentage de bien callsés
accuracy_score(y_test,modele_logit.predict(x_test))

0.7130044843049327

In [21]:
accuracy_score(y_test,modele_rf.predict(x_test))

0.6860986547085202

In [22]:
# on calcule la matrice de vconfusion
confusion_matrix(y_test,modele_logit.predict(x_test))

array([[121,  20],
       [ 44,  38]], dtype=int64)

In [23]:
confusion_matrix(y_test,modele_rf.predict(x_test))

array([[109,  32],
       [ 38,  44]], dtype=int64)

On prédit à partir du modèle

In [24]:
x_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,C,Q,S
45,3,28.0,0,0,8.0500,0,0,1
77,3,28.0,0,0,8.0500,0,0,1
615,2,24.0,1,2,65.0000,0,0,1
574,3,16.0,0,0,8.0500,0,0,1
113,3,20.0,1,0,9.8250,0,0,1
133,2,29.0,1,0,26.0000,0,0,1
178,2,30.0,0,0,13.0000,0,0,1
727,3,28.0,0,0,7.7375,0,1,0
173,3,21.0,0,0,7.9250,0,0,1
855,3,18.0,0,1,9.3500,0,0,1


In [25]:
nouveau_passager = np.array([1,50,0,0,150,1,0,0]).reshape(1, -1)

In [26]:
modele_rf.predict_proba(nouveau_passager)

array([[0.3, 0.7]])