## Machine learning sur le titanic

In [1]:
import pandas as pd
import numpy as np

On importe les données

In [2]:
titanic = pd.read_csv("./data/titanic_train.csv")

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


On sélectionne les colonnes de x

In [4]:
x = titanic.drop(["PassengerId","Survived","Name","Ticket"],axis=1)

In [5]:
y = titanic["Survived"]

On simplifie la colonne `Cabin`

In [6]:
x["Cabin"]=x["Cabin"].str[0].fillna("No").replace({"T":"No","G":"No"})#.replace("G","No")

In [7]:
# on transforme toutes colonnes quali en binaires
x = pd.get_dummies(x,columns=["Sex","Cabin","Embarked"])

In [8]:
def transfo(x):
    """ Cette fonction permet de transformer en binaires toutes les colonnes
    objet d'un DataFrame en utilisant get_dummies()
    """
    list_col_quali =[]
    for col in x.columns:
        if x[col].dtype == object:
            list_col_quali.append(col)
    print(list_col_quali)        
    return pd.get_dummies(x,columns=list_col_quali)

In [9]:
x = transfo(x)

[]


In [10]:
# on remplace par la médiane
x["Age"]=x["Age"].fillna(x["Age"].median())

# Séparation apprentissage / test

In [11]:
from sklearn.model_selection import train_test_split

On veut découper nos données en train / test

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

In [13]:
print(x_train.shape, x_test.shape)

(623, 17) (268, 17)


On va construire et estimer des modèles de ML

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

In [15]:
dico_modeles = dict(logit=LogisticRegression(),
                    rf=RandomForestClassifier(n_estimators=1000),
                    gbm=GradientBoostingClassifier(),
                    knn = KNeighborsClassifier(),
                    rn = MLPClassifier()
                   )

In [16]:
for modele in dico_modeles.keys():
    dico_modeles[modele].fit(x_train,y_train)
    y_predict = dico_modeles[modele].predict(x_test)
    y_predict_proba = dico_modeles[modele].predict_proba(x_test)
    print("Matrice de confusion pour modèle {} ".format(modele), confusion_matrix(y_test,y_predict),sep="\n")
    print("Auc pour modèle {} ".format(modele) ,roc_auc_score(y_test,y_predict_proba[:,1] ))
    print("Accuracy pour modèle {} ".format(modele), accuracy_score(y_test,y_predict))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Matrice de confusion pour modèle logit 
[[144  22]
 [ 29  73]]
Auc pour modèle logit  0.8667906921804868
Accuracy pour modèle logit  0.8097014925373134
Matrice de confusion pour modèle rf 
[[139  27]
 [ 32  70]]
Auc pour modèle rf  0.8526163477439168
Accuracy pour modèle rf  0.7798507462686567
Matrice de confusion pour modèle gbm 
[[149  17]
 [ 31  71]]
Auc pour modèle gbm  0.8713973541223718
Accuracy pour modèle gbm  0.8208955223880597
Matrice de confusion pour modèle knn 
[[138  28]
 [ 43  59]]
Auc pour modèle knn  0.7614575950862272
Accuracy pour modèle knn  0.7350746268656716
Matrice de confusion pour modèle rn 
[[145  21]
 [ 30  72]]
Auc pour modèle rn  0.8719879518072289
Accuracy pour modèle rn  0.8097014925373134


In [17]:
pd.DataFrame(dico_modeles['rf'].feature_importances_,index=x.columns,
             columns=["importance"]).sort_values("importance",ascending = False)

Unnamed: 0,importance
Fare,0.229475
Age,0.224378
Sex_female,0.131263
Sex_male,0.129922
Pclass,0.083134
SibSp,0.048526
Parch,0.038908
Cabin_No,0.037683
Embarked_S,0.016126
Embarked_C,0.013205


On va rechercher les hyper-paramètres du modèle en utilisant une grille

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
# on construit la grille de paramètres
param = dict(n_estimators=[10,100,1000], max_depth=[3,5,7,9])

# on crée un objet de la classe GridSearchCV
modele_grid= GridSearchCV(RandomForestClassifier(),param,scoring="roc_auc",cv=4)

In [20]:
modele_grid.fit(x_train,y_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [21]:
modele_grid.best_score_

0.8589023665935672

In [22]:
modele_grid.best_params_

{'max_depth': 5, 'n_estimators': 10}

In [23]:
pd.DataFrame(modele_grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034592,0.000743,0.005984,6.610475e-07,3,10,"{'max_depth': 3, 'n_estimators': 10}",0.828559,0.863021,0.867535,0.839474,0.849647,0.016179,12
1,0.260054,0.002855,0.018451,0.004319902,3,100,"{'max_depth': 3, 'n_estimators': 100}",0.823958,0.865625,0.88342,0.838772,0.852944,0.023079,9
2,2.56766,0.077274,0.18248,0.009734021,3,1000,"{'max_depth': 3, 'n_estimators': 1000}",0.81901,0.867882,0.886806,0.840614,0.853578,0.025844,8
3,0.032911,0.001995,0.006223,0.0004140199,5,10,"{'max_depth': 5, 'n_estimators': 10}",0.836892,0.863889,0.891319,0.843509,0.858902,0.021196,1
4,0.2763,0.007619,0.021661,0.0005070139,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.826823,0.873177,0.872569,0.834211,0.851695,0.02134,11
5,2.804983,0.069596,0.195756,0.008802424,5,1000,"{'max_depth': 5, 'n_estimators': 1000}",0.834288,0.873264,0.878819,0.841404,0.856944,0.019363,6
6,0.031668,0.000436,0.005747,0.0004374323,7,10,"{'max_depth': 7, 'n_estimators': 10}",0.864323,0.848438,0.876128,0.843246,0.858034,0.013017,4
7,0.281756,0.000488,0.022418,0.0004775477,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.845226,0.871788,0.872396,0.843596,0.858252,0.013854,2
8,2.866598,0.065687,0.196553,0.009533576,7,1000,"{'max_depth': 7, 'n_estimators': 1000}",0.836372,0.870486,0.868056,0.848947,0.855965,0.014055,7
9,0.033679,0.001297,0.005302,0.0004016295,9,10,"{'max_depth': 9, 'n_estimators': 10}",0.822222,0.86224,0.862674,0.862105,0.85231,0.017373,10


Si on veut exporter un modèle, on peut utiliser :

In [24]:
from sklearn.externals import joblib



In [25]:
joblib.dump(modele_grid,"modele_grid.pkl")

['modele_grid.pkl']

## Construction d'un pipeline

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [27]:
# création d'un objet de la classe Pipeline
mon_pipe = Pipeline(steps=[("acp",PCA(n_components=4)),("svm",SVC())])

In [28]:
mon_pipe.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('acp',
                 PCA(copy=True, iterated_power='auto', n_components=4,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [29]:
confusion_matrix(y_test,mon_pipe.predict(x_test))

array([[152,  14],
       [ 77,  25]], dtype=int64)