# Un premier modèle avec scikit-learn

In [1]:
import numpy as np
import pandas as pd

In [2]:
titanic = pd.read_csv("./data/titanic_train.csv")

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Panula, Master. Eino Viljami",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


## On sépare les données en apprentissage / test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
y = titanic["Survived"]
x = titanic[["Pclass","Sex","SibSp","Parch","Fare"]]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [31]:
# préparation des données
from sklearn.preprocessing import LabelEncoder

In [32]:
# on crée un objet à partir de la classe
transfo_sex = LabelEncoder()

In [33]:
# on transforme la colonne sex en colonne numérique
x_train["Sex"]=transfo_sex.fit_transform(x_train["Sex"])
x_test["Sex"]=transfo_sex.transform(x_test["Sex"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
transfo_sex.classes_

array(['female', 'male'], dtype=object)

In [35]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 837 to 801
Data columns (total 5 columns):
Pclass    712 non-null int64
Sex       712 non-null int32
SibSp     712 non-null int64
Parch     712 non-null int64
Fare      712 non-null float64
dtypes: float64(1), int32(1), int64(3)
memory usage: 30.6 KB


## Les modèles de ML

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [37]:
# on construit un object à partir de la classe du modèle
dico_modeles = {"logit": LogisticRegression(),
               "rf": RandomForestClassifier(),
               "gbm":GradientBoostingClassifier(),
               "knn": KNeighborsClassifier()}

In [38]:
def ajustement_modeles(dico_modeles,x_train, y_train, x_test, y_test):
    """ Cette fonction permet d'ajuster et de calculer le % de bien
    classés pour chaque modèle du dictionnaire"""
    
    for modele in dico_modeles.keys():
        dico_modeles[modele].fit(x_train, y_train)
        print("Pourcentage de bien classés pour le modèle {}".format(modele),
              accuracy_score(y_test, dico_modeles[modele].predict(x_test)))
        print("AUC pour le modèle {}".format(modele),
              roc_auc_score(y_test,dico_modeles[modele].predict_proba(x_test)[:,1]))

In [42]:
ajustement_modeles(dico_modeles,x_train,y_train,
                  x_test,y_test)



Pourcentage de bien classés pour le modèle logit 0.7932960893854749
AUC pour le modèle logit 0.8089068825910931
Pourcentage de bien classés pour le modèle rf 0.8491620111731844
AUC pour le modèle rf 0.8838056680161943
Pourcentage de bien classés pour le modèle gbm 0.8268156424581006
AUC pour le modèle gbm 0.8791497975708502
Pourcentage de bien classés pour le modèle knn 0.7430167597765364
AUC pour le modèle knn 0.7988529014844805


## Validation du modèle

In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [41]:
print("Pourcentage de bien classés :", 
      accuracy_score(y_test, modele_logit.predict(x_test)))

Pourcentage de bien classés : 0.7821229050279329


In [44]:
print("Matrice de confusion :", 
      confusion_matrix(y_test, modele_logit.predict(x_test)),sep="\n")

Matrice de confusion :
[[98 14]
 [25 42]]


# Ajustement des hyper-paramètres

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
dico_param = dict(n_estimators=[10,50,100,1000],
                  max_depth=[None,5,7,9])

In [47]:
dico_param = {"n_estimators":[10,50,100,1000],
                  "max_depth":[None,5,7,9]}

In [48]:
# on crée un objet
grid_rf = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid=dico_param,
                    scoring="roc_auc", cv=3)

In [49]:
grid_rf.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [50]:
grid_rf.best_params_

{'max_depth': 5, 'n_estimators': 50}

In [51]:
pd.DataFrame(grid_rf.cv_results_)[["mean_test_score","params"]]

Unnamed: 0,mean_test_score,params
0,0.800928,"{'max_depth': None, 'n_estimators': 10}"
1,0.812914,"{'max_depth': None, 'n_estimators': 50}"
2,0.812179,"{'max_depth': None, 'n_estimators': 100}"
3,0.81,"{'max_depth': None, 'n_estimators': 1000}"
4,0.833873,"{'max_depth': 5, 'n_estimators': 10}"
5,0.840138,"{'max_depth': 5, 'n_estimators': 50}"
6,0.837796,"{'max_depth': 5, 'n_estimators': 100}"
7,0.840009,"{'max_depth': 5, 'n_estimators': 1000}"
8,0.824758,"{'max_depth': 7, 'n_estimators': 10}"
9,0.826968,"{'max_depth': 7, 'n_estimators': 50}"


In [52]:
roc_auc_score(y_test,grid_rf.best_estimator_.predict_proba(x_test)[:,1])

0.8742240215924426