## Modèle de machine learning

Premier exemple de classification en machine learning avec scikit-learn

In [1]:
import pandas as pd

In [2]:
titanic = pd.read_csv('./Data/titanic_train.csv')

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.shape

(891, 12)

In [5]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

On construit des échantillons d'apprentissage et de test

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test =train_test_split(titanic[['Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare']], titanic[['Survived']])

In [8]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
874,2,female,28.0,1,0,24.0
847,3,male,35.0,0,0,7.8958
773,3,male,,0,0,7.225
529,2,male,23.0,2,1,11.5
154,3,male,,0,0,7.3125


In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 874 to 888
Data columns (total 6 columns):
Pclass    668 non-null int64
Sex       668 non-null object
Age       532 non-null float64
SibSp     668 non-null int64
Parch     668 non-null int64
Fare      668 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 36.5+ KB


In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

In [11]:
tranform_sex = LabelEncoder()

In [12]:
x_train["Sex"] = tranform_sex.fit_transform(x_train["Sex"])

In [13]:
impute_moy_age = SimpleImputer()

In [14]:
x_train["Age"] = impute_moy_age.fit_transform(np.array(x_train["Age"]).reshape(-1, 1))

In [15]:
#x_train["Sex"] = tranform_sex.inverse_transform(x_train["Sex"])

In [16]:
y_train.Survived.value_counts()

0    423
1    245
Name: Survived, dtype: int64

In [17]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 874 to 888
Data columns (total 6 columns):
Pclass    668 non-null int64
Sex       668 non-null int32
Age       668 non-null float64
SibSp     668 non-null int64
Parch     668 non-null int64
Fare      668 non-null float64
dtypes: float64(2), int32(1), int64(3)
memory usage: 33.9 KB


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
modele_rf = RandomForestClassifier(n_estimators= 100)
modele_lin = LogisticRegression()
modele_nn = MLPClassifier()
modele_tree = DecisionTreeClassifier()

In [20]:
modele_rf.fit(x_train, y_train)
modele_lin.fit(x_train, y_train)
modele_nn.fit(x_train, y_train)
modele_tree.fit(x_train, y_train)

  """Entry point for launching an IPython kernel.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
pd.DataFrame(modele_rf.feature_importances_,index=x_train.columns)

Unnamed: 0,0
Pclass,0.090348
Sex,0.27661
Age,0.256917
SibSp,0.050182
Parch,0.042148
Fare,0.283796


## Validation du modèle

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [23]:
x_test["Age"] = impute_moy_age.transform(np.array(x_test["Age"]).reshape(-1, 1))
x_test["Sex"] = tranform_sex.transform(x_test["Sex"])

In [24]:
modele_rf.predict(x_test)

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0], dtype=int64)

In [25]:
print("RF",accuracy_score(y_test, modele_rf.predict(x_test)))
print("Logit",accuracy_score(y_test, modele_lin.predict(x_test)))
print("NN",accuracy_score(y_test, modele_nn.predict(x_test)))
print("Tree",accuracy_score(y_test, modele_tree.predict(x_test)))

RF 0.7713004484304933
Logit 0.7668161434977578
NN 0.7802690582959642
Tree 0.7488789237668162


In [26]:
print(confusion_matrix(y_test, modele_rf.predict(x_test)))
print(confusion_matrix(y_test, modele_lin.predict(x_test)))
print(confusion_matrix(y_test, modele_nn.predict(x_test)))
print(confusion_matrix(y_test, modele_tree.predict(x_test)))

[[106  20]
 [ 31  66]]
[[104  22]
 [ 30  67]]
[[105  21]
 [ 28  69]]
[[103  23]
 [ 33  64]]


In [27]:
from sklearn.metrics import classification_report

In [28]:
print(classification_report(y_test, modele_rf.predict(x_test)))
print(classification_report(y_test, modele_lin.predict(x_test)))
print(classification_report(y_test, modele_nn.predict(x_test)))
print(classification_report(y_test, modele_tree.predict(x_test)))

              precision    recall  f1-score   support

           0       0.77      0.84      0.81       126
           1       0.77      0.68      0.72        97

   micro avg       0.77      0.77      0.77       223
   macro avg       0.77      0.76      0.76       223
weighted avg       0.77      0.77      0.77       223

              precision    recall  f1-score   support

           0       0.78      0.83      0.80       126
           1       0.75      0.69      0.72        97

   micro avg       0.77      0.77      0.77       223
   macro avg       0.76      0.76      0.76       223
weighted avg       0.77      0.77      0.77       223

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       126
           1       0.77      0.71      0.74        97

   micro avg       0.78      0.78      0.78       223
   macro avg       0.78      0.77      0.77       223
weighted avg       0.78      0.78      0.78       223

              preci

## Grid search

In [108]:
from sklearn.model_selection import GridSearchCV

In [120]:
dico_param = {"n_estimators": [10,100, 1000],"max_depth":[4,9,None]}

In [121]:
modele_grid = GridSearchCV(RandomForestClassifier(),dico_param, scoring="accuracy",cv = 4)

In [122]:
modele_grid.fit(x_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 100, 1000], 'max_depth': [4, 9, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [123]:
modele_grid.best_params_

{'max_depth': 4, 'n_estimators': 100}

In [124]:
modele_grid.best_score_

0.8293413173652695

In [125]:
accuracy_score(y_test,modele_grid.best_estimator_.predict(x_test))

0.8116591928251121

In [126]:
from sklearn.externals import joblib

In [127]:
joblib.dump(modele_rf,'mon_modele.pkl')

['mon_modele.pkl']