# Modèle de machine learning

In [1]:
import pandas as pd
import numpy as np

Importer les données et construire le fichier global

In [2]:
white = pd.read_csv("./data/winequality-white.csv",sep=";")
red = pd.read_csv("./data/winequality-red.csv",sep=";")


In [3]:
white["color"]="W"
red["color"]="R"

In [4]:
data = pd.concat([white,red])

In [9]:
data.shape

(6497, 13)

In [8]:
data.groupby("color").mean().T

color,R,W
fixed acidity,8.319637,6.854788
volatile acidity,0.527821,0.278241
citric acid,0.270976,0.334192
residual sugar,2.538806,6.391415
chlorides,0.087467,0.045772
free sulfur dioxide,15.874922,35.308085
total sulfur dioxide,46.467792,138.360657
density,0.996747,0.994027
pH,3.311113,3.188267
sulphates,0.658149,0.489847


On commence par séparer données d'apprentissage et de test

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 13 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
color                   6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 710.6+ KB


In [27]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder

In [19]:
# création d'un objet de la classe LabelEncoder
wine_trans = LabelEncoder()

In [20]:
# transformation des données
data['color']=wine_trans.fit_transform(data['color'])

In [22]:
data['color'] = wine_trans.inverse_transform(data['color'])

In [24]:
data['color']= wine_trans.transform(data['color'])

In [26]:
wine_trans.classes_

array(['R', 'W'], dtype=object)

In [28]:
x = data.drop("color",axis=1)

In [29]:
y = data["color"]

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, stratify = y)

In [36]:
y_train.value_counts(normalize=True)

1    0.753904
0    0.246096
Name: color, dtype: float64

In [37]:
y.value_counts(normalize=True)

1    0.753886
0    0.246114
Name: color, dtype: float64

In [38]:
y_test.value_counts(normalize=True)

1    0.753846
0    0.246154
Name: color, dtype: float64

# Choisir un modèle de machine learning

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [40]:
# on crée un objet à partir de la classe du modèle
modele_knn = KNeighborsClassifier()

In [42]:
# on ajute les paramètres du modèle en utilisant les données
%time modele_knn.fit(x_train,y_train)

Wall time: 9.01 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [44]:
wine_trans.inverse_transform(modele_knn.predict(x_test))

array(['W', 'W', 'W', ..., 'W', 'W', 'R'], dtype=object)

In [45]:
(y_test == modele_knn.predict(x_test)).sum()/len(y_test)

0.9476923076923077

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_auc_score

In [47]:
accuracy_score(y_test,modele_knn.predict(x_test))

0.9476923076923077

## Automatisation des traitements

In [49]:
dico_modele = dict(knn=KNeighborsClassifier(),svm=SVC(),
                  rf = RandomForestClassifier(), gbm = GradientBoostingClassifier(),
                  nb = GaussianNB())

In [59]:
from sklearn.externals import joblib

In [62]:
for nom_modele in dico_modele.keys():
    modele = dico_modele[nom_modele]
    modele.fit(x_train,y_train)
    print("Matrice de confusion pour le modèle {} :".format(nom_modele),
         confusion_matrix(y_test, modele.predict(x_test)),sep="\n")
    print("% de bien classés pour le modèle {} :".format(nom_modele),
         accuracy_score(y_test, modele.predict(x_test)))
    joblib.dump(modele,str(nom_modele)+".pkl")

Matrice de confusion pour le modèle knn :
[[ 409   71]
 [  31 1439]]
% de bien classés pour le modèle knn : 0.9476923076923077




Matrice de confusion pour le modèle svm :
[[ 398   82]
 [  14 1456]]
% de bien classés pour le modèle svm : 0.9507692307692308
Matrice de confusion pour le modèle rf :
[[ 470   10]
 [   6 1464]]
% de bien classés pour le modèle rf : 0.9917948717948718
Matrice de confusion pour le modèle gbm :
[[ 469   11]
 [   6 1464]]
% de bien classés pour le modèle gbm : 0.9912820512820513
Matrice de confusion pour le modèle nb :
[[ 459   21]
 [  26 1444]]
% de bien classés pour le modèle nb : 0.9758974358974359
