Notebook comprenant un exemple de machine learning sur des données de vins

In [1]:
import pandas as pd
import numpy as np

In [2]:
# on importe les données
wine_red = pd.read_csv("./data/winequality-red.csv",sep=";")
wine_white = pd.read_csv("./data/winequality-white.csv",sep=";")

In [3]:
wine_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# créer la cible
wine_red["type"]="red"
wine_white["type"]="white"

In [5]:
# concaténer les dataframes
data_wine = pd.concat([wine_red,wine_white])

In [6]:
data_wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
type                    6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 710.6+ KB


In [7]:
data_wine["type"].value_counts()

white    4898
red      1599
Name: type, dtype: int64

On sépare les données en apprentissage / test

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x = data_wine.drop("type",axis=1)

In [10]:
y = data_wine["type"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

In [12]:
# on importe les modèles
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [13]:
# on crée des objets
modele_logit = LogisticRegression()
modele_foret = RandomForestClassifier(n_estimators=100)

In [14]:
# on apprend sur les données
%time modele_logit.fit(x_train, y_train)
%time modele_foret.fit(x_train, y_train)



Wall time: 47.9 ms
Wall time: 923 ms


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

On vérifie la qualité des modèles

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
print(accuracy_score(y_test, modele_logit.predict(x_test)))

0.9825641025641025


In [17]:
print(accuracy_score(y_test, modele_foret.predict(x_test)))

0.9953846153846154


In [18]:
print(confusion_matrix(y_test, modele_logit.predict(x_test)))

[[ 422   18]
 [  16 1494]]


In [19]:
print(confusion_matrix(y_test, modele_foret.predict(x_test)))

[[ 434    6]
 [   3 1507]]


On étudie les paramètres des modèles

In [20]:
pd.DataFrame(modele_logit.coef_,columns=x.columns).T

Unnamed: 0,0
fixed acidity,-0.666496
volatile acidity,-8.141555
citric acid,0.31188
residual sugar,0.147261
chlorides,-1.957785
free sulfur dioxide,-0.044721
total sulfur dioxide,0.060312
density,2.445164
pH,-1.809092
sulphates,-6.354658


In [21]:
pd.DataFrame(modele_foret.feature_importances_,index=x.columns)

Unnamed: 0,0
fixed acidity,0.050496
volatile acidity,0.127952
citric acid,0.020784
residual sugar,0.057378
chlorides,0.235706
free sulfur dioxide,0.050726
total sulfur dioxide,0.30119
density,0.061349
pH,0.019776
sulphates,0.062696


On essaye de prédire à partir du modèle 

In [22]:
modele_foret.predict(np.random.random(size=12).reshape(1, -1))

array(['white'], dtype=object)

In [23]:
modele_foret.predict(x.iloc[10:12])

array(['red', 'red'], dtype=object)

In [24]:
modele_foret.predict_proba(x.iloc[10:12])

array([[1.  , 0.  ],
       [0.99, 0.01]])

# Modèle de régression

On cherche à prédire la qualité du vins à partir de ses caractéristiques chimiques

In [25]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

In [26]:
x_reg = data_wine.drop(["type","quality"],axis=1)

In [27]:
y_reg = data_wine["quality"]

In [28]:
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x_reg,y_reg,test_size = 0.3)

In [29]:
modele_ridge = Ridge()
modele_knn = KNeighborsRegressor()

In [30]:
%time modele_ridge.fit(x_train_reg,y_train_reg)
%time modele_knn.fit(x_train_reg,y_train_reg)

Wall time: 21.9 ms
Wall time: 12 ms


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [31]:
from sklearn.metrics import r2_score, mean_squared_error

# racine de la moyenne des carrés des erreurs
print(np.sqrt(mean_squared_error(y_test_reg,modele_knn.predict(x_test_reg))))

print(r2_score(y_test_reg,modele_knn.predict(x_test_reg)))

0.8124543379674031
0.12718710748905682


Le modèle obtenu n'est pas efficace pour prédire la qualité à partir des caractérstiques physiques du vin.