# Classification multi-classe

On cherche à prédire la note d'un vin avec un classifieur multi-classe.

In [1]:
%matplotlib inline

In [2]:
from papierstat.datasets import load_wines_dataset
df = load_wines_dataset()
X = df.drop(['quality', 'color'], axis=1)
y = df['quality']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
from sklearn.linear_model import LogisticRegression
clr = LogisticRegression()
clr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [5]:
import numpy
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

51.87692307692308

On regarde la matrice de confusion.

In [6]:
from sklearn.metrics import confusion_matrix
import pandas
pandas.DataFrame(confusion_matrix(y_test, clr.predict(X_test)))

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,5,3,0,0,0
1,0,0,34,21,0,0,0
2,0,0,320,222,0,0,0
3,0,0,184,512,8,0,0
4,0,0,15,235,11,0,0
5,0,0,2,50,2,0,0
6,0,0,0,1,0,0,0


On l'affiche différemment avec le nom des classes.

In [7]:
conf = confusion_matrix(y_test, clr.predict(X_test))
dfconf = pandas.DataFrame(conf)
dfconf.columns = clr.classes_
dfconf.index = clr.classes_
dfconf

Unnamed: 0,3,4,5,6,7,8,9
3,0,0,5,3,0,0,0
4,0,0,34,21,0,0,0
5,0,0,320,222,0,0,0
6,0,0,184,512,8,0,0
7,0,0,15,235,11,0,0
8,0,0,2,50,2,0,0
9,0,0,0,1,0,0,0


Pas extraordinaire. On applique la stratégie [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html).

In [8]:
from sklearn.multiclass import OneVsRestClassifier
clr = OneVsRestClassifier(LogisticRegression())
clr.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [9]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

51.93846153846153

Le modèle logistique régression multi-classe est équivalent à la stratégie *OneVsRest*. Voyons l'autre.

In [10]:
from sklearn.multiclass import OneVsOneClassifier
clr = OneVsOneClassifier(LogisticRegression())
clr.fit(X_train, y_train)

OneVsOneClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [11]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

52.86153846153846

In [12]:
conf = confusion_matrix(y_test, clr.predict(X_test))
dfconf = pandas.DataFrame(conf)
dfconf.columns = clr.classes_
dfconf.index = clr.classes_
dfconf

Unnamed: 0,3,4,5,6,7,8,9
3,0,0,4,3,0,1,0
4,0,1,34,20,0,0,0
5,0,0,320,222,0,0,0
6,0,0,175,495,34,0,0
7,0,0,18,200,43,0,0
8,0,0,3,40,11,0,0
9,0,0,0,1,0,0,0


A peu près pareil mais sans doute pas de manière significative. Voyons avec un arbre de décision.

In [13]:
from sklearn.tree import DecisionTreeClassifier
clr = DecisionTreeClassifier()
clr.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [14]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

59.692307692307686

Et avec [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) :

In [15]:
clr = OneVsRestClassifier(DecisionTreeClassifier())
clr.fit(X_train, y_train)

OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=1)

In [16]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

51.815384615384616

Et avec [OneVsOneClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsOneClassifier.html)

In [17]:
clr = OneVsOneClassifier(DecisionTreeClassifier())
clr.fit(X_train, y_train)

OneVsOneClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=1)

In [18]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

60.0

Mieux.

In [19]:
from sklearn.ensemble import RandomForestClassifier
clr = RandomForestClassifier()
clr.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

62.58461538461538

In [21]:
clr = OneVsRestClassifier(RandomForestClassifier())
clr.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=1)

In [22]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

64.43076923076923

Proche, il faut affiner avec une validation croisée.

In [23]:
from sklearn.neural_network import MLPClassifier
clr = MLPClassifier(hidden_layer_sizes=30, max_iter=600)
clr.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=30, learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [24]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

46.64615384615385

In [25]:
clr = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=30, max_iter=600))
clr.fit(X_train, y_train)

OneVsRestClassifier(estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=30, learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
          n_jobs=1)

In [26]:
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

50.58461538461538

Pas foudroyant.