# Benchmark

To get a rough idea of what is a good MAE for this problem we train a random forest classifier. We can use this model as a reference. In addition we also train a simple logistic regression as a performance reference for our attempts.

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [7]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
#data = pd.read_csv('data/wine_normal.csv')
data = pd.read_csv('data/wine_data.csv')

In [26]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,6.3,0.27,0.25,5.8,0.038,52.0,155.0,0.995,3.28,0.38,9.4,0,6
1,7.0,0.31,0.26,7.4,0.069,28.0,160.0,0.9954,3.13,0.46,9.8,0,6
2,7.6,0.38,0.2,3.4,0.046,9.0,116.0,0.9944,3.15,0.41,9.4,0,5
3,8.5,0.28,0.35,1.7,0.061,6.0,15.0,0.99524,3.3,0.74,11.8,1,7
4,8.0,0.725,0.24,2.8,0.083,10.0,62.0,0.99685,3.35,0.56,10.0,1,6


In [29]:
X, y = data.ix[:,:-1], data['quality']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8)

## Random Forest Classifier

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
clf = RandomForestClassifier(n_estimators=1000,
                             max_depth=None,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto',
                             n_jobs=-1)


In [32]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [33]:
y_pred = clf.predict(X_test)

In [34]:
print("MAE = {}\nAcc. = {}".format(mean_absolute_error(y_test, y_pred),
                                   accuracy_score(y_test, y_pred)))

MAE = 0.3707692307692308
Acc. = 0.6753846153846154


In [35]:
confusion_matrix(y_test, y_pred)

array([[  0,   1,   0,   5,   0,   0,   0],
       [  0,   5,  15,  15,   2,   0,   0],
       [  0,   0, 305, 123,   3,   0,   0],
       [  0,   0,  77, 435,  29,   1,   0],
       [  0,   0,   6, 108, 114,   1,   0],
       [  0,   0,   0,  19,  16,  19,   0],
       [  0,   0,   0,   1,   0,   0,   0]])

In [36]:
y_test.value_counts()

6    542
5    431
7    229
8     54
4     37
3      6
9      1
Name: quality, dtype: int64

In [37]:
pd.Series(y_pred).value_counts()

6    706
5    403
7    164
8     21
4      6
dtype: int64

In [38]:
y_train.value_counts()

6    2294
5    1707
7     850
4     179
8     139
3      24
9       4
Name: quality, dtype: int64

## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [39]:
penalty = ['l1', 'l2']
C = [0.001*10**i for i in range(6)]

In [40]:
clfs = list()
maes = list()
params = list()
for p in penalty:
    for c in C:
        params.append([p, c])
        clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
        clf.fit(X_train, y_train)
        clfs.append(clf)
        y_pred = clf.predict(X_test)
        print("Results for penalty = {} and C = {}".format(p, c))
        mae = mean_absolute_error(y_test, y_pred)
        maes.append(mae)
        print("MAE =", mae)
        print("Acc. =", accuracy_score(y_test, y_pred))
        print()

Results for penalty = l1 and C = 0.001
MAE = 0.659230769231
Acc. = 0.421538461538

Results for penalty = l1 and C = 0.01
MAE = 0.616923076923
Acc. = 0.467692307692

Results for penalty = l1 and C = 0.1
MAE = 0.546923076923
Acc. = 0.525384615385

Results for penalty = l1 and C = 1.0
MAE = 0.546153846154
Acc. = 0.524615384615

Results for penalty = l1 and C = 10.0
MAE = 0.547692307692
Acc. = 0.522307692308

Results for penalty = l1 and C = 100.0
MAE = 0.549230769231
Acc. = 0.522307692308

Results for penalty = l2 and C = 0.001
MAE = 0.625384615385
Acc. = 0.46

Results for penalty = l2 and C = 0.01
MAE = 0.58
Acc. = 0.499230769231

Results for penalty = l2 and C = 0.1
MAE = 0.549230769231
Acc. = 0.523076923077

Results for penalty = l2 and C = 1.0
MAE = 0.545384615385
Acc. = 0.523846153846

Results for penalty = l2 and C = 10.0
MAE = 0.545384615385
Acc. = 0.524615384615

Results for penalty = l2 and C = 100.0
MAE = 0.548461538462
Acc. = 0.522307692308



In [41]:
arg_min = np.argmin(maes)
p, c = params[arg_min]
print("Lowest MAE ({}) with penalty = {} and C = {}".format(maes[arg_min], p, c))

Lowest MAE (0.5453846153846154) with penalty = l2 and C = 1.0


In [42]:
y_pred = clfs[arg_min].predict(X_test)

In [43]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   1,   5,   0,   0,   0],
       [  0,   0,  20,  16,   1,   0,   0],
       [  0,   0, 251, 180,   0,   0,   0],
       [  0,   0, 120, 417,   5,   0,   0],
       [  0,   0,   7, 209,  13,   0,   0],
       [  0,   0,   3,  46,   5,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0]])