# Benchmarks

To get a rough idea of what is a good MAE for this problem we train a random forest classifier. We can use this model as a reference. In addition we also train a simple logistic regression as a performance reference for our attempts.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [3]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
data = pd.read_csv('data/wine_normal.csv')
#data = pd.read_csv('data/wine_data.csv')

In [4]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,-1.067971,0,5
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,1.761353,1,7
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,-0.580412,0,4
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,-0.092853,0,6
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,-1.053784,1,5


In [5]:
X, y = data.ix[:,:-1], data['quality']

## Random Forest Classifier

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')

In [9]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [10]:
y_pred = clf.predict(X_test)

In [49]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.433846153846
Acc. = 0.623076923077


In [12]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   1,   0,   0,   0,   0],
       [  0,   7,  15,  17,   0,   0,   0],
       [  0,   1, 307, 118,   1,   0,   0],
       [  0,   0,  75, 473,  19,   0,   0],
       [  0,   0,   3, 107, 115,   1,   0],
       [  0,   0,   0,  13,   9,  17,   0],
       [  0,   0,   0,   0,   1,   0,   0]])

In [13]:
y_test.value_counts()

6    567
5    427
7    226
8     39
4     39
9      1
3      1
Name: quality, dtype: int64

In [14]:
pd.Series(y_pred).value_counts()

6    728
5    401
7    145
8     18
4      8
dtype: int64

In [15]:
y_train.value_counts()

6    2269
5    1711
7     853
4     177
8     154
3      29
9       4
Name: quality, dtype: int64

## Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
penalty = ['l1', 'l2']
C = [0.001*10**i for i in range(6)]

In [36]:
clfs = list()
maes = list()
params = list()
for p in penalty:
    for c in C:
        params.append([p, c])
        clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
        clf.fit(X_train, y_train)
        clfs.append(clf)
        y_pred = clf.predict(X_test)
        print("Results for penalty = {} and C = {}".format(p, c))
        mae = mean_absolute_error(y_test, y_pred)
        maes.append(mae)
        print("MAE =", mae)
        print("Acc. =", accuracy_score(y_test, y_pred))
        print()

Results for penalty = l1 and C = 0.001
MAE = 0.913076923077
Acc. = 0.321538461538

Results for penalty = l1 and C = 0.01
MAE = 0.526923076923
Acc. = 0.534615384615

Results for penalty = l1 and C = 0.1
MAE = 0.516923076923
Acc. = 0.541538461538

Results for penalty = l1 and C = 1.0
MAE = 0.519230769231
Acc. = 0.54

Results for penalty = l1 and C = 10.0
MAE = 0.520769230769
Acc. = 0.538461538462

Results for penalty = l1 and C = 100.0
MAE = 0.520769230769
Acc. = 0.538461538462

Results for penalty = l2 and C = 0.001
MAE = 0.542307692308
Acc. = 0.524615384615

Results for penalty = l2 and C = 0.01
MAE = 0.529230769231
Acc. = 0.530769230769

Results for penalty = l2 and C = 0.1
MAE = 0.516923076923
Acc. = 0.540769230769

Results for penalty = l2 and C = 1.0
MAE = 0.520769230769
Acc. = 0.538461538462

Results for penalty = l2 and C = 10.0
MAE = 0.520769230769
Acc. = 0.538461538462

Results for penalty = l2 and C = 100.0
MAE = 0.520769230769
Acc. = 0.538461538462



In [37]:
arg_min = np.argmin(maes)
p, c = params[arg_min]
print("Lowest MAE ({}) with penalty = {} and C = {}".format(maes[arg_min], p, c))

Lowest MAE (0.5169230769230769) with penalty = l1 and C = 0.1


In [38]:
y_pred = clfs[arg_min].predict(X_test)

In [39]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   5,   2,   0,   0],
       [  0,   0,  27,  12,   2,   0],
       [  0,   0, 279, 139,   0,   0],
       [  0,   0, 158, 406,  15,   0],
       [  0,   0,   9, 184,  19,   0],
       [  0,   0,   3,  36,   4,   0]])

In [40]:
y_test.value_counts()

6    579
5    418
7    212
8     43
4     41
3      7
Name: quality, dtype: int64

In [41]:
pd.Series(y_pred).value_counts()

6    779
5    481
7     40
dtype: int64

In [42]:
y_train.value_counts()

6    2257
5    1720
7     867
4     175
8     150
3      23
9       5
Name: quality, dtype: int64

In [43]:
data.quality.value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

## Radom Guessing

In [82]:
y_pred = []
for _ in range(len(y_test)):
    y_pred.append(np.random.randint(5,8))

In [83]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.94
Acc. = 0.293846153846
