# Benchmarks

The purpose of this notebook is to get a rough idea of what are reasonable performance goals for the mean absolute error (MAE). Our approach is to train a couple models and use thier MAEs as a reference.
- Random forest classifier
- Logistic regression
- Guessing at random

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [3]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
data = pd.read_csv('data/wine_normal.csv')
#data = pd.read_csv('data/wine_data.csv')

In [4]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,-1.067971,0,5
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,1.761353,1,7
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,-0.580412,0,4
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,-0.092853,0,6
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,-1.053784,1,5


In [5]:
X, y = data.ix[:,:-1], data['quality']

## Random Forest Classifier

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')

In [9]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [10]:
y_pred = clf.predict(X_test)

In [11]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.35
Acc. = 0.687692307692


In [12]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   5,   5,   0,   0],
       [  0,   5,  31,  13,   1,   0],
       [  0,   0, 307, 105,   4,   0],
       [  0,   0,  81, 454,  25,   0],
       [  0,   0,   5, 105, 118,   2],
       [  0,   0,   1,   8,  15,  10]])

In [13]:
y_test.value_counts()

6    560
5    416
7    230
4     50
8     34
3     10
Name: quality, dtype: int64

In [14]:
pd.Series(y_pred).value_counts()

6    690
5    430
7    163
8     12
4      5
dtype: int64

In [15]:
y_train.value_counts()

6    2276
5    1722
7     849
4     166
8     159
3      20
9       5
Name: quality, dtype: int64

## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
penalty = ['l1', 'l2']
C = [0.001*10**i for i in range(6)]

In [18]:
clfs = list()
maes = list()
params = list()
for p in penalty:
    for c in C:
        params.append([p, c])
        clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
        clf.fit(X_train, y_train)
        clfs.append(clf)
        y_pred = clf.predict(X_test)
        print("Results for penalty = {} and C = {}".format(p, c))
        mae = mean_absolute_error(y_test, y_pred)
        maes.append(mae)
        print("MAE =", mae)
        print("Acc. =", accuracy_score(y_test, y_pred))
        print()

Results for penalty = l1 and C = 0.001
MAE = 0.916923076923
Acc. = 0.32

Results for penalty = l1 and C = 0.01
MAE = 0.524615384615
Acc. = 0.542307692308

Results for penalty = l1 and C = 0.1
MAE = 0.510769230769
Acc. = 0.547692307692

Results for penalty = l1 and C = 1.0
MAE = 0.511538461538
Acc. = 0.546923076923

Results for penalty = l1 and C = 10.0
MAE = 0.512307692308
Acc. = 0.546153846154

Results for penalty = l1 and C = 100.0
MAE = 0.512307692308
Acc. = 0.546153846154

Results for penalty = l2 and C = 0.001
MAE = 0.525384615385
Acc. = 0.544615384615

Results for penalty = l2 and C = 0.01
MAE = 0.516153846154
Acc. = 0.544615384615

Results for penalty = l2 and C = 0.1
MAE = 0.509230769231
Acc. = 0.547692307692

Results for penalty = l2 and C = 1.0
MAE = 0.510769230769
Acc. = 0.546153846154

Results for penalty = l2 and C = 10.0
MAE = 0.512307692308
Acc. = 0.546153846154

Results for penalty = l2 and C = 100.0
MAE = 0.512307692308
Acc. = 0.546153846154



In [19]:
arg_min = np.argmin(maes)
p, c = params[arg_min]
print("Lowest MAE ({}) with penalty = {} and C = {}".format(maes[arg_min], p, c))

Lowest MAE (0.5092307692307693) with penalty = l2 and C = 0.1


In [20]:
y_pred = clfs[arg_min].predict(X_test)

In [21]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   6,   4,   0,   0],
       [  0,   0,  32,  17,   1,   0],
       [  0,   0, 276, 139,   1,   0],
       [  0,   0, 128, 426,   6,   0],
       [  0,   0,  10, 210,  10,   0],
       [  0,   0,   1,  28,   5,   0]])

In [22]:
y_test.value_counts()

6    560
5    416
7    230
4     50
8     34
3     10
Name: quality, dtype: int64

In [23]:
pd.Series(y_pred).value_counts()

6    824
5    453
7     23
dtype: int64

In [24]:
y_train.value_counts()

6    2276
5    1722
7     849
4     166
8     159
3      20
9       5
Name: quality, dtype: int64

## Radom Guessing

In [25]:
y_pred = []
for _ in range(len(y_test)):
    # We randomly guess a wine score in [5, 7]
    y_pred.append(np.random.randint(5,8))

In [26]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.931538461538
Acc. = 0.316923076923
