# Benchmarks

The purpose of this notebook is to get a rough idea of what are reasonable performance goals for the mean absolute error (MAE). Our approach is to train a couple models and use thier MAEs as a reference.
- Random forest classifier
- Logistic regression
- Guessing at random

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [71]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
data = pd.read_csv('data/wine_normal.csv')
#data = pd.read_csv('data/wine_data.csv')

In [72]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,-1.067971,0,5
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,1.761353,1,7
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,-0.580412,0,4
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,-0.092853,0,6
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,-1.053784,1,5


In [73]:
# Optionally generate interaction features x*y and x**2
if False:
    from itertools import combinations
    
    old_features = [name for name in data.columns if not 'type' in name and name != "quality"]
    
    for feature1, feature2 in combinations(old_features, 2):
        temp = data[feature1] * data[feature2]
        data.insert(data.shape[1]-1, feature1 + 'x' + feature2, temp)

    for name in old_features:
         data.insert(data.shape[1]-1, name + '**2', data[name]*data[name])   

In [74]:
X, y = data.ix[:,:-1], data['quality']

In [75]:
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,...,volatile_acidity**2,citric**2,sugar**2,chlorides**2,free_SD**2,total_SD**2,density**2,pH**2,sulphates**2,alcohol**2
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,...,4.476701,0.375834,0.170025,0.145477,1.052125,0.947189,0.06165,1.293457,0.068396,1.140562
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,...,2.110146,1.659893,0.163694,0.153344,0.162371,0.025454,0.436327,3.033336,1.870869,3.102365
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,...,2.711808,0.375834,0.154154,1.316835,1.783373,2.533147,8.4e-05,0.292982,0.069807,0.336878
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,...,0.302943,0.079822,0.749619,0.188108,0.334911,0.186654,0.441524,0.117377,0.030242,0.008622
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,...,0.340055,4.40875,0.8069,31.251051,0.239736,0.809677,0.085952,2.240944,14.788418,1.11046


## Random Forest Classifier

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')

In [79]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [80]:
y_pred = clf.predict(X_test)

In [81]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.366923076923
Acc. = 0.685384615385


In [82]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   6,   5,   0,   0],
       [  0,   7,  30,  20,   0,   0],
       [  0,   2, 301, 126,   2,   0],
       [  0,   0,  72, 460,  29,   0],
       [  0,   0,   8,  79, 110,   0],
       [  0,   0,   1,  20,   9,  13]])

In [83]:
y_test.value_counts()

6    561
5    431
7    197
4     57
8     43
3     11
Name: quality, dtype: int64

In [84]:
pd.Series(y_pred).value_counts()

6    710
5    418
7    150
8     13
4      9
dtype: int64

In [85]:
y_train.value_counts()

6    2275
5    1707
7     882
4     159
8     150
3      19
9       5
Name: quality, dtype: int64

## Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression

In [87]:
penalty = ['l1', 'l2']
C = [0.001*10**i for i in range(6)]

In [88]:
clfs = list()
maes = list()
params = list()
for p in penalty:
    for c in C:
        params.append([p, c])
        clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
        clf.fit(X_train, y_train)
        clfs.append(clf)
        y_pred = clf.predict(X_test)
        print("Results for penalty = {} and C = {}".format(p, c))
        mae = mean_absolute_error(y_test, y_pred)
        maes.append(mae)
        print("MAE =", mae)
        print("Acc. =", accuracy_score(y_test, y_pred))
        print()

Results for penalty = l1 and C = 0.001
MAE = 0.662307692308
Acc. = 0.431538461538

Results for penalty = l1 and C = 0.01
MAE = 0.539230769231
Acc. = 0.530769230769

Results for penalty = l1 and C = 0.1
MAE = 0.510769230769
Acc. = 0.55

Results for penalty = l1 and C = 1.0
MAE = 0.514615384615
Acc. = 0.547692307692

Results for penalty = l1 and C = 10.0
MAE = 0.523846153846
Acc. = 0.543076923077

Results for penalty = l1 and C = 100.0
MAE = 0.521538461538
Acc. = 0.545384615385

Results for penalty = l2 and C = 0.001
MAE = 0.536923076923
Acc. = 0.536923076923

Results for penalty = l2 and C = 0.01
MAE = 0.520769230769
Acc. = 0.545384615385

Results for penalty = l2 and C = 0.1
MAE = 0.513076923077
Acc. = 0.548461538462

Results for penalty = l2 and C = 1.0
MAE = 0.519230769231
Acc. = 0.543846153846

Results for penalty = l2 and C = 10.0
MAE = 0.52
Acc. = 0.543846153846

Results for penalty = l2 and C = 100.0
MAE = 0.525384615385
Acc. = 0.543846153846



In [89]:
arg_min = np.argmin(maes)
p, c = params[arg_min]
print("Lowest MAE ({}) with penalty = {} and C = {}".format(maes[arg_min], p, c))

Lowest MAE (0.5107692307692308) with penalty = l1 and C = 0.1


In [90]:
y_pred = clfs[arg_min].predict(X_test)

In [91]:
confusion_matrix(y_test, y_pred)

array([[  0,   1,   5,   5,   0,   0],
       [  0,   6,  32,  17,   2,   0],
       [  0,   2, 267, 156,   6,   0],
       [  0,   0, 134, 390,  37,   0],
       [  0,   0,  11, 134,  52,   0],
       [  0,   0,   1,  24,  18,   0]])

In [92]:
y_test.value_counts()

6    561
5    431
7    197
4     57
8     43
3     11
Name: quality, dtype: int64

In [93]:
pd.Series(y_pred).value_counts()

6    726
5    450
7    115
4      9
dtype: int64

In [94]:
y_train.value_counts()

6    2275
5    1707
7     882
4     159
8     150
3      19
9       5
Name: quality, dtype: int64

## Radom Guessing

In [95]:
y_pred = []
for _ in range(len(y_test)):
    # We randomly guess a wine score in [5, 7]
    y_pred.append(np.random.randint(5,8))

In [96]:
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.954615384615
Acc. = 0.300769230769
