# Benchmark: Random Forest Classifier

To get a rough idea of what is a good MAE for this problem we train a random forest classifier. We can use this model as a reference.

In [7]:
import pandas as pd
import numpy as np

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, confusion_matrix

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# We slightly rename the features
new_names = {"fixed acidity": "fixed_acidity", 
             "volatile acidity": "volatile_acidity",
             "citric acid": "citric",
             "residual sugar": "sugar",
             "free sulfur dioxide": "free_SD",
             "total sulfur dioxide": "total_SD"}

In [11]:
red = pd.read_csv('data/winequality-red.csv', delimiter=";").rename(columns=new_names)
white = pd.read_csv('data/winequality-white.csv', delimiter=";").rename(columns=new_names)

In [12]:
red['type'] = 'red'
white['type'] = 'white'

In [13]:
wine = pd.concat([red, white], ignore_index=True).sample(frac=1).reset_index(drop=True)
#wine = white
#wine = red

In [14]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,quality,type
0,7.6,0.3,0.42,2.0,0.052,6.0,24.0,0.9963,3.44,0.82,11.9,6,red
1,6.5,0.25,0.27,17.4,0.064,29.0,140.0,0.99776,3.2,0.49,10.1,6,white
2,6.8,0.22,0.35,5.5,0.043,21.0,114.0,0.9938,3.3,0.53,10.7,7,white
3,9.2,0.28,0.46,3.2,0.058,39.0,133.0,0.996,3.14,0.58,9.5,5,white
4,8.5,0.32,0.42,2.3,0.075,12.0,19.0,0.99434,3.14,0.71,11.8,7,red


In [15]:
temp = wine['quality']
wine = pd.get_dummies(wine.drop('quality', axis=1))
wine['quality'] = temp

In [16]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,type_white,quality
0,7.6,0.3,0.42,2.0,0.052,6.0,24.0,0.9963,3.44,0.82,11.9,1,0,6
1,6.5,0.25,0.27,17.4,0.064,29.0,140.0,0.99776,3.2,0.49,10.1,0,1,6
2,6.8,0.22,0.35,5.5,0.043,21.0,114.0,0.9938,3.3,0.53,10.7,0,1,7
3,9.2,0.28,0.46,3.2,0.058,39.0,133.0,0.996,3.14,0.58,9.5,0,1,5
4,8.5,0.32,0.42,2.3,0.075,12.0,19.0,0.99434,3.14,0.71,11.8,1,0,7


In [17]:
X, y = wine.ix[:,:-1], wine['quality']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [22]:
clf = RandomForestClassifier(n_estimators=1000,
                             n_jobs=-1)

In [23]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [24]:
y_pred = clf.predict(X_test)

In [25]:
mean_absolute_error(y_test, y_pred)

0.36153846153846153

In [17]:
confusion_matrix(y_test, y_pred)

array([[  0,   1,   4,   3,   0,   0,   0],
       [  0,   6,  25,  13,   0,   0,   0],
       [  0,   2, 333, 101,   4,   0,   0],
       [  0,   0,  76, 446,  35,   0,   0],
       [  0,   0,   2,  89, 116,   1,   0],
       [  0,   0,   0,  15,  18,   9,   0],
       [  0,   0,   0,   1,   0,   0,   0]])

In [18]:
sorted(wine.quality.unique())

[3, 4, 5, 6, 7, 8, 9]

In [19]:
y_train.value_counts()

6    2279
5    1698
7     871
4     172
8     151
3      22
9       4
Name: quality, dtype: int64

In [20]:
y_test.value_counts()

6    557
5    440
7    208
4     44
8     42
3      8
9      1
Name: quality, dtype: int64

In [21]:
# accuracy
sum([yt == yp for yt, yp in zip(y_test, y_pred)])/len(y_test)

0.69999999999999996

In [22]:
y_prob = clf.predict_proba(X_test)

In [23]:
y_prob

array([[ 0.   ,  0.005,  0.129, ...,  0.068,  0.002,  0.   ],
       [ 0.002,  0.004,  0.046, ...,  0.431,  0.333,  0.001],
       [ 0.   ,  0.011,  0.14 , ...,  0.205,  0.054,  0.003],
       ..., 
       [ 0.   ,  0.011,  0.355, ...,  0.087,  0.007,  0.   ],
       [ 0.   ,  0.004,  0.065, ...,  0.477,  0.036,  0.004],
       [ 0.   ,  0.013,  0.131, ...,  0.423,  0.021,  0.   ]])

In [24]:
# We make predictions averaging the two highest probabilities
def weighted_avg(a,b):
    total = a[1] + b[1]
    num = (a[0] + 3)*a[1] + (b[0] + 3)*b[1]
    return num/total

y_pred2 = []
for y in y_prob:
    a, b, *_ = sorted(list(enumerate(y)), key=lambda x: x[1])[::-1]
    y_pred2.append(weighted_avg(a,b))
y_pred2 = np.array(y_pred2)

In [25]:
y_pred2

array([ 5.86054054,  7.43586387,  6.25883838, ...,  5.60335196,
        6.53535354,  6.50658683])

In [26]:
mean_absolute_error(y_test, y_pred2)

0.44256257717299186

# XGBoost