# Benchmark: Random Forest Classifier

To get a rough idea of what is a good MAE for this problem we train a random forest classifier. We can use this model as a reference.

In [129]:
import pandas as pd
import numpy as np

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [108]:
from sklearn.metrics import mean_absolute_error, confusion_matrix

In [75]:
# We slightly rename the features
new_names = {"fixed acidity": "fixed_acidity", 
             "volatile acidity": "volatile_acidity",
             "citric acid": "citric",
             "residual sugar": "sugar",
             "free sulfur dioxide": "free_SD",
             "total sulfur dioxide": "total_SD"}

In [76]:
red = pd.read_csv('data/winequality-red.csv', delimiter=";").rename(columns=new_names)
white = pd.read_csv('data/winequality-white.csv', delimiter=";").rename(columns=new_names)

In [77]:
red['type'] = 'red'
white['type'] = 'white'

In [78]:
wine = pd.concat([red, white], ignore_index=True)

In [79]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [80]:
temp = wine['quality']
wine = pd.get_dummies(wine.drop('quality', axis=1))
wine['quality'] = temp

In [81]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,type_white,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1,0,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1,0,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1,0,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1,0,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1,0,5


In [82]:
X, y = wine.ix[:,:-1], wine['quality']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [104]:
clf = RandomForestClassifier(n_estimators=2000)

In [105]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [177]:
y_pred = clf.predict(X_test)

In [178]:
mean_absolute_error(y_test, y_pred)

0.32538461538461538

In [159]:
confusion_matrix(y_test, y_pred)

array([[  0,   1,   2,   1,   0,   0,   0],
       [  0,   9,  26,  12,   0,   0,   0],
       [  0,   1, 324, 113,   0,   0,   0],
       [  0,   0,  64, 441,  38,   0,   0],
       [  0,   0,   5, 101, 121,   1,   0],
       [  0,   0,   0,  12,   8,  18,   0],
       [  0,   0,   0,   1,   1,   0,   0]])

In [116]:
sorted(wine.quality.unique())

[3, 4, 5, 6, 7, 8, 9]

In [120]:
y_train.value_counts()

6    2293
5    1700
7     851
4     169
8     155
3      26
9       3
Name: quality, dtype: int64

In [121]:
y_test.value_counts()

6    543
5    438
7    228
4     47
8     38
3      4
9      2
Name: quality, dtype: int64

In [123]:
# accuracy
sum([yt == yp for yt, yp in zip(y_test, y_pred)])/len(y_test)

0.7023076923076923