In [16]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

In [17]:
df = pd.read_csv('winequality-white.csv',sep=';',quotechar='"')

In [18]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [19]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [20]:
df['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [21]:
df['quality'].describe()

count    4898.000000
mean        5.877909
std         0.885639
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

In [22]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

## Data Processing and Preparation

In [23]:
def isTasty(quality):
    if quality >= 7:
        return 1
    else:
        return 0

In [24]:
df['tasty'] = df['quality'].apply(isTasty)

In [25]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'tasty'],
      dtype='object')

In [26]:
df['tasty'].value_counts()

0    3838
1    1060
Name: tasty, dtype: int64

In [27]:
data = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']]
target = df['tasty']

In [28]:
data_train, data_test, target_train, target_test = train_test_split(data,target,test_size = 0.33,random_state=123)

In [29]:
[subset.shape for subset in [data_train,data_test,target_train,target_test]]

[(3281, 11), (1617, 11), (3281,), (1617,)]

## Training Our Classifiers

In [30]:
simpleTree = DecisionTreeClassifier(max_depth=5)

In [31]:
simpleTree.fit(data_train,target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
gbmTree = GradientBoostingClassifier(max_depth=5)

In [33]:
gbmTree.fit(data_train,target_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [34]:
rfTree = RandomForestClassifier(max_depth=5)

In [35]:
rfTree.fit(data_train,target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Evaluating Classifier Performance

In [36]:
simpleTreePerformance = precision_recall_fscore_support(target_test,simpleTree.predict(data_test))

In [37]:
gbmTreePerformance = precision_recall_fscore_support(target_test,gbmTree.predict(data_test))

In [38]:
rfTreePerformance = precision_recall_fscore_support(target_test,rfTree.predict(data_test))

In [39]:
simpleTreePerformance

(array([0.86769006, 0.56626506]),
 array([0.91660232, 0.4378882 ]),
 array([0.89147578, 0.4938704 ]),
 array([1295,  322], dtype=int64))

In [40]:
gbmTreePerformance

(array([0.89831748, 0.732     ]),
 array([0.94826255, 0.56832298]),
 array([0.92261458, 0.63986014]),
 array([1295,  322], dtype=int64))

In [41]:
rfTreePerformance

(array([0.84304318, 0.58860759]),
 array([0.94980695, 0.28881988]),
 array([0.89324619, 0.3875    ]),
 array([1295,  322], dtype=int64))

With each of these metrics, we're looking for a value as close to one (1) as possible. We can see that the Gradient Boosted (GBM) tree generally out-performs the others in correctly classifying tasty wines. The GBM tree also achieved a higher recall for the positive class than the other classifiers. This all being the case, it is clear that the GBM tree classifier was the strongest performer in the cohort.

In [42]:
print('Precision, Recall, Fscore, and Support for each class in simple, gradient boosted, and random forest tree classifiers:'+'\n')
for treeMethod in [simpleTreePerformance,gbmTreePerformance,rfTreePerformance]:
    print('Precision: ',treeMethod[0])
    print('Recall: ',treeMethod[1])
    print('Fscore: ',treeMethod[2])
    print('Support: ',treeMethod[3],'\n')

Precision, Recall, Fscore, and Support for each class in simple, gradient boosted, and random forest tree classifiers:

Precision:  [0.86769006 0.56626506]
Recall:  [0.91660232 0.4378882 ]
Fscore:  [0.89147578 0.4938704 ]
Support:  [1295  322] 

Precision:  [0.89831748 0.732     ]
Recall:  [0.94826255 0.56832298]
Fscore:  [0.92261458 0.63986014]
Support:  [1295  322] 

Precision:  [0.84304318 0.58860759]
Recall:  [0.94980695 0.28881988]
Fscore:  [0.89324619 0.3875    ]
Support:  [1295  322] 



In [43]:
print('Confusion Matrix for simple, gradient boosted, and random forest tree classifiers:')
print('Simple Tree:\n',confusion_matrix(target_test,simpleTree.predict(data_test)),'\n')
print('Gradient Boosted:\n',confusion_matrix(target_test,gbmTree.predict(data_test)),'\n')
print('Random Forest:\n',confusion_matrix(target_test,rfTree.predict(data_test)))

Confusion Matrix for simple, gradient boosted, and random forest tree classifiers:
Simple Tree:
 [[1187  108]
 [ 181  141]] 

Gradient Boosted:
 [[1228   67]
 [ 139  183]] 

Random Forest:
 [[1230   65]
 [ 229   93]]


So, now that we know that the GBM tree is our favored classifier for predicting the tastiness of wines, that begs the question: "what makes a tasty wine?".

In [44]:
gbmTree.feature_importances_

array([0.06292317, 0.09975023, 0.06415391, 0.07694603, 0.08524038,
       0.0844569 , 0.09412271, 0.13000576, 0.08332232, 0.08312791,
       0.13595068])

In [45]:
print('Feature Importances for GBM tree\n')
for importance,feature in zip(gbmTree.feature_importances_,['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']):
    print('{}: {}'.format(feature,importance))

Feature Importances for GBM tree

fixed acidity: 0.06292317043932964
volatile acidity: 0.09975023431152459
citric acid: 0.06415391136861659
residual sugar: 0.07694602680200366
chlorides: 0.08524038133639728
free sulfur dioxide: 0.08445689822967423
total sulfur dioxide: 0.09412271396742863
density: 0.13000575707661322
pH: 0.0833223179566319
sulphates: 0.083127905754266
alcohol: 0.13595068275751415
