# Red Wine Quality
## Determine Red Wine Quality Using Decision Tree

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

### 1. Load and Analyze Data

In [6]:
wine = pd.read_csv('Data/redwines.csv')
print(wine.shape)
print(wine.describe())
print(wine.sample(5))

(1599, 12)
       fixed_acidity  volatile_acidity  citric_acid  residual_sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free_sulfur_dioxide  total_sulfur_dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.0000

In [4]:
# Print missing values
print(wine.isnull().sum()/len(wine)*100)
print(wine.isna().any())

fixed_acidity           0.0
volatile_acidity        0.0
citric_acid             0.0
residual_sugar          0.0
chlorides               0.0
free_sulfur_dioxide     0.0
total_sulfur_dioxide    0.0
density                 0.0
pH                      0.0
sulphates               0.0
alcohol                 0.0
quality                 0.0
dtype: float64
fixed_acidity           False
volatile_acidity        False
citric_acid             False
residual_sugar          False
chlorides               False
free_sulfur_dioxide     False
total_sulfur_dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool


In [39]:
X = wine.iloc[:,0:11].values
y = wine.iloc[:,-1].values
X.shape

(1599, 11)

### 2. Build the Model and Check Accuracy

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion = 'entropy')
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)


In [46]:
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

0.61
[[  0   1   1   0   0   0]
 [  0   1   5   4   1   0]
 [  0   7 124  40   5   0]
 [  0   3  34  92  26   2]
 [  0   0   7  13  27   2]
 [  0   0   0   2   3   0]]


In [50]:
print(classification_report(y_test, y_pred))
print(dt.feature_importances_)
print(y_test.shape, y_pred.shape)
print(np.corrcoef(y_test,y_pred))
from scipy.stats.stats import pearsonr 
print(pearsonr(y_test,y_pred))

             precision    recall  f1-score   support

          3       0.00      0.00      0.00         2
          4       0.08      0.09      0.09        11
          5       0.73      0.70      0.71       176
          6       0.61      0.59      0.60       157
          7       0.44      0.55      0.49        49
          8       0.00      0.00      0.00         5

avg / total       0.61      0.61      0.61       400

[0.07822017 0.13082633 0.05159864 0.07508773 0.06806675 0.05125177
 0.1078468  0.05609966 0.0879008  0.12612063 0.16698073]
(400,) (400,)
[[1.         0.54136418]
 [0.54136418 1.        ]]
(0.541364179782519, 7.779090059053218e-32)


  'precision', 'predicted', average, warn_for)


In [43]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion = 'entropy', n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

0.685
[[  0   1   1   1   0   0]
 [  0   0   8   7   0   0]
 [  0   0 135  30   1   0]
 [  0   1  38 121   7   0]
 [  0   0   5  19  18   0]
 [  0   0   0   4   3   0]]


In [44]:
print(classification_report(y_test, y_pred))
print(rf.feature_importances_)
print(y_test.shape, y_pred.shape)
print(np.corrcoef(y_test,y_pred))

             precision    recall  f1-score   support

          3       0.00      0.00      0.00         3
          4       0.00      0.00      0.00        15
          5       0.72      0.81      0.76       166
          6       0.66      0.72      0.69       167
          7       0.62      0.43      0.51        42
          8       0.00      0.00      0.00         7

avg / total       0.64      0.69      0.66       400

[0.0712792  0.10786207 0.07335869 0.0668665  0.08127898 0.06651371
 0.1044953  0.08832767 0.07040482 0.11737421 0.15223884]
(400,) (400,)
[[1.         0.57159266]
 [0.57159266 1.        ]]


  'precision', 'predicted', average, warn_for)
