# Wine Quality Prediction using Random Forest

## Importing Libraries

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
#from sklearn.externals.six import StringIO
#from Ipython.display import Image
import pydotplus

In [2]:
data = pd.read_csv('winequality_red.csv')
data.head()                   

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


The data set consists following Input variables : 1 - fixed acidity 2 - volatile acidity 3 - citric acid 4 - residual sugar 5 - chlorides 6 - free sulfur dioxide

7 - total sulfur dioxide 8 - density 9 - pH 10 - sulphates 11 - alcohol

and the Output variable gives the quality of th wine based on the input variables:

12 - quality (score between 0 and 10)

In [4]:
X= data.drop('quality', axis = 1)
y=data['quality']

In [5]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [6]:
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 1)

In [8]:
# DT without any Pre-processing
clf= DecisionTreeClassifier(min_samples_split = 2)
clf.fit(x_train,y_train)

DecisionTreeClassifier()

In [9]:
# Accuracy of the DT
clf.score(x_test,y_test)

0.5979166666666667

In [10]:
# Another DT without any Pre-processing
clf2= DecisionTreeClassifier( criterion = 'entropy',max_depth = 24, min_samples_leaf = 1)
clf2.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=24)

In [11]:
# Accuracy of the DT
clf2.score(x_test,y_test)

0.5875

In [12]:
rand_clf = RandomForestClassifier(random_state = 1)

In [13]:
rand_clf.fit(x_train,y_train)

RandomForestClassifier(random_state=1)

In [14]:
rand_clf.score(x_test,y_test)

0.6875

As we can see the 2 individual decision tree models have both less score than 1 random forest classifier.

So, using random forest classifier has increased the predictve accuracy of the model.

## Hyperparameter Tuning using GridSearchCV

In [15]:
grid_param = {
    'n_estimators': [90,100,115,130],
    'criterion': ['gini','entropy'],
    'max_depth': range(2,20,1),
    'min_samples_leaf': range(1,10,1),
    'min_samples_split':range(2,10,1),
    'max_features': ['auto','log2']
}

In [16]:
grid_search= GridSearchCV(estimator=rand_clf, param_grid = grid_param, cv=5, n_jobs=3, verbose = 3)

In [17]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 20736 candidates, totalling 103680 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    5.7s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   14.6s
[Parallel(n_jobs=3)]: Done 282 tasks      | elapsed:   29.0s
[Parallel(n_jobs=3)]: Done 506 tasks      | elapsed:   48.6s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done 1146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 1562 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done 2042 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 2586 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed:  4.5min
[Parallel(n_jobs=3)]: Done 3866 tasks      | elapsed:  5.5min
[Parallel(n_jobs=3)]: Done 4602 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 5402 tasks      | elapsed:  7.8min
[Parallel(n_jobs=3)]: Done 6266 tasks      | elapsed:  9.1min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 10.5min


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1), n_jobs=3,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 20),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'n_estimators': [90, 100, 115, 130]},
             verbose=3)

In [60]:
# Finding the best parameter
grid_search.best_params_

{'criterion': 'entropy',
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 115}

In [61]:
# Using these parameters to train the model
rand_clf = RandomForestClassifier(criterion= 'entropy',
                                 max_depth = 11,
                                 max_features = 'auto',
                                 min_samples_leaf = 1,
                                 min_samples_split = 2,
                                 n_estimators= 100,
                                 random_state = 1)

In [62]:
rand_clf.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=11, random_state=1)

In [63]:
rand_clf.score(x_train,y_train)

0.9928507596067918

In [53]:
# Let's do some more tweaks
grid_param2 = {
    "n_estimators" : [90,100,115],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf' : [1,2,3,4,5],
    'min_samples_split': [4,5,6,7,8],
    'max_features' : ['auto','log2']
}

In [54]:
grid_search2 = GridSearchCV(estimator=rand_clf,param_grid=grid_param2,cv=5,n_jobs =-1,verbose = 3)

In [55]:
grid_search2.fit(x_train,y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  3.3min finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(criterion='entropy', max_depth=11,
                                              random_state=1),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [4, 5, 6, 7, 8],
                         'n_estimators': [90, 100, 115]},
             verbose=3)

In [56]:
# Finding the best parameter
grid_search2.best_params_

{'criterion': 'entropy',
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 115}

In [57]:
# Using these parameters to train the model
rand_clf2 = RandomForestClassifier(criterion= 'entropy',
                                 max_features = 'auto',
                                 min_samples_leaf = 1,
                                 min_samples_split = 6,
                                 n_estimators= 115,
                                 random_state = 1)

In [58]:
rand_clf2.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=6,
                       n_estimators=115, random_state=1)

In [59]:
rand_clf2.score(x_test,y_test)

0.675

In [64]:
# let's save the model
import pickle

with open('modelForPrediction.sav', 'wb') as f:
    pickle.dump(rand_clf,f)