## Using Random Forest

In [18]:
import pandas as pd
data = pd.read_csv('wine_quality_class.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality rate,Quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,good
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,good
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,good
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,good
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,good


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality rate            4898 non-null int64
Quality                 4898 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 497.5+ KB


In [20]:
from collections import Counter
Counter(data.Quality)

Counter({'good': 3078, 'Bad': 1640, 'premium': 180})

In [21]:
data_corr = data.corr()
data_corr.iloc[:,-1].sort_values()

density                -0.307123
chlorides              -0.209934
volatile acidity       -0.194723
total sulfur dioxide   -0.174737
fixed acidity          -0.113663
residual sugar         -0.097577
citric acid            -0.009209
free sulfur dioxide     0.008158
sulphates               0.053678
pH                      0.099427
alcohol                 0.435575
quality rate            1.000000
Name: quality rate, dtype: float64

In [37]:
X = data.iloc[:,:-2]
y = data.Quality

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3428, 11)
(1470, 11)
(3428,)
(1470,)


In [32]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=14, random_state=7)
model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [36]:
y_predict = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_predict))

print(classification_report(y_test,y_predict))
pd.crosstab(y_test, y_predict)

0.7761904761904762
             precision    recall  f1-score   support

        Bad       0.72      0.69      0.70       490
       good       0.80      0.86      0.83       919
    premium       0.94      0.26      0.41        61

avg / total       0.78      0.78      0.77      1470



col_0,Bad,good,premium
Quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bad,338,151,1
good,132,787,0
premium,0,45,16


In [31]:
from sklearn.model_selection import GridSearchCV 

parameters = { 'max_depth':range(1,20),
               'random_state': range(1,20),
               'criterion':['gini']
              }
grid = GridSearchCV(RandomForestClassifier(), parameters)

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'gini', 'max_depth': 14, 'random_state': 7}
0.7730455075845974
