In [1]:
'''
- Random Forest with GridSearchCV
- Python Pandas External Dataset Machine Learning Classifiers Grid Search CV Optimizing Parameters

The following dataset contains information about red wine characteristics (acidity, pH, etc.) as well as a quality rating. More information about the schema can be found at the dataset information link below.
Given this, create a Random Forest model to predict wine quality. Additionally, use GridSearchCV (or a tool of your own choice) to find the best parameters for the model.

dataset: https://raw.githubusercontent.com/erood/interviewqs.com_code_snippets/master/Datasets/winequality-red.csv
information about dataset: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
information about gridsearch: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

'''

'\n- Random Forest with GridSearchCV\n- Python Pandas External Dataset Machine Learning Classifers Grid Search CV Optimizing Parameters \n\nThe following dataset has information around red wine characteristics (acidity, pH, etc) as well as a quality rating. More information about the schema can be found here.\nGiven this, create a Random Forest model to predict wine quality. Additionally, use GridSearchCV (or a tool of your own choice) to find the best parameters for the model.\n\ndataset: https://raw.githubusercontent.com/erood/interviewqs.com_code_snippets/master/Datasets/winequality-red.csv\ninformation about dataset: https://archive.ics.uci.edu/ml/datasets/Wine+Quality\ninformation about gridsearch: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html\nreference: https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/\n\nPS: First, I imported the dataset to excel in order to normalize the data and overwritten the

In [2]:
# Importing libraries to use in the project
import pandas as pd
import numpy as np

In [3]:
# Load the red-wine data from a local CSV, parsed as semicolon-delimited (sep=';').
# NOTE(review): the local name 'wine_quality-red.csv' differs from the
# 'winequality-red.csv' URL given in the intro cell - confirm it is the same data.
dataset = pd.read_csv('wine_quality-red.csv', sep=';')
# Preview the first rows to sanity-check that the columns were parsed correctly
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Size of the loaded frame as (rows, columns)
dataset.shape

(1599, 12)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [6]:
# Data type pandas inferred for each column
dataset.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [7]:
# Feature matrix: every column except the prediction target 'quality',
# converted to a plain NumPy array for scikit-learn.
X_data = dataset.drop(columns='quality').values
print(X_data)

[[ 7.4    0.7    0.    ...  3.51   0.56   9.4  ]
 [ 7.8    0.88   0.    ...  3.2    0.68   9.8  ]
 [ 7.8    0.76   0.04  ...  3.26   0.65   9.8  ]
 ...
 [ 6.3    0.51   0.13  ...  3.42   0.75  11.   ]
 [ 5.9    0.645  0.12  ...  3.57   0.71  10.2  ]
 [ 6.     0.31   0.47  ...  3.39   0.66  11.   ]]


In [8]:
# Target vector: the 'quality' rating of every wine, as a NumPy array
y_data = dataset.loc[:, 'quality'].values
print(y_data)

[5 5 5 ... 6 5 6]


In [9]:
# Split features/target into a training partition and a held-out test partition.
# 15% of the rows go to the test set; random_state pins the shuffle so the
# split is the same on every run.

from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test) = train_test_split(
    X_data,
    y_data,
    test_size=0.15,
    random_state=0,
)

# The printed values are array shapes, i.e. (rows, features)
print("number of test samples :", X_test.shape)
print("number of training samples:", X_train.shape)

number of test samples : (240, 11)
number of training samples: (1359, 11)


In [10]:
# The feature columns live on very different scales (compare e.g.
# "total sulfur dioxide" with "density" in the describe() output above), so
# they are standardized here. StandardScaler rescales each column to zero mean
# and unit variance - it does NOT squeeze the values into the [0, 1] range.
#
# NOTE: this cell overwrites X_train / X_test in place, so it is not idempotent.
# Re-run the train/test split cell before re-running this one.
print("BEFORE SCALING:")
print("X_train:")
print(X_train)
print("\nX_test:")
print(X_test)

from sklearn.preprocessing import StandardScaler
dataset_scaler = StandardScaler()
# Learn the per-column mean and standard deviation from the training data only...
X_train = dataset_scaler.fit_transform(X_train)
# ...and apply those SAME statistics to the test data. Calling fit_transform
# here would re-fit the scaler on the test set, so train and test would be
# scaled differently and test-set information would leak into preprocessing.
X_test = dataset_scaler.transform(X_test)
print("\nAFTER SCALING:")
print("X_train:")
print(X_train)
print("\nX_test:")
print(X_test)

BEFORE SCALING:
X_train:
[[7.40e+00 6.10e-01 1.00e-02 ... 3.48e+00 6.50e-01 9.80e+00]
 [6.90e+00 7.65e-01 1.80e-01 ... 3.40e+00 6.00e-01 1.03e+01]
 [8.70e+00 6.90e-01 0.00e+00 ... 3.36e+00 4.50e-01 9.40e+00]
 ...
 [7.90e+00 5.70e-01 3.10e-01 ... 3.29e+00 6.90e-01 9.50e+00]
 [1.30e+01 4.70e-01 4.90e-01 ... 3.30e+00 6.80e-01 1.27e+01]
 [9.80e+00 9.80e-01 3.20e-01 ... 3.25e+00 4.80e-01 9.40e+00]]

X_test:
[[10.8   0.47  0.43 ...  3.17  0.76 10.8 ]
 [ 8.1   0.82  0.   ...  3.36  0.53  9.6 ]
 [ 9.1   0.29  0.33 ...  3.26  0.84 11.7 ]
 ...
 [ 9.2   0.31  0.36 ...  3.33  0.86 12.  ]
 [ 6.8   0.36  0.32 ...  3.36  0.55 12.8 ]
 [10.4   0.52  0.45 ...  3.22  0.76 11.4 ]]

AFTER SCALING:
X_train:
[[-0.53574243  0.43487983 -1.33321092 ...  1.10026409 -0.04612167
  -0.57529378]
 [-0.8253963   1.28248808 -0.46454456 ...  0.58100936 -0.33443511
  -0.10312066]
 [ 0.21735763  0.87235506 -1.38430894 ...  0.321382   -1.19937545
  -0.95303227]
 ...
 [-0.24608856  0.21614222  0.19972972 ... -0.13296589  0.

In [11]:
# Define the baseline Random Forest classifier.
# The model is only instantiated in this cell; it is fitted later, when it is
# passed to cross_val_score and GridSearchCV in the cells below.

from sklearn.ensemble import RandomForestClassifier

# 300 trees as the baseline size; random_state is fixed for reproducible results
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=0)
# (no explicit .fit() call is made on rf_classifier in this cell)


In [12]:
# Baseline estimate: 5-fold cross-validation of the untuned forest on the
# training split. The result holds one score per fold.
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print(accuracy)

[0.66544118 0.63970588 0.66911765 0.70220588 0.70110701]


In [13]:
# Summarize the per-fold scores: mean and spread, both expressed in percent

mean_pct = accuracy.mean() * 100
std_pct = accuracy.std() * 100

print(f'Accuracy mean = {mean_pct:.2f}%')
print(f'Standard deviation = {std_pct:.2f}%')


Accuracy mean = 67.55%
Standard deviation = 2.36%


In [14]:
# Exhaustive hyper-parameter search for the random forest with GridSearchCV

from sklearn.model_selection import GridSearchCV

# 6 x 2 x 2 = 24 candidate settings, each evaluated with 5-fold CV
# (120 model fits, with forests of up to 1000 trees).
grid_parameters = {
    'n_estimators': [100, 200, 400, 600, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# n_jobs=-1 spreads the independent fits across all available CPU cores.
# The estimator's random_state is fixed, so the scores and the selected
# parameters are the same as in a serial run - the search just finishes sooner.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=grid_parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [15]:
# Run the grid search on the TRAINING set (the held-out test set is not touched here)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [16]:
# Show the winning parameter combination and the best cross-validation score
# (accuracy, as configured via scoring='accuracy') found by the grid search.

best_params = grid_search.best_params_
best_score_pct = grid_search.best_score_ * 100

print(f'Best parameter: {best_params}')
print(f'Best score: {best_score_pct:.2f}%')

Best parameter: {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 1000}
Best score: 67.85%
