Task #2: Wine Quality Prediction using a Random Forest Classifier (RFC)

Task Details

Predict the Quality Of the Wine.

Expected Submission

Predict the Quality Of the Wine.

A Tableau representation of the wine quality predictions obtained with Random Forest Classification.

In [None]:
#main libraries and graphics
import os
import numpy as np
import pandas as pd

#ML libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

#metrics
from sklearn.metrics import roc_auc_score

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

------------------------------------------------ Reading data ----------------------------------------------------

In [None]:
# Load the red wine data set from its CSV file into a DataFrame.
RED_WINE_CSV = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"

with open(RED_WINE_CSV) as csv_handle:
    red_wine_data = pd.read_csv(csv_handle, sep=',')

# Print the structure of the frame: every column, with its non-null count.
red_wine_data.info(show_counts=True, verbose=True)

# Preview the first rows (last expression, so it is rendered as the cell output).
red_wine_data.head()

--------------------------------- Checking the balance of the classification -------------------------------------

In [None]:
# Histogram of the 'quality' column, used to judge how balanced the classes are.
red_wine_data.hist(column='quality')

The histogram shows that the quality classes are unbalanced, so plain accuracy would not reflect the model's true performance. Instead, we evaluate the model with the area under the receiver operating characteristic curve (ROC AUC), computed one-vs-rest across the quality classes.

------------------------------------------------ Data preparation ---------------------------------------------------

In [None]:
# Separate the target from the features.
# The target is read without `.pop()` so that `red_wine_data` is not mutated:
# this keeps the cell idempotent (it can be re-run without a KeyError) and
# leaves the frame displayed by earlier cells intact.
target = np.array(red_wine_data['quality'])

# Feature matrix: drop the target itself plus the features excluded from the model.
red_wine_data_cutted = red_wine_data.drop(
    ['quality', 'citric acid', 'density', 'pH', 'total sulfur dioxide'], axis = 1)

In [None]:
# Hold out 20% of the rows as the test sample; rows are shuffled with a fixed
# seed so the split is the same on every run.
split_options = dict(test_size=0.2, shuffle=True, random_state=1)
train_X, test_X, train_y, test_y = train_test_split(
    red_wine_data_cutted,
    target,
    **split_options,
)

In [None]:
# Scale the features: the scaler is fitted on the training sample only and the
# same fitted transformation is then applied to both samples.
scaler = StandardScaler().fit(train_X)
train_X_scaled, test_X_scaled = (scaler.transform(sample) for sample in (train_X, test_X))

In [None]:
##### NOTE: using PCA led to a deterioration in the metric, so the PCA experiment below is disabled.


# PCA test (n_components=7, to see the explained variance of all generated components).
# The experiment is wrapped in a triple-quoted string so that it does not execute;
# it builds a table of the per-component and cumulative explained variance ratios.
# NOTE(review): as the last expression of the cell, this string is presumably echoed
# as the cell output -- converting it to '#' comments would avoid that; confirm.
"""
pca_test = PCA(n_components = 7)
pca_test.fit(train_X_scaled)
evr = pca_test.explained_variance_ratio_
cvr = np.cumsum(pca_test.explained_variance_ratio_)
pca_df = pd.DataFrame()
pca_df['Cumulative Variance Ratio'] = cvr
pca_df['Explained Variance Ratio'] = evr
display(pca_df.head(7))
"""

In [None]:
# The number of principal components was selected to maximise the 'roc_auc_score'
# metric; the best value turned out to be 5.
# Disabled experiment: the code is wrapped in a triple-quoted string so that it does
# not execute. It would fit PCA on the scaled training sample and project both the
# training and test samples onto the 5 components.
"""
pca = PCA(n_components = 5)
pca.fit(train_X_scaled)
train_X_scaled_pca = pca.transform(train_X_scaled)
test_X_scaled_pca = pca.transform(test_X_scaled)
"""

----------------------------------------- Random Forest Classification ----------------------------------------------

In [None]:
# Randomised search for the best hyperparameters of the RFC model.
# The forest is seeded so that the search, and the model it refits, are
# reproducible from run to run.
rfc = RandomForestClassifier(random_state=0)

# Candidate values for each hyperparameter.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
bootstrap = [True, False]

param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 100 random parameter combinations, each evaluated with 3-fold cross-validation.
# Candidates are ranked by one-vs-rest ROC AUC -- the same metric that is reported
# on the test sample -- instead of the default accuracy, which is misleading for
# the unbalanced quality classes.
rs = RandomizedSearchCV(rfc, 
                        param_dist, 
                        n_iter = 100, 
                        scoring = 'roc_auc_ovr',
                        cv = 3, 
                        verbose = 1, 
                        n_jobs=-1, 
                        random_state=0)

rs.fit(train_X_scaled, train_y)

# Best parameter combination found (last expression, rendered as the cell output).
rs.best_params_

In [None]:
# Take the best model found by the search. The search was created with the
# default refit=True, so best_estimator_ has already been refitted on the whole
# training sample; fitting it a second time would only repeat that work.
rfc = rs.best_estimator_

# Class predictions and per-class probabilities for the test sample.
rf_predictions = rfc.predict(test_X_scaled)
rf_probs = rfc.predict_proba(test_X_scaled)

# Calculating the 'roc_auc_score' metric (one-vs-rest) from the predicted probabilities.
test_roc_auc = roc_auc_score(test_y, rf_probs, multi_class = 'ovr')
print("The value of the metric 'roc_auc_score' in the test sample: {:.6f}".format(test_roc_auc))