<a href="https://colab.research.google.com/github/sharonccs/PortfolioProjects/blob/main/Predict_Wine_Quality_with_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#Step 1: Import libraries and modules.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [15]:
#Step 2: Load red wine data.
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
data = pd.read_csv(dataset_url, sep=';')
print (data.head())

   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.0              0.27         0.36  ...       0.45      8.8        6
1            6.3              0.30         0.34  ...       0.49      9.5        6
2            8.1              0.28         0.40  ...       0.44     10.1        6
3            7.2              0.23         0.32  ...       0.40      9.9        6
4            7.2              0.23         0.32  ...       0.40      9.9        6

[5 rows x 12 columns]


In [18]:
print (data.shape)

(4898, 12)


In [20]:
print (data.describe())

       fixed acidity  volatile acidity  ...      alcohol      quality
count    4898.000000       4898.000000  ...  4898.000000  4898.000000
mean        6.854788          0.278241  ...    10.514267     5.877909
std         0.843868          0.100795  ...     1.230621     0.885639
min         3.800000          0.080000  ...     8.000000     3.000000
25%         6.300000          0.210000  ...     9.500000     5.000000
50%         6.800000          0.260000  ...    10.400000     6.000000
75%         7.300000          0.320000  ...    11.400000     6.000000
max        14.200000          1.100000  ...    14.200000     9.000000

[8 rows x 12 columns]


In [23]:
#Step 3: Split data into training and test sets.
y = data.quality
X = data.drop('quality', axis=1)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)
#Step 4: Declare data preprocessing steps.
X_train_scaled = preprocessing.scale(X_train)
print (X_train_scaled)

[[-0.06595853  4.41384815 -1.09716225 ... -0.45516282 -1.8298144
   0.47240462]
 [-0.42101991 -0.47438852  0.21253797 ...  0.47161958 -0.69685184
  -1.4010065 ]
 [-0.18431232 -0.37662379  0.21253797 ...  1.00120952  0.2618088
   0.30949931]
 ...
 [-1.13114269 -0.18109432 -0.36045587 ...  0.33922209 -0.69685184
   0.7167626 ]
 [-1.24949648 -0.08332959  0.53996302 ... -0.38896408 -0.34824797
  -0.17921664]
 [-0.06595853 -0.57215326  0.13068171 ...  0.33922209 -0.60970087
  -1.23810118]]


In [36]:
print (X_train_scaled.mean(axis=0))

[-5.04162533e-16 -2.08556444e-16  1.03371455e-16 -7.79819746e-17
 -7.09091908e-16 -9.06767146e-17 -2.53894801e-17 -2.23391154e-14
  8.89538570e-16 -4.82400122e-16  7.76192677e-16]


In [37]:
print (X_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [39]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [46]:
X_train_scaled = scaler.transform(X_train)
print (X_train_scaled.mean(axis=0))
print (X_train_scaled.std(axis=0))

[-5.04162533e-16 -2.08556444e-16  1.03371455e-16 -7.79819746e-17
 -7.09091908e-16 -9.06767146e-17 -2.53894801e-17 -2.23391154e-14
  8.89538570e-16 -4.82400122e-16  7.76192677e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [47]:
X_test_scaled = scaler.transform(X_test)
print (X_test_scaled.mean(axis=0))
print (X_test_scaled.std(axis=0))

[-0.00557394 -0.01379692  0.00639383 -0.02115795  0.01000181  0.05080134
  0.04727659 -0.00491557 -0.01622463 -0.00489094 -0.02344258]
[0.99321244 0.92419246 0.95166836 0.95971113 1.03454363 0.97035169
 0.96328528 0.9562574  0.99740768 0.97227308 1.01108118]


In [49]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [52]:
print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor())], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'squared_error', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 100, 'randomforestregressor__n_jobs': None, 'randomforestregressor__oob_score': False, 'randomforestregressor__random_stat

In [54]:
#Step 5: Declare hyperparameters to tune.
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [61]:
#Step 6: Tune model using a cross-validation pipeline.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [59]:
#Step 7: Refit on the entire training set.
print (clf.refit)

True


In [62]:
Step 8: Evaluate model pipeline on test data.
y_pred = clf.predict(X_test)

In [65]:
print (r2_score(y_test, y_pred))
print (mean_squared_error(y_test, y_pred))

0.5854613778650255
0.3254154081632653


In [67]:
Step 9: Save model for future use.
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [69]:
clf2 = joblib.load('rf_regressor.pkl')
clf2.predict(X_test)

array([6.01, 5.63, 6.02, 5.77, 6.02, 5.41, 5.42, 5.88, 5.88, 6.72, 6.59,
       6.18, 6.66, 6.47, 5.07, 5.99, 5.23, 6.73, 5.29, 5.06, 5.2 , 5.82,
       7.02, 6.79, 5.69, 5.35, 5.93, 6.13, 5.99, 6.14, 5.86, 5.95, 6.77,
       5.93, 6.31, 5.53, 5.9 , 5.28, 6.  , 6.02, 5.04, 5.35, 6.82, 6.07,
       6.36, 5.43, 4.9 , 6.  , 6.25, 6.37, 6.86, 6.05, 5.95, 5.19, 5.48,
       5.88, 4.98, 5.6 , 6.87, 5.22, 5.45, 4.73, 6.59, 5.55, 6.28, 6.27,
       5.89, 5.73, 6.05, 5.97, 5.2 , 6.28, 5.85, 6.1 , 5.29, 5.2 , 5.91,
       5.  , 5.05, 4.48, 5.5 , 5.15, 5.2 , 4.99, 6.  , 5.77, 6.37, 5.67,
       6.15, 6.84, 5.26, 5.81, 6.85, 5.89, 8.  , 7.45, 5.66, 6.6 , 5.14,
       5.82, 5.11, 5.12, 5.08, 5.44, 5.8 , 5.05, 5.74, 6.14, 5.97, 6.63,
       6.75, 6.25, 5.32, 6.78, 7.  , 5.65, 5.15, 6.21, 5.28, 5.6 , 6.12,
       5.44, 5.07, 5.21, 5.84, 5.66, 5.48, 5.3 , 5.19, 5.37, 6.45, 6.32,
       5.66, 5.26, 6.66, 6.95, 6.57, 5.79, 6.71, 5.34, 5.83, 5.96, 5.7 ,
       6.79, 6.04, 5.94, 6.15, 5.13, 6.14, 5.87, 5.