In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
#Ignore Warning 
import warnings as wrn
wrn.filterwarnings('ignore')



In [2]:
df = pd.read_csv('winequality-red.csv')

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
df.shape

(1599, 12)

In [5]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


# Split data into training and test sets

In [8]:
y = df.quality
X = df.drop('quality', axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

# data preprocessing 

In [10]:
X_train_scaled = preprocessing.scale(X_train)

In [11]:
X_train_scaled

array([[ 0.51358886,  2.19680282, -0.164433  , ...,  1.08415147,
        -0.69866131, -0.58608178],
       [-1.73698885, -0.31792985, -0.82867679, ...,  1.46964764,
         1.2491516 ,  2.97009781],
       [-0.35201795,  0.46443143, -0.47100705, ..., -0.13658641,
        -0.35492962, -0.20843439],
       ...,
       [-0.98679628,  1.10708533, -0.93086814, ...,  0.24890976,
        -0.98510439,  0.35803669],
       [-0.69826067,  0.46443143, -1.28853787, ...,  1.08415147,
        -0.35492962, -0.68049363],
       [ 3.1104093 , -0.62528606,  2.08377675, ..., -1.61432173,
         0.79084268, -0.39725809]])

You can confirm that the scaled dataset is indeed centered at zero, with unit variance

In [12]:
X_train_scaled.mean(axis=0)

array([ 1.16664562e-16, -3.05550043e-17, -8.47206937e-17, -2.22218213e-17,
        1.94440936e-17, -6.38877362e-17, -4.16659149e-18, -1.20753377e-13,
       -8.70817622e-16, -4.08325966e-16, -1.17220107e-15])

In [13]:
X_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [14]:
scaler = preprocessing.StandardScaler().fit(X_train)

Now, the scaler object has the saved means and standard deviations for each feature in the training set.

In [18]:
X_train_scaled = scaler.transform(X_train)

print (X_train_scaled.mean(axis=0))
print (X_train_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  1.94440936e-17 -6.38877362e-17 -4.16659149e-18 -1.20753377e-13
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Note how we're taking the scaler object and using it to transform the training set. Later, we can transform the test set using the exact same means and standard deviations used to transform the training set:

In [19]:
X_test_scaled = scaler.transform(X_test)

In [21]:
print (X_test_scaled.mean(axis=0))
print (X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


Notice how the scaled features in the test set are not perfectly centered at zero with unit variance! This is exactly what we'd expect, as we're transforming the test set using the means from the training set, not from the test set itself.

In practice, when we set up the cross-validation pipeline, we won't even need to manually fit the Transformer API. Instead, we'll simply declare the class object

In [22]:
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

# Declare hyperparameters to tune

In [23]:
print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurit

In [24]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                   'randomforestregressor__max_depth': [None, 5, 3, 1]}

As you can see, the format should be a Python dictionary (data structure for key-value pairs) where keys are the hyperparameter names and values are lists of settings to try. 

# Tune model using a cross-validation pipeline

In [25]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

Yes, it's really that easy. GridSearchCV essentially performs cross-validation across the entire "grid" (all possible permutations) of hyperparameters

It takes in your model (in this case, we're using a model pipeline), the hyperparameters you want to tune, and the number of folds to create.

Now, you can see the best set of parameters found using CV:

In [26]:
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


# Refit on the entire training set.

In [27]:
print (clf.refit)

True


# Evaluate model pipeline on test data

In [28]:
y_pred = clf.predict(X_test)

Now we can use the metrics we imported earlier to evaluate our model performance

In [29]:
print (r2_score(y_test, y_pred))

0.4713926993159392


In [30]:
print (mean_squared_error(y_test, y_pred))

0.34109625000000005


Great, so now the question is... is this performance good enough?

Well, the rule of thumb is that your very first model probably won't be the best possible model. However, we recommend a combination of three strategies to decide if you're satisfied with your model performance.

# Save model for future use

save model in .pkl file

In [31]:
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

And that's it. When you want to load the model again, simply use this function:

load model from .pkl file

In [32]:
clf2 = joblib.load('rf_regressor.pkl')

In [33]:
# Predict data set using loaded model
clf2.predict(X_test)

array([6.51, 5.62, 4.95, 5.49, 6.3 , 5.43, 4.91, 4.85, 5.01, 5.99, 5.3 ,
       5.6 , 5.9 , 5.09, 5.77, 5.73, 6.65, 5.72, 5.71, 6.97, 5.5 , 5.67,
       5.05, 5.9 , 5.95, 5.08, 5.45, 5.13, 5.9 , 6.  , 5.91, 6.62, 6.  ,
       5.02, 5.03, 6.01, 5.09, 6.09, 5.15, 6.07, 4.87, 5.75, 6.64, 5.07,
       6.13, 5.39, 5.58, 5.51, 5.15, 6.3 , 5.95, 5.29, 5.95, 5.13, 5.66,
       5.6 , 5.42, 5.36, 4.99, 5.31, 5.27, 5.18, 5.09, 5.82, 6.01, 5.34,
       6.43, 5.05, 5.07, 6.63, 5.64, 5.72, 5.11, 5.04, 5.24, 5.96, 5.41,
       5.09, 5.22, 5.18, 6.37, 5.58, 6.06, 6.53, 5.1 , 6.06, 6.51, 6.4 ,
       5.59, 5.83, 6.01, 5.43, 6.41, 5.7 , 5.63, 5.84, 6.75, 6.76, 5.51,
       6.82, 5.05, 5.43, 5.15, 6.46, 5.09, 4.76, 5.66, 4.97, 5.78, 5.92,
       5.81, 5.46, 6.01, 5.39, 5.1 , 5.23, 5.91, 5.13, 4.72, 5.94, 5.86,
       5.07, 5.79, 6.14, 5.18, 5.41, 5.25, 5.93, 5.5 , 5.34, 5.83, 6.23,
       5.15, 5.26, 5.11, 6.45, 5.02, 5.19, 6.65, 5.66, 5.14, 5.04, 5.56,
       6.12, 5.36, 5.44, 5.07, 6.62, 5.76, 5.22, 5.