# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib



# Load wine data

In [3]:
dataset_url='http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data=pd.read_csv(dataset_url,sep=';')

In [4]:
print(data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [5]:
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

# Splitting dataset into train and test set

In [6]:
y=data.quality
x=data.drop('quality',axis=1)

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=123,stratify=y)

# Datapreprocessing by standardising

In [8]:
scaler=preprocessing.StandardScaler().fit(x_train)

Transforming the train set

In [9]:
x_train_scaled=scaler.transform(x_train)

In [10]:
print(x_train_scaled.mean(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [11]:
print(x_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Tranforming the test set

In [12]:
x_test_scaled=scaler.transform(x_test)

In [13]:
print(x_test_scaled.mean(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]


In [14]:
print(x_test_scaled.std(axis=0))

[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


Pipeline with preprocessing

In [15]:
pipeline=make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=100))

# Tuning hyperparameters

In [16]:
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurit

In [20]:
hyperparameters={'randomforestregressor__max_features':['auto','sqrt','log2'],'randomforestregressor__max_depth':[None,5,3,1]}

# Setting up a model using Cross-validation pipeline

In [22]:
clf=GridSearchCV(pipeline,hyperparameters,cv=10)
clf.fit(x_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [24]:
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [25]:
print(clf.refit)

True


# Evaluating the model pipeline on Test data

In [26]:
y_pred=clf.predict(x_test)

In [27]:
print(y_pred)

[6.49 5.8  4.91 5.61 6.56 5.72 4.78 4.66 5.01 6.18 5.23 5.7  5.85 5.12
 5.81 5.66 6.76 5.71 5.76 7.   5.56 5.56 5.01 6.06 5.91 5.03 5.62 5.15
 5.91 5.99 5.91 6.57 6.01 5.05 4.9  5.9  5.1  5.83 5.12 5.82 4.84 5.89
 6.79 5.11 6.24 5.39 5.58 5.64 5.03 6.62 5.96 5.24 5.96 5.14 5.66 5.91
 5.21 5.35 5.04 5.29 5.35 5.02 5.05 5.79 5.94 5.3  6.42 5.02 5.26 6.63
 5.87 5.45 5.11 4.97 5.26 6.01 5.27 5.09 5.4  5.27 6.63 5.54 6.26 6.65
 5.16 5.93 6.57 6.18 5.48 5.76 5.88 5.27 6.47 5.67 5.72 5.81 6.69 6.85
 5.64 6.78 5.05 5.42 5.12 6.54 5.06 4.42 5.73 5.07 5.68 5.95 5.67 5.62
 6.18 5.43 4.94 5.18 5.93 5.   4.86 6.05 5.8  5.09 5.88 6.   5.27 5.35
 5.25 5.93 5.49 5.51 5.76 6.34 5.17 5.26 5.06 6.6  5.02 5.13 6.81 5.44
 5.08 5.13 5.65 6.07 5.41 5.49 5.17 6.57 5.55 5.1  5.59 5.13 4.79 4.99
 5.25 5.94 5.35 5.72 5.86 5.21 5.64 5.2  5.28 5.99 5.02 5.98 5.2  5.12
 5.5  5.13 5.77 4.97 5.58 5.09 5.58 5.52 5.01 5.52 5.67 5.06 6.09 5.5
 4.99 5.   5.28 6.24 5.19 5.66 5.23 4.8  5.53 6.65 5.86 5.92 5.33 5.14
 5.31 5

In [28]:
print(r2_score(y_test,y_pred))

0.4576756462255587


In [29]:
print(mean_squared_error(y_test,y_pred))

0.34994749999999997


# For future use

In [30]:
joblib.dump(clf,'rfr.pkl')

['rfr.pkl']