https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV



from sklearn.metrics import mean_squared_error, r2_score


from sklearn.externals import joblib

In [3]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
data.shape

(1599, 12)

In [5]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [6]:
y = data.quality
X = data.drop('quality', axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [9]:
X_train_scaled = preprocessing.scale(X_train)
X_train_scaled

array([[ 0.51358886,  2.19680282, -0.164433  , ...,  1.08415147,
        -0.69866131, -0.58608178],
       [-1.73698885, -0.31792985, -0.82867679, ...,  1.46964764,
         1.2491516 ,  2.97009781],
       [-0.35201795,  0.46443143, -0.47100705, ..., -0.13658641,
        -0.35492962, -0.20843439],
       ...,
       [-0.98679628,  1.10708533, -0.93086814, ...,  0.24890976,
        -0.98510439,  0.35803669],
       [-0.69826067,  0.46443143, -1.28853787, ...,  1.08415147,
        -0.35492962, -0.68049363],
       [ 3.1104093 , -0.62528606,  2.08377675, ..., -1.61432173,
         0.79084268, -0.39725809]])

In [12]:
X_train_scaled.mean(axis=0)

array([ 1.16664562e-16, -3.05550043e-17, -8.47206937e-17, -2.22218213e-17,
        2.22218213e-17, -6.38877362e-17, -4.16659149e-18, -2.54439854e-15,
       -8.70817622e-16, -4.08325966e-16, -1.17220107e-15])

In [11]:
X_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [15]:
X_train_scaled = scaler.transform(X_train)
X_train_scaled.mean(axis=0)

array([ 1.16664562e-16, -3.05550043e-17, -8.47206937e-17, -2.22218213e-17,
        2.22218213e-17, -6.38877362e-17, -4.16659149e-18, -2.54439854e-15,
       -8.70817622e-16, -4.08325966e-16, -1.17220107e-15])

In [16]:
X_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [17]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled.mean(axis=0)

array([ 0.02776704,  0.02592492, -0.03078587, -0.03137977, -0.00471876,
       -0.04413827, -0.02414174, -0.00293273, -0.00467444, -0.10894663,
        0.01043391])

In [18]:
X_test_scaled.std(axis=0)

array([1.02160495, 1.00135689, 0.97456598, 0.91099054, 0.86716698,
       0.94193125, 1.03673213, 1.03145119, 0.95734849, 0.83829505,
       1.0286218 ])

In [19]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [20]:
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
              oob_score=False, random_state=None, verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=N

In [21]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [22]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'log2'}

In [24]:
clf.refit

True

In [26]:
y_pred = clf.predict(X_test)
y_pred

array([6.4 , 5.67, 4.98, 5.51, 6.18, 5.68, 4.87, 4.79, 5.02, 6.04, 5.36,
       5.71, 5.87, 5.14, 5.77, 5.73, 6.59, 5.72, 5.77, 6.96, 5.49, 5.62,
       5.04, 5.94, 5.87, 5.06, 5.41, 5.21, 5.87, 5.95, 5.9 , 6.57, 6.  ,
       5.1 , 4.92, 5.91, 5.09, 6.12, 4.98, 6.  , 4.87, 5.87, 6.66, 5.18,
       6.21, 5.39, 5.67, 5.62, 5.18, 6.49, 5.95, 5.28, 5.78, 5.2 , 5.65,
       5.71, 5.34, 5.36, 4.96, 5.31, 5.32, 5.05, 4.97, 5.79, 6.16, 5.14,
       6.41, 5.05, 5.1 , 6.73, 5.77, 5.69, 5.11, 5.05, 5.39, 5.98, 5.27,
       5.15, 5.26, 5.24, 6.28, 5.63, 6.18, 6.39, 5.1 , 6.01, 6.53, 6.38,
       5.81, 5.83, 5.83, 5.49, 6.49, 5.62, 5.72, 5.78, 6.72, 6.75, 5.62,
       6.73, 5.06, 5.47, 5.14, 6.54, 5.07, 4.91, 5.63, 4.93, 5.78, 5.89,
       5.78, 5.59, 5.97, 5.49, 5.1 , 5.17, 5.89, 5.06, 4.94, 6.05, 5.86,
       5.08, 5.78, 6.12, 5.24, 5.23, 5.27, 5.8 , 5.51, 5.4 , 5.83, 6.1 ,
       5.19, 5.33, 5.09, 6.33, 5.  , 5.15, 6.82, 5.53, 5.09, 5.18, 5.67,
       6.04, 5.4 , 5.34, 5.16, 6.57, 5.76, 5.17, 5.

In [27]:
r2_score(y_test, y_pred)

0.47110067195350813

In [28]:
mean_squared_error(y_test, y_pred)

0.3412846875

In [30]:
joblib.dump(clf, 'rf_regressor.pkl')

clf2 = joblib.load('rf_regressor.pkl')

clf2.predict(X_test)

797     7
871     5
1333    5
1463    6
1058    7
1130    6
1037    5
735     5
1560    5
1224    6
445     6
85      5
813     4
545     5
787     6
148     6
938     7
774     6
305     6
1002    7
920     5
1441    6
71      5
1487    5
449     6
590     5
77      6
457     5
571     6
1021    6
       ..
1484    4
392     5
992     6
1215    6
1129    6
406     6
1182    6
921     6
827     5
367     5
1415    5
1198    6
150     6
1068    7
821     7
552     6
197     6
536     5
894     6
1380    6
889     5
218     5
359     6
1024    7
1066    7
211     6
162     6
748     6
914     6
557     5
Name: quality, Length: 320, dtype: int64