In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold

In [10]:
data=pd.read_csv("winequality-red.csv",sep=";")

In [11]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [14]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [15]:
data["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [16]:
X=data.iloc[:,:-1].values
Y=data.iloc[:,-1].values

In [17]:
kf=KFold(n_splits=5,random_state=11)

In [18]:
for train_index,test_index in kf.split(X,Y):
    x_train,x_test=X[train_index],X[test_index]
    y_train,y_test=Y[train_index],Y[test_index]

In [19]:
x_train.shape

(1280, 11)

In [21]:
x_test.shape

(319, 11)

In [23]:
scale=StandardScaler().fit(x_train)

In [24]:
x_train_std=scale.transform(x_train)

In [25]:
x_test_std=scale.transform(x_test)

In [27]:
x_train_std[:1]

array([[-0.66611373,  1.01737316, -1.47326725, -0.50439708, -0.2681482 ,
        -0.43119642, -0.38102276,  0.43821658,  1.35639805, -0.5961707 ,
        -0.91314732]])

In [28]:
x_test_std[:1]

array([[-8.35767955e-01, -3.31513482e-01, -4.54751408e-01,
        -5.04397083e-01, -2.47261592e-01,  1.21664941e+00,
         2.14579769e-01, -7.24240454e-01,  4.53677519e-01,
        -1.36299667e-01, -3.56558893e-04]])

In [32]:
random=RandomForestRegressor(oob_score=True,n_jobs=-1,random_state=21)

In [30]:
param={}

In [31]:
param["n_estimators"]=[100,200,300,400,500]
param["max_depth"]=[1,2,3,4,5]
param["min_samples_split"]=list(range(10,101,10))
param["min_samples_leaf"]=[1,2,3,4,5,6,7,8,9,10]
param["max_features"]=["auto","sqrt"]

In [35]:
clf1=GridSearchCV(estimator=random,param_grid=param,n_jobs=-1,iid=False,cv=3)

In [37]:
fit1=clf1.fit(x_train_std,y_train)

In [38]:
fit1.best_params_

{'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 300}

In [39]:
fit1.best_score_

0.33042723434662796

In [40]:
pipe=Pipeline([("scale",StandardScaler()),("gbm",GradientBoostingRegressor())])

In [41]:
param={}

In [42]:
param["gbm__learning_rate"]=[0.01,0.05]
param["gbm__n_estimators"]=[100,200,300,400,500]
param["gbm__max_depth"]=[3,4,5]
param["gbm__min_samples_leaf"]=[10,20,30,40,50]
param["gbm__min_samples_split"]=list(range(100,1001,100))


In [43]:
param

{'gbm__learning_rate': [0.01, 0.05],
 'gbm__n_estimators': [100, 200, 300, 400, 500],
 'gbm__max_depth': [3, 4, 5],
 'gbm__min_samples_leaf': [10, 20, 30, 40, 50],
 'gbm__min_samples_split': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}

In [44]:
clf2=GridSearchCV(estimator=pipe,param_grid=param,n_jobs=-1,iid=False,cv=3)

In [45]:
fit2=clf2.fit(x_train,y_train)

In [46]:
fit2.best_params_

{'gbm__learning_rate': 0.01,
 'gbm__max_depth': 5,
 'gbm__min_samples_leaf': 40,
 'gbm__min_samples_split': 100,
 'gbm__n_estimators': 300}

In [48]:
fit2.best_score_

0.33864785336113723

In [49]:
fit1.score(x_test_std,y_test)

0.2793838663377589

In [57]:
fit2.score(x_test_std,y_test)

0.052161223194728934

In [None]:
#NOTICE THE DIFFERNCE BETWEEN Pipeline and make_pipeline

In [51]:
pipe3=make_pipeline(StandardScaler(),SVR())

In [52]:
pipe3.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('svr',
   SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'svr': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
   kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'svr__C': 1.0,
 'svr__cache_size': 200,
 'svr__coef0': 0.0,
 'svr__degree': 3,
 'svr__epsilon': 0.1,
 'svr__gamma': 'auto',
 'svr__kernel': 'rbf',
 'svr__max_iter': -1,
 'svr__shrinking': True,
 'svr__tol': 0.001,
 'svr__verbose': False}

In [53]:
param2={}

In [54]:
param2['svr__C']=[0.001,0.005,0.01,0.05,0.1,0.5,1]
param2["svr__gamma"]=[0.001,0.005,0.01,0.05,0.1,0.5,1]
param2["svr__degree"]=[1,2,3,4]
param2['svr__kernel']=["linear","rbf","poly"]

In [55]:
clf3=GridSearchCV(estimator=pipe3,param_grid=param2,n_jobs=-1,iid=False,cv=3)

In [56]:
fit3=clf3.fit(x_train,y_train)

In [58]:
fit3.best_params_

{'svr__C': 0.5, 'svr__degree': 1, 'svr__gamma': 0.01, 'svr__kernel': 'rbf'}

In [59]:
fit3.best_score_

0.3303643177244831

In [60]:
fit3.score(x_test_std,y_test)

-0.19319467087028674

In [61]:
fit3.score(x_test,y_test)

0.310379248620244

In [73]:
from sklearn.ensemble import ExtraTreesRegressor

In [80]:
pipe4=make_pipeline(ExtraTreesRegressor(oob_score=True,n_jobs=1,bootstrap=True))

In [81]:
pipe4.get_params()

{'memory': None,
 'steps': [('extratreesregressor',
   ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=True, random_state=None, verbose=0, warm_start=False))],
 'extratreesregressor': ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=True, random_state=None, verbose=0, warm_start=False),
 'extratreesregressor__bootstrap': True,
 'extratreesregressor__criterion': 'mse',
 'extratreesregressor__max_depth': None,
 'extratreesregressor__

In [82]:
param3={}

In [83]:
param3['extratreesregressor__max_depth']=[1,2,3,4,5]
param3['extratreesregressor__min_samples_leaf']=[10,20,30,40,50]
param3['extratreesregressor__min_samples_split']=list(range(100,500,50))
param3['extratreesregressor__n_estimators']=[100,200,300,400,500,600,700,800]



In [84]:
clf4=GridSearchCV(estimator=pipe4,param_grid=param3,n_jobs=-1,iid=False,cv=3)

In [85]:
fit4=clf4.fit(x_train,y_train)

In [86]:
fit4.best_params_

{'extratreesregressor__max_depth': 5,
 'extratreesregressor__min_samples_leaf': 10,
 'extratreesregressor__min_samples_split': 100,
 'extratreesregressor__n_estimators': 500}

In [87]:
fit4.best_score_

0.2843521186699811

In [98]:
from mlens.ensemble import SuperLearner

In [90]:
from sklearn.metrics import mean_squared_error

In [91]:
# --- Build ---
# Passing a scoring function will create cv scores during fitting
# the scorer should be a simple function accepting to vectors and returning a scalar

In [120]:
ensemble=SuperLearner(scorer=mean_squared_error,random_state=11)

In [121]:
ensemble.add([RandomForestRegressor(),ExtraTreesRegressor()])

SuperLearner(array_check=2, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3775, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...0A8598>)],
   n_jobs=-1, name='group-12', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=11, sample_size=20,
       scorer=<function mean_squared_error at 0x0000016D2F0A8598>,
       shuffle=False, verbose=False)

In [122]:
ensemble.add([ExtraTreesRegressor(),GradientBoostingRegressor()])

SuperLearner(array_check=2, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3775, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...0A8598>)],
   n_jobs=-1, name='group-13', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=11, sample_size=20,
       scorer=<function mean_squared_error at 0x0000016D2F0A8598>,
       shuffle=False, verbose=False)

In [123]:
ensemble.add([GradientBoostingRegressor(),SVR()])

SuperLearner(array_check=2, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3775, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...0A8598>)],
   n_jobs=-1, name='group-14', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=11, sample_size=20,
       scorer=<function mean_squared_error at 0x0000016D2F0A8598>,
       shuffle=False, verbose=False)

In [124]:
ensemble.add_meta(SVR())

SuperLearner(array_check=2, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3775, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...0A8598>)],
   n_jobs=-1, name='group-15', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=11, sample_size=20,
       scorer=<function mean_squared_error at 0x0000016D2F0A8598>,
       shuffle=False, verbose=False)

In [125]:
fit=ensemble.fit(x_train,y_train)

In [126]:
pred=fit.predict(x_test)

In [127]:
mean_squared_error(y_test,pred)

0.44516182451956676

In [128]:
#### remember you are not allowed to pass any parameters to the individual base learners