In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import r2_score,mean_absolute_error
warnings.filterwarnings('ignore')

In [4]:
boston = load_boston()
df = pd.DataFrame(data= boston.data, columns= boston.feature_names)
df['Target'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
df.shape

(506, 14)

## Using Pipeline

In [6]:
X = df.drop('Target',1)
y = df['Target']

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=101)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((354, 13), (152, 13), (354,), (152,))

In [15]:
pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('model',RandomForestRegressor())
])

In [16]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', RandomForestRegressor())])

In [17]:
y_pred = pipe.predict(X_test)

In [18]:
r2_score(y_test,y_pred)

0.8641334117633147

In [19]:
pipe.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('model', RandomForestRegressor())],
 'verbose': False,
 'scaler': StandardScaler(),
 'model': RandomForestRegressor(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': None,
 'model__verbose': 0,
 'model__warm_start': False}

In [21]:
grid_cv = GridSearchCV(
                        estimator= pipe,
                        cv=5,
                        n_jobs= -1,
                        return_train_score= True,
                        param_grid = {'model__n_jobs' : [-1],'model__max_depth':[5,10,15,20]}
                    )

In [22]:
grid_cv.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', RandomForestRegressor())]),
             n_jobs=-1,
             param_grid={'model__max_depth': [5, 10, 15, 20],
                         'model__n_jobs': [-1]},
             return_train_score=True)

In [24]:
pd.DataFrame(grid_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_jobs,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.188683,0.040177,0.034832,0.004218,5,-1,"{'model__max_depth': 5, 'model__n_jobs': -1}",0.838703,0.877706,0.846076,...,0.835673,0.070218,4,0.948794,0.944391,0.944688,0.948327,0.953262,0.947892,0.003236
1,0.191385,0.032577,0.038536,0.008448,10,-1,"{'model__max_depth': 10, 'model__n_jobs': -1}",0.857995,0.900841,0.868131,...,0.844198,0.078882,1,0.97768,0.974096,0.975316,0.97719,0.980439,0.976944,0.002172
2,0.177369,0.045715,0.033231,0.00878,15,-1,"{'model__max_depth': 15, 'model__n_jobs': -1}",0.856129,0.88927,0.857159,...,0.837036,0.084278,3,0.979797,0.976484,0.978197,0.978628,0.984055,0.979432,0.002544
3,0.171306,0.034625,0.018617,0.004547,20,-1,"{'model__max_depth': 20, 'model__n_jobs': -1}",0.861863,0.896199,0.850616,...,0.83936,0.082558,2,0.979673,0.975796,0.977494,0.981653,0.983726,0.979668,0.002833


In [25]:
grid_cv.score(X_test,y_test)

0.8580689416820548

# Column Transformer

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline

In [21]:
cities = ['Delhi','Bangalore','Pune','Hyderabad']
food_quality = ['Best','Good','Average','Bad']
np.random.choice(cities,1)

array(['Delhi'], dtype='<U9')

In [19]:
df = pd.DataFrame({'city': np.random.choice(cities,100),
                  'food_quality' : np.random.choice(food_quality,100)})
df.head()

Unnamed: 0,city,food_quality
0,Pune,Good
1,Pune,Best
2,Hyderabad,Bad
3,Hyderabad,Bad
4,Delhi,Average


In [34]:
ohe = OneHotEncoder(sparse = False,drop='first')
ohe.fit_transform(df[['city']])[:10]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [31]:
oe = OrdinalEncoder(dtype=int)
oe.fit_transform(df[['food_quality']])[:10]

array([[3],
       [2],
       [1],
       [1],
       [0],
       [2],
       [1],
       [0],
       [2],
       [0]])

In [44]:
ct = ColumnTransformer(transformers=[
            ('tnsf1_ohe',OneHotEncoder(sparse=False),['city']),
            ('tnsf2_oe',OrdinalEncoder(dtype=int),['food_quality'])
],remainder='passthrough') #passthrough will keep remaining columns in the dataframe

In [45]:
ct.fit_transform(df)[:10]

array([[0., 0., 0., 1., 3.],
       [0., 0., 0., 1., 2.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 2.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 2.],
       [0., 0., 1., 0., 0.]])