### Working with pipelines means creating a flow of all the programs are will be done and are done in the project.
#### We can update (delete or remove the changes done in the pipeline (section) easily).

### sklearn Pipelines

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [25]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [26]:
df.shape

(8128, 5)

In [27]:
# Intensionally adding missing values
np.random.seed(42)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan


In [28]:
df.isnull().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
                                                      df.drop(columns=['selling_price']),
                                                      df['selling_price'],
                                                      test_size=0.2,
                                                      random_state=42
                                                    )

In [30]:
""" Steps
1. Missing imputations in km_driven and owner
2. (Ohe in fuel, brand) + (Oe in owner)
3. Scaling
4. Feature Selection
5. Model Building
6. Prediction
"""

' Steps\n1. Missing imputations in km_driven and owner\n2. (Ohe in fuel, brand) + (Oe in owner)\n3. Scaling\n4. Feature Selection\n5. Model Building\n6. Prediction\n'

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [32]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_km_driven',SimpleImputer(),[1]),
    ('impute_owner',SimpleImputer(strategy='most_frequent'),[3]) # default give the mean, but we want it to have most frequent so use strategy
],remainder='passthrough')

In [33]:
# encoding categorical variables
trf2 = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,2])
    ],
    remainder='passthrough'
)

In [34]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,38)) # slice used to get all the column
])

In [35]:
a = [1,2,3,4,5]
x = slice(0,5)
a[x]

[1, 2, 3, 4, 5]

In [36]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [37]:
# train the model
trf5 = RandomForestRegressor()

In [38]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer',trf1),
    ('encoder',trf2),
    ('scaler',trf3),
    ('fselector',trf4),
    ('model',trf5)
])


In [39]:
pipe.fit(X_train, y_train)

In [40]:
pipe.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [41]:
pipe.named_steps

{'imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_km_driven', SimpleImputer(), [1]),
                                 ('impute_owner',
                                  SimpleImputer(strategy='most_frequent'),
                                  [3])]),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  [3]),
                                 ('onehot',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 2])]),
 'scaler': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 38, None))]),
 'fselector': SelectKBest(score_func=<function chi2 at 0x0000022D99B1E8E0>),
 'model

In [42]:
pipe.named_steps['scaler'].transformers_[0][1].data_max_

array([3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [43]:
pipe.predict(X_test)[10:40]

array([631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222, 631107.85033222, 631107.85033222,
       631107.85033222, 631107.85033222])

In [44]:
# Predict
pipe.predict(np.array(['Maruti',100000.0,'Diesel','First Owner']).reshape(1,4))



array([631107.85033222])

#### Cross Validation

In [45]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

np.float64(-639113244101.0538)

#### HyperParameter Tuning

In [47]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [48]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

In [49]:
grid.best_score_

np.float64(-639088313514.5734)

### Export Pipeline

In [50]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))