# Creating ML Pipeline

**Author:** Manaranjan Pradhan</br>
**Email ID:** manaranjan@gmail.com</br>
**LinkedIn:** https://www.linkedin.com/in/manaranjanpradhan/

### Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
cars_df = pd.read_csv( "final_cars_maruti.csv" )

In [3]:
cars_df.sample(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,Age,Model,Mileage,Power,KM_Driven
925,Hyderabad,Diesel,Manual,First,5,5.5,6,swift,25.2,74.0,108
957,Ahmedabad,Diesel,Manual,First,5,6.9,5,ciaz,28.09,88.5,52
427,Pune,Diesel,Manual,First,5,6.0,2,swift,28.4,74.0,30
900,Kochi,Petrol,Manual,First,5,7.09,4,ciaz,20.73,91.2,25
290,Jaipur,Diesel,Manual,First,5,3.3,10,swift,19.3,73.9,71


In [4]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1010 non-null   object 
 1   Fuel_Type     1010 non-null   object 
 2   Transmission  1010 non-null   object 
 3   Owner_Type    1010 non-null   object 
 4   Seats         1010 non-null   int64  
 5   Price         1010 non-null   float64
 6   Age           1010 non-null   int64  
 7   Model         1010 non-null   object 
 8   Mileage       1010 non-null   float64
 9   Power         1010 non-null   float64
 10  KM_Driven     1010 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 86.9+ KB


### Feature Set Selection

In [5]:
x_features = ['Fuel_Type', 
              'Transmission', 
              'Owner_Type', 
              'Age', 
              'Model', 
              'KM_Driven']

In [6]:
x_features

['Fuel_Type', 'Transmission', 'Owner_Type', 'Age', 'Model', 'KM_Driven']

In [7]:
cat_vars = ['Fuel_Type',
            'Transmission',
            'Owner_Type',
            'Model']

In [8]:
num_vars = list(set(x_features) - set(cat_vars))

In [9]:
num_vars

['Age', 'KM_Driven']

### Need for Data Transformation

1. Categorical columns
    - OHE Encoding
2. Numerical Columns
    - No Transformation Required

### Setting X and y variables

In [10]:
X = cars_df[x_features]
y = cars_df['Price']

### Data Splitting

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [13]:
X_train.shape

(808, 6)

In [14]:
X_test.shape

(202, 6)

## Creating Pipelines

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
ohe_encoder = OneHotEncoder(handle_unknown = 'ignore')

In [19]:
from sklearn.compose import ColumnTransformer

In [20]:
from sklearn.pipeline import Pipeline

In [22]:
cat_transformer = Pipeline(steps=[('oheencoder', ohe_encoder)])

In [23]:
preprocessor = ColumnTransformer(
        transformers = [('numerical', "passthrough", num_vars),
                        ('categorical', cat_transformer, cat_vars)])

### Encode Categorical Variables

### Linear Regression

In [24]:
from sklearn.linear_model import LinearRegression

In [26]:
lreg = LinearRegression()

In [27]:
lreg_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                                  ('regression', lreg)])

In [29]:
lreg_pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['Age', 'KM_Driven']),
                                                 ('categorical',
                                                  Pipeline(steps=[('oheencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Fuel_Type', 'Transmission',
                                                   'Owner_Type', 'Model'])])),
                ('regression', LinearRegression())])

In [30]:
lreg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['Age', 'KM_Driven']),
                                                 ('categorical',
                                                  Pipeline(steps=[('oheencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Fuel_Type', 'Transmission',
                                                   'Owner_Type', 'Model'])])),
                ('regression', LinearRegression())])

In [31]:
X_train[0:10]

Unnamed: 0,Fuel_Type,Transmission,Owner_Type,Age,Model,KM_Driven
435,Diesel,Manual,First,8,swift,78
308,Petrol,Manual,First,7,wagon,61
750,Diesel,Manual,First,8,ertiga,38
314,Diesel,Manual,First,6,ritz,32
293,Petrol,Manual,First,10,alto,73
16,Petrol,Manual,First,3,eeco,38
21,Petrol,Manual,First,6,alto,88
958,Petrol,Manual,First,4,eeco,126
468,Diesel,Manual,First,8,swift,83
676,Petrol,Automatic,First,5,ciaz,50


### Predict on Test Set

In [33]:
y_pred = lreg_pipeline.predict(X_test)

In [36]:
from sklearn.metrics import mean_squared_error, r2_score

In [37]:
r2_score(y_test, y_pred)

0.8746562737207995

## Predicting on New Data

In [39]:
data = {'Fuel_Type': 'Diesel',
        'Transmission': 'Manual',
        'Owner_Type': 'First',
        'Age': 8,
        'Model': 'ertiga',
        'KM_Driven': 87}

In [41]:
data_df = pd.DataFrame(data, index = [0])

In [43]:
data_df

Unnamed: 0,Fuel_Type,Transmission,Owner_Type,Age,Model,KM_Driven
0,Diesel,Manual,First,8,ertiga,87


In [45]:
lreg_pipeline.predict(data_df)

array([6.1714833])

## Save the Pipeline

In [46]:
from joblib import dump

In [49]:
dump(lreg_pipeline, 'carsmodel.pkl')

['carsmodel.pkl']

In [50]:
ls -al

 Volume in drive C is OS
 Volume Serial Number is 7EFF-AF26

 Directory of C:\Users\Sohan\OneDrive - Indian Institute of Management\ML Manaranjan Pradhan\Session_8



File Not Found
