Sub-Topics

1. Concept of dealing with heterogeneous data
2. Combining multiple pipelines with column transformer
3. Implement heterogeneous data
4. Case Study: Implement heterogeneous pipeline on complex data
Session Details

Heterogeneous Pipeline

A pipeline helps us continuously update a machine learning model: whenever new data comes in, 
the pipeline re-runs its steps and the machine learning model on that data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load the Iris dataset. The returned object is dict-like (see the echoed output)
# and is read below through `iris.data` (features) and `iris.target` (labels).
iris = load_iris()
# NOTE(review): echoing the whole object prints every row; showing a slice such
# as `iris.data[:5]` would keep the output short.
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
# Hold out 20% of the Iris samples for evaluation; the fixed random_state keeps
# the split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.20,
    random_state=1,
)

In [4]:
# Two-step pipeline: rescale the features with MinMaxScaler, then fit a
# logistic-regression classifier on the rescaled values.
my_pipeline = Pipeline(
    steps=[
        ('scale', MinMaxScaler()),
        ('clf', LogisticRegression()),
    ]
)

In [5]:
# Fit every step of the pipeline (scaler, then classifier) on the training split only.
my_pipeline.fit(x_train, y_train)

Pipeline(steps=[('scale', MinMaxScaler()), ('clf', LogisticRegression())])

In [6]:
# Evaluate the fitted pipeline on the held-out test split and echo the result.
score = my_pipeline.score(x_test, y_test)
score

0.9

In [7]:
# Report the test score rounded to two decimal places.
# An f-string replaces the legacy `%` operator; the printed text is unchanged.
print(f'Logistic Regression: {score:.2f}')

Logistic Regression: 0.90


In [8]:
# Switch scikit-learn's estimator display to the 'diagram' mode, then echo the
# pipeline so its steps are drawn.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from sklearn import set_config
set_config(display = 'diagram')  # render estimators as a diagram
my_pipeline

In [9]:
# Load the tips dataset from a local CSV file and echo it.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere unless the file sits at exactly this location.
# NOTE(review): the echoed frame contains an 'Unnamed: 0' column, which looks
# like a saved row index; `index_col=0` would avoid loading it as a regular
# column - confirm nothing later relies on it.
df = pd.read_csv(r"D:\Data Science\MACHINE LEARNING\tips.csv")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [10]:
# Column Transformer: apply a different preprocessing step to each group of columns

In [15]:
# Separate the regression target from the predictors: `tip` is the value to
# predict, and every other column stays in the feature frame.
y = df['tip']
x = df.drop(columns=['tip'])

In [17]:
# Hold out 20% of the tips rows for evaluation, using the same fixed seed as before.
# NOTE(review): this rebinds x_train/x_test/y_train/y_test, which held the Iris
# split until this point.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [19]:
# Preprocess each group of columns with its own transformer:
#   * 'SS'  standardises the numeric `total_bill` column;
#   * 'oho' one-hot encodes the categorical columns.
# handle_unknown='ignore' makes the encoder output all zeros for a category it
# did not see while fitting, instead of raising at transform/predict time. This
# matters when a rare category lands only in the test split or in new data.
# NOTE(review): columns not listed here (e.g. `size`) are discarded, because
# ColumnTransformer defaults to remainder='drop' - confirm that is intended.
col_transformer = ColumnTransformer(
    transformers=[
        ('SS', StandardScaler(), ['total_bill']),
        ('oho', OneHotEncoder(handle_unknown='ignore'), ['sex', 'smoker', 'day', 'time']),
    ]
)
col_transformer

In [20]:
# Fit the column transformer on the training split and transform that split in
# the same call; the result is echoed to inspect the scaled and encoded features.
x_train_transformed = col_transformer.fit_transform(x_train)
x_train_transformed 

array([[-0.28611937,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.02695905,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.3716196 ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.23206267,  1.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.06543688,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.29287646,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [21]:
from sklearn.linear_model import LinearRegression

# Chain the column transformer with a linear regressor so that preprocessing and
# model fitting happen in a single `fit` call on the raw training frame.
lr = LinearRegression()
pipe = Pipeline(
    steps=[
        ('Preprocessing', col_transformer),
        ('lr', lr),
    ]
)
pipe.fit(x_train, y_train)