In [None]:
# we are going to create a custom pipeline
# this custom pipeline will include both Sklearn methods and custom methods
# steps: 
# 1. We will apply sklearn's StandardScaler to the numerical data
# 2. We will apply the pandas map method to the categorical data (a non-sklearn step)
# 3. Create a custom pipeline method that will wrap everything to a single method
# advantage: the same process doesn't need to be repeated for train and test data

In [1]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline

In [2]:
# Load seaborn's "tips" example dataset and preview its first five rows.
data = sns.load_dataset("tips")
data.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
def poly(degree = 2, **kwargs):
    """Build a polynomial-regression pipeline.

    Parameters
    ----------
    degree : int, default 2
        Passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    Pipeline
        ``PolynomialFeatures(degree)`` followed by ``LinearRegression(**kwargs)``.
    """
    feature_expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, regressor)

In [4]:
def standard_scalar(data:pd.DataFrame)->pd.DataFrame:
    """Standardize every column of ``data`` with sklearn's ``StandardScaler``.

    A fresh ``StandardScaler`` is fitted on ``data`` at every call, so the
    output is scaled with the statistics of the frame that is passed in.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pd.DataFrame
        New frame with the same column labels and the same index as ``data``.
        An empty input (no rows or no columns) is returned as an empty copy.
    """
    # StandardScaler raises on input without features or samples; there is
    # nothing to scale in that case, so return an empty copy instead.
    if data.empty:
        return data.copy()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(data)
    # fit_transform returns a bare ndarray: restore the column labels AND the
    # original index. Without the index, a later pd.concat(axis=1) aligns on a
    # fresh 0..n-1 range and misaligns rows for any non-default input index.
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)

def map_cat_to_num(data:pd.DataFrame)->pd.DataFrame:
    """Integer-encode the columns of ``data`` that are not 'float' or 'int'.

    Within each selected column, every distinct value is replaced by its
    position in ``Series.unique()`` for that column (0 for the first distinct
    value encountered, 1 for the next, and so on). The codes are rebuilt from
    the frame passed in on every call.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; 'float' and 'int' columns are ignored.

    Returns
    -------
    pd.DataFrame
        New frame holding only the encoded columns, in their original order.
    """
    non_numeric = data.select_dtypes(exclude = ['float', 'int'])
    encoded = non_numeric.copy()
    for name in non_numeric.columns:
        column = non_numeric[name]
        # value -> position of its first appearance among the distinct values
        codes = {value: position for position, value in enumerate(column.unique())}
        # Drop-then-append moves the column to the end; after visiting every
        # column in order the original ordering is restored.
        encoded = encoded.drop(name, axis = 1)
        encoded[name] = column.map(codes)
    return encoded

def partition(data:pd.DataFrame)->pd.DataFrame:
    """Scale the numeric columns and integer-encode the remaining ones.

    The frame is split by dtype: 'float'/'int' columns go through
    ``standard_scalar``, every other column goes through ``map_cat_to_num``.
    The two results are joined side by side, numeric columns first.

    Parameters
    ----------
    data : pd.DataFrame
        Mixed-type input frame.

    Returns
    -------
    pd.DataFrame
        Transformed frame with one row per input row, indexed like ``data``.
    """
    # NOTE(review): ['float', 'int'] may not match every numeric dtype
    # (e.g. float32); such columns would take the categorical path - confirm.
    num_data = data.select_dtypes(include = ['float', 'int'])
    cat_data = data.select_dtypes(exclude = ['float', 'int'])
    num_data_transformed = standard_scalar(num_data)
    cat_data_transformed = map_cat_to_num(cat_data)
    # pd.concat(axis=1) aligns on index labels. Pin both parts to the input
    # index so a filtered or shuffled frame (non 0..n-1 index) cannot yield
    # misaligned rows padded with NaN.
    num_data_transformed.index = data.index
    cat_data_transformed.index = data.index
    new_data = pd.concat([num_data_transformed, cat_data_transformed], axis = 1)
    return new_data

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class pipeStep(BaseEstimator, TransformerMixin):
    """Adapter that exposes a plain function as a scikit-learn transformer step.

    Parameters
    ----------
    step_func : callable
        Called as ``step_func(X)`` by :meth:`transform`; its return value is
        handed to the next pipeline step unchanged.
    """
    def __init__(self, step_func):
        self.step_func = step_func

    def fit(self, X=None, y=None, *args, **fit_params):
        """Do nothing and return ``self``; the wrapped function is stateless.

        The signature follows the scikit-learn ``fit(X, y=None)`` convention,
        so keyword calls such as ``fit(X, y=y)`` work. Extra positional or
        keyword arguments are accepted and ignored, as they were before.
        """
        return self

    def transform(self, X):
        """Return ``step_func(X)``."""
        return self.step_func(X)

In [6]:
# Two-stage pipeline: the custom preprocessing step, then a linear regression.
# NOTE(review): pipeStep.transform simply calls partition(X), and partition
# fits a new StandardScaler and rebuilds the category codes on whatever frame
# it receives. Data passed to predict() is therefore scaled/encoded with its
# own statistics, not those of the training data - confirm this is intended.
clean_data = Pipeline(steps=[
    ('preprocessing', pipeStep(partition)),
    ('LR', LinearRegression()),
])

In [7]:
# Target is the 'tip' column; every other column is a feature.
y = data['tip']
X = data.drop(columns='tip')
clean_data.fit(X, y)