## A Sklearn pipeline
A scikit-learn pipeline for a small dataset: it imputes missing values, standardizes the numerical columns, one-hot encodes the categorical columns, and fits a decision tree classifier.

In [None]:
# Coding with sklearn pipeline
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline,make_pipeline
# Toy concert dataset: one categorical feature, two numeric features and a
# 0/1 target column. np.nan marks the missing entries.
d2 = {
    'Genre': [
        'Rock', 'Metal', 'Bluegrass', 'Rock', np.nan,
        'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock',
    ],
    'Social_media_followers': [
        1_000_000, np.nan, 2_000_000, 1_310_000, 1_700_000,
        np.nan, 4_100_000, 1_600_000, 2_200_000, 1_000_000,
    ],
    'likes': [
        6_000_000, np.nan, 5_000_000, 1_610_000, 1_800_000,
        np.nan, 4_800_000, 1_650_000, 2_680_000, 5_000_000,
    ],
    'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
}
df = pd.DataFrame(d2)
df.head()  # last expression of the cell: previews the first rows

In [2]:
# Split the frame into features and target by column name rather than by
# position, so adding or reordering columns cannot silently shift the target.
target_col = 'Sold_out'
X = df.drop(columns=target_col)
y = df[target_col]

# Derive the column groups from X (not df) so the target can never end up in a
# transformer's column list. 'number' matches every numeric dtype, so integer
# or float32 features are not silently left out of the numeric branch.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
]
num_pipe = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
cat_pipe = Pipeline(categorical_steps)

# Route each column group through its own preprocessing branch. Columns that
# belong to neither group are dropped; n_jobs=-1 asks for all processors.
branches = [
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
]
col_trans = ColumnTransformer(branches, remainder='drop', n_jobs=-1)

# Fix the seed so the train/test split, the fitted tree and the printed score
# are reproducible from run to run (both were unseeded before).
RANDOM_STATE = 42

# Full model: preprocessing followed by a decision tree classifier.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
pipe = make_pipeline(col_trans, dt)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
pipe.fit(X_train, y_train)
# Evaluate on the held-out rows only.
print(f'Score:{pipe.score(X_test, y_test)}')

Score:0.6666666666666666


In [3]:
import joblib
# Persist the fitted pipeline to disk, then read it back from the same file.
model_file = 'pipe.joblib'
joblib.dump(pipe, model_file)
loaded_pipe = joblib.load(model_file)