<a href="https://colab.research.google.com/github/rs2pydev/pythonic_topics/blob/main/Unannotated_Tutorial__NominalEncoding_ColumnTransformer_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center><b>Unannotated tutorial on using the Scikit-Learn library for data preprocessing: Encoding categorical features and building pipelines</b></center></h1>

This tutorial is based on the following videos from the [Data School](https://www.youtube.com/dataschool) YouTube channel:

* [How do I encode categorical features using scikit-learn?](https://www.youtube.com/watch?v=irHhDMbw3xo)

* [Encode categorical features using `OneHotEncoder` or `OrdinalEncoder`](https://www.youtube.com/watch?v=0w78CHM_ubM&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=6)

* [Seven ways to select columns using ColumnTransformer](https://www.youtube.com/watch?v=sCt4LVD5hPc&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=3&t=155s)

In [79]:
import pandas as pd
import sklearn as skl

In [80]:
# Record the library versions so the outputs below can be reproduced.
print(f'pandas version: {pd.__version__:s}')
print(f'sklearn version: {skl.__version__:s}')

pandas version: 1.3.5
sklearn version: 1.0.2


In [81]:
from sklearn.base import clone
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [82]:
# Titanic - Train data set
# Downloaded through a short link on every run; the link target is presumably
# the Kaggle Titanic training CSV (verify if the link ever goes stale).
df = pd.read_csv('http://bit.ly/kaggletrain')

In [83]:
# Sanity check on the loaded frame: (rows, columns).
print(f'Shape - Train data set: {df.shape}\n')

Shape - Train data set: (891, 12)



In [84]:
def draw_line(char: str = '*', width: int = 80) -> None:
    """Print a horizontal rule used to frame section titles in cell output.

    Parameters
    ----------
    char : str, default '*'
        Text that is repeated to build the rule.
    width : int, default 80
        Number of repetitions of ``char``.

    Returns
    -------
    None
        The rule is written to standard output; nothing is returned.
    """
    print(char * width)

In [85]:
# Column names as a plain Python list.
cols = list(df.columns)

draw_line()
print('Columns:')
draw_line()
# Show every name on a single comma-separated line.
print(", ".join(cols))

********************************************************************************
Columns:
********************************************************************************
PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked


In [86]:
draw_line()
print('Null values:')
draw_line()
# Per-column count of missing entries, printed as text under the banner.
print(df.isna().sum())

********************************************************************************
Null values:
********************************************************************************
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [87]:
# Keep only the target (`Survived`) and the three features used in this tutorial.
select_cols = ['Survived', 'Pclass', 'Sex', 'Embarked']

# Remove the rows where `Embarked` is missing, then narrow to the selected columns.
df = df.dropna(subset=['Embarked'])[select_cols]

In [88]:
# Confirm the effect of the row filter and the column selection on the frame's shape.
print(f'Shape - New dataframe: {df.shape}')

Shape - New dataframe: (889, 4)


In [89]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [90]:
# Baseline feature matrix: `Pclass` only. The double brackets keep X a
# two-dimensional DataFrame rather than a Series.
X = df[['Pclass']]
# Target vector as a one-dimensional Series.
y = df.loc[:, 'Survived']

In [91]:
# Feature matrix must be two-dimensional: (rows, features).
X.shape

(889, 1)

In [92]:
# Target must be one-dimensional, with the same number of rows as X.
y.shape

(889,)

In [93]:
# Classifier used for the baseline below and as the estimator of the pipelines
# built later in the notebook; the solver is spelled out for clarity.
logReg = LogisticRegression(solver='lbfgs')

In [94]:
# Baseline score: mean accuracy over 5 cross-validation folds, using the
# current feature matrix X. Folds are evaluated in parallel (n_jobs=-1).
avg_accur = cross_val_score(
    logReg, X, y, cv=5, scoring='accuracy', n_jobs=-1
).mean()
print(f'Average accuracy of logistic regression model: {avg_accur: 0.5f}')

Average accuracy of logistic regression model:  0.67834


In [95]:
# Class proportions of the target. The null accuracy (the score obtained by
# always predicting the most frequent class) is the largest of these values.
null_accur = y.value_counts(normalize=True)
print(f'Null accuracy:\n{null_accur}')

Null accuracy:
0    0.617548
1    0.382452
Name: Survived, dtype: float64


In [96]:
# Approach 1: a single encoder shared by both nominal columns.
# handle_unknown='ignore' lets the encoder cope with categories it did not see
# during fit; columns not listed here are passed through unchanged.
cat_var_preprocess_1 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked']),
    remainder='passthrough',
)

In [97]:
# Approach 2: one encoder per column, each dropping a redundant dummy column.
#   - `Sex`:      drop='if_binary' keeps a single column for a two-category feature.
#   - `Embarked`: drop='first' omits the column of the first category.
# Columns not listed here are passed through unchanged.
cat_var_preprocess_2 = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), ['Sex']),
    (OneHotEncoder(drop='first'), ['Embarked']),
    remainder='passthrough',
)

In [98]:
# Same encoding as approach 1, but the columns are chosen with a regular
# expression handed to `make_column_selector` instead of an explicit list.
#
# The pattern is searched for anywhere inside each column name, so it is
# anchored to match exactly `Sex` and `Embarked`. A loose pattern such as 'S|E'
# would also pick up names like `Survived` or `SibSp` if they were ever part of
# the feature matrix. The group is non-capturing so that pandas does not warn
# about match groups when the selector filters the column names.

cat_var_preprocess_3 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     make_column_selector(pattern='^(?:Sex|Embarked)$')),
    remainder='passthrough',
)

In [99]:
################################################################
# Recreate the feature matrix with all the necessary columns
# (this rebinds `X`, which previously held `Pclass` only)
################################################################

X = df.loc[:, ['Pclass', 'Sex', 'Embarked']]
X.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [100]:
# Give every pipeline its own unfitted copy of the classifier. `make_pipeline`
# stores the estimator object it is given, so passing the same `logReg`
# instance three times would make the pipelines share one classifier: fitting
# one pipeline would then silently change the final step of the other two.
pipe1 = make_pipeline(cat_var_preprocess_1, clone(logReg))
pipe2 = make_pipeline(cat_var_preprocess_2, clone(logReg))
pipe3 = make_pipeline(cat_var_preprocess_3, clone(logReg))

In [101]:
# Score the three preprocessing variants with the same 5-fold accuracy protocol.
# All scores are computed first, then reported in order.
avg_accur1, avg_accur2, avg_accur3 = (
    cross_val_score(candidate, X, y, cv=5, scoring='accuracy').mean()
    for candidate in (pipe1, pipe2, pipe3)
)

print(f'Average accuracy using `pipe1`: {avg_accur1: 0.5f}\n')
print(f'Average accuracy using `pipe2`: {avg_accur2: 0.5f}\n')
print(f'Average accuracy using `pipe3`: {avg_accur3: 0.5f}')

Average accuracy using `pipe1`:  0.77279

Average accuracy using `pipe2`:  0.77279

Average accuracy using `pipe3`:  0.77279


In [102]:
################################################
# Quick generation of some "new" data by
# randomly sampling a few rows from the train
# data; the target column is removed so only
# the features remain
################################################

# `columns=` already identifies what to drop, so no `axis` argument is needed.
X_new = df.sample(n=10, random_state=101).drop(columns=['Survived'])
X_new.head()

Unnamed: 0,Pclass,Sex,Embarked
511,3,male,S
613,3,male,Q
615,2,female,S
337,1,female,C
718,3,male,Q


In [103]:
# Fit the whole pipeline (preprocessing step followed by the classifier)
# on the training features and target.
pipe1.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Sex', 'Embarked'])])),
                ('logisticregression', LogisticRegression())])

In [104]:
# Make predictions on new data with pipeline
# (`X_new` goes through the pipeline's preprocessing step before the classifier)

pipe1.predict(X=X_new)

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1])