Pipeline

In [25]:
import pandas as pd

# Toy dataset: five people described by name, age, gender and job.
# One age is None so that there is something to impute later.
data = dict(
  Name=["Anna", "Bob", "Charlie", "Diana", "Eric"],
  Age=[20, 38, 19, None, 23],
  Gender=["f", "m", "m", "f", "m"],
  Job=["Programmer", "Writer", "Cook", "Programmer", "Teacher"],
)

# One column per dictionary key, in insertion order.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,38.0,m,Writer
2,Charlie,19.0,m,Cook
3,Diana,,f,Programmer
4,Eric,23.0,m,Teacher


Preprocessing Pipeline
- Drop Name feature
- Impute Ages
- Turn Gender into Binary/Numeric
- One Hot Encode Job

In [26]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Manual preprocessing of `df`, one step at a time:
# drop Name -> impute Age -> encode Gender as 0/1 -> one-hot encode Job.

# Drop the name column (`columns=` already selects the axis, so no `axis=1`)
df = df.drop(columns=["Name"])

# Impute Ages: missing values are replaced by the mean of the present ones
imputer = SimpleImputer(strategy="mean")
df["Age"] = imputer.fit_transform(df[["Age"]])

# Label Encode Gender (raises KeyError for any value other than "f"/"m")
gender_dict = {"f": 0, "m": 1}
df["Gender"] = [gender_dict[g] for g in df["Gender"]]

# One-Hot-Encode Job
# `sparse_output=False` is the scikit-learn >= 1.2 spelling; the old
# `sparse=False` keyword was removed in 1.4 and raises TypeError there.
encoder = OneHotEncoder(sparse_output=False)
encoded_matrix = encoder.fit_transform(df[["Job"]])

# Create a DataFrame from the encoded matrix. Reusing df's index keeps the
# rows aligned in the concat below even if df does not have a 0..n-1 index.
encoded_df = pd.DataFrame(
  encoded_matrix,
  columns=encoder.get_feature_names_out(),
  index=df.index,
)

# Replace the original Job column by its one-hot columns
df = pd.concat([df.drop(columns=["Job"]), encoded_df], axis=1)
df

Unnamed: 0,Age,Gender,Job_Cook,Job_Programmer,Job_Teacher,Job_Writer
0,20.0,0,0.0,1.0,0.0,0.0
1,38.0,1,0.0,0.0,0.0,1.0
2,19.0,1,1.0,0.0,0.0,0.0
3,25.0,0,0.0,1.0,0.0,0.0
4,23.0,1,0.0,0.0,1.0,0.0


Defining Estimators for our pipeline

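A pipeline is a sequence of estimators. An estimator is any object that learns from data via `fit`. Every step except the last must also be a transformer, i.e. an estimator that additionally implements `transform` (and therefore `fit_transform`).
The final step may be any estimator; a classifier, for example, is an estimator that implements `fit` and `predict`.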

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
  """Stateless transformer that removes the "Name" column from a DataFrame."""

  def fit(self, X, y=None):
    """Nothing to learn; returns self so the step can be chained."""
    return self

  def transform(self, X, y=None):
    """Return a new DataFrame without the "Name" column.

    Raises KeyError if X has no "Name" column.
    """
    without_name = X.drop(labels=["Name"], axis="columns")
    return without_name


class AgeImputer(BaseEstimator, TransformerMixin):
  """Fill missing values in the "Age" column using a SimpleImputer.

  Parameters
  ----------
  strategy : str, default="mean"
    Passed through to `SimpleImputer(strategy=...)`.
  """

  def __init__(self, strategy="mean"):
    self.strategy = strategy

  def fit(self, X, y=None):
    """Learn the fill value from X["Age"] and return self."""
    self.imputer = SimpleImputer(strategy=self.strategy)
    self.imputer.fit(X[["Age"]])
    return self

  def transform(self, X, y=None):
    """Return a copy of X whose "Age" gaps are filled with the fitted value.

    Uses `transform`, not `fit_transform`: the statistic learned in `fit`
    must be reused so that new data is imputed with the training value
    instead of being refit on itself.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    X = X.copy()
    X["Age"] = self.imputer.transform(X[["Age"]])
    return X

class GenderEncoder(BaseEstimator, TransformerMixin):
  """Replace the labels in one column by integer codes.

  Parameters
  ----------
  column : str
    Name of the column to encode.
  mapping : dict or None, default=None
    Label -> code lookup. When None, {"f": 0, "m": 1} is used.
  """

  def __init__(self, column, mapping=None):
    self.column = column
    self.mapping = mapping

  def fit(self, X, y=None):
    """Nothing to learn; returns self so the step can be chained."""
    return self

  def transform(self, X, y=None):
    """Return a copy of X with `column` replaced by its integer codes.

    Raises KeyError if the column contains a label missing from the mapping.
    """
    gender_dict = {"f": 0, "m": 1} if self.mapping is None else self.mapping
    # Work on a copy so the caller's DataFrame is left untouched.
    X = X.copy()
    X[self.column] = [gender_dict[g] for g in X[self.column]]
    return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
  """Replace one column by the output columns of a wrapped encoder.

  Parameters
  ----------
  column : str
    Name of the column to encode.
  encoder : object
    Encoder providing `fit`, `transform` and `get_feature_names_out`
    (e.g. a OneHotEncoder). It is fitted in place by `fit`.
  """

  def __init__(self, column, encoder):
    self.column = column
    self.encoder = encoder

  def fit(self, X, y=None):
    """Fit the wrapped encoder on X[[column]] and return self."""
    self.encoder.fit(X[[self.column]])
    return self

  def transform(self, X, y=None):
    """Return a new DataFrame where `column` is replaced by its encoding."""
    encoded_matrix = self.encoder.transform(X[[self.column]])
    # Encoders left at their sparse default return a sparse matrix; densify
    # it so it can be turned into regular DataFrame columns.
    if hasattr(encoded_matrix, "toarray"):
      encoded_matrix = encoded_matrix.toarray()
    # Reuse X's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows (and add NaNs) for any X whose index
    # is not the default range.
    encoded_df = pd.DataFrame(
      encoded_matrix,
      columns=self.encoder.get_feature_names_out(),
      index=X.index,
    )
    return pd.concat([X.drop(columns=[self.column]), encoded_df], axis=1)

In [49]:
# Second toy dataset, this time with three of the five ages missing.
data2 = {
  "Name": ["Fiona", "Gerald", "Hans", "Isabella", "Jacob"],
  "Age": [20, None, None, None, 23],
  "Gender": ["f", "m", "m", "f", "m"],
  "Job": ["Writer", "Programmer", "Programmer", "Programmer", "Teacher"]
}

df2 = pd.DataFrame(data2)

# Instantiate the individual transformers (not yet a Pipeline object)
dropper = NameDropper()
imputer = AgeImputer()
gender_encoder = GenderEncoder("Gender")
# `sparse_output=False` is the scikit-learn >= 1.2 spelling; the old
# `sparse=False` keyword was removed in 1.4 and raises TypeError there.
encoder = FeatureEncoder("Job", OneHotEncoder(sparse_output=False))

# Chain them by hand, innermost call first: drop -> impute -> gender -> job
encoder.fit_transform(gender_encoder.fit_transform(imputer.fit_transform(dropper.fit_transform(df2))))


Unnamed: 0,Age,Gender,Job_Programmer,Job_Teacher,Job_Writer
0,20.0,0,0.0,0.0,1.0
1,21.5,1,1.0,0.0,0.0
2,21.5,1,1.0,0.0,0.0
3,21.5,0,1.0,0.0,0.0
4,23.0,1,0.0,1.0,0.0


In [50]:
# Use the pipeline
from sklearn.pipeline import Pipeline

# Same four steps as the hand-chained version, run in list order.
pipeline = Pipeline([
  ("dropper", NameDropper()),
  ("imputer", AgeImputer()),
  ("gender_encoder", GenderEncoder("Gender")),
  # `sparse_output=False` is the scikit-learn >= 1.2 spelling; the old
  # `sparse=False` keyword was removed in 1.4 and raises TypeError there.
  ("encoder", FeatureEncoder("Job", OneHotEncoder(sparse_output=False))),
])

# Fit every step on df2 and return the fully transformed frame
pipeline.fit_transform(df2)

Unnamed: 0,Age,Gender,Job_Programmer,Job_Teacher,Job_Writer
0,20.0,0,0.0,0.0,1.0
1,21.5,1,1.0,0.0,0.0
2,21.5,1,1.0,0.0,0.0
3,21.5,0,1.0,0.0,0.0
4,23.0,1,0.0,1.0,0.0
