## Select Columns and Add New Columns in an ML Pipeline with Code Example

- https://medium.com/mlearning-ai/select-columns-and-add-new-columns-in-an-ml-pipeline-with-code-example-bd90ccba1891

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023. 7.10</div>
<div style="text-align: right"> Last update: 2023. 7. 10</div>

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
# Suppress every warning category for the rest of the session.
import warnings; warnings.filterwarnings('ignore')
#plt.style.use('ggplot')
# NOTE(review): the 'seaborn-whitegrid' style name was deprecated in
# matplotlib 3.6 in favour of 'seaborn-v0_8-whitegrid' — confirm against the
# installed matplotlib version before re-running this cell.
plt.style.use('seaborn-whitegrid')
%matplotlib inline

ColumnTransformer 및 FeatureUnion을 사용해서 피처를 선택하고, 새 피처를 추가 및 결합하는 방법을 소개함

In [2]:
# Small example frame used by the cells below: two text columns
# ('sex', 'unit') and two float columns ('age', 'distance').
data = dict(
    sex=['Female', 'Female', 'Male', 'Male', 'Female'],
    age=[15.0, 39.0, 46.0, 33.0, 35.0],
    distance=[5.0, 5.0, 5.0, 6.0, 4.0],
    unit=['km'] * 5,
)

df = pd.DataFrame(data)
df

Unnamed: 0,sex,age,distance,unit
0,Female,15.0,5.0,km
1,Female,39.0,5.0,km
2,Male,46.0,5.0,km
3,Male,33.0,6.0,km
4,Female,35.0,4.0,km


In [3]:
# Independent copy of df; the transformers below are exercised on this copy.
X_train =df.copy()

### 1. Select custom columns

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    '''Transformer that keeps only a caller-chosen subset of columns.

    Parameters
    ----------
    subset
        Column label(s) handed to ``X.loc[:, subset]`` in ``transform``.
    '''

    def __init__(self, subset):
        self.subset = subset

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return the columns of ``X`` named by ``self.subset``.'''
        wanted = self.subset
        return X.loc[:, wanted]

In [6]:
# Columns to keep when demonstrating ColumnSelector.
customized_cols = [
    'distance',
    'unit',
]

In [7]:
# Build the selector for the chosen columns, then fit and apply it in one call.
selected_cols = ColumnSelector(subset=customized_cols)
selected_cols.fit_transform(X_train)

Unnamed: 0,distance,unit
0,5.0,km
1,5.0,km
2,5.0,km
3,6.0,km
4,4.0,km


### 2. Select all numeric and all categorical columns

In [8]:
class NumColSelector(BaseEstimator, TransformerMixin):
    '''Keep only the numeric columns of a DataFrame.'''

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return the columns of ``X`` selected by ``include='number'``.'''
        numeric_only = X.select_dtypes(include='number')
        return numeric_only

class CatColSelector(BaseEstimator, TransformerMixin):
    '''Keep only the categorical columns of a DataFrame.

    A column counts as categorical when its dtype is ``object`` (the usual
    dtype of string columns) or pandas' dedicated ``category`` dtype.
    '''

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return the ``object``- and ``category``-typed columns of ``X``.'''
        # 'category' is included so that columns already converted with
        # astype('category') are not silently dropped.
        return X.select_dtypes(include=['object', 'category'])

In [9]:
# Apply the numeric selector directly (no fit needed; fit() is a no-op).
NumColSelector().transform(X_train)

Unnamed: 0,age,distance
0,15.0,5.0
1,39.0,5.0
2,46.0,5.0
3,33.0,6.0
4,35.0,4.0


In [10]:
# Apply the categorical selector directly (no fit needed; fit() is a no-op).
CatColSelector().transform(X_train)

Unnamed: 0,sex,unit
0,Female,km
1,Female,km
2,Male,km
3,Male,km
4,Female,km


### 3. Add a new column

In [11]:
class AgeMedianByDistGroup(BaseEstimator, TransformerMixin):
    '''Add the median age of each row's distance group as a new column.

    Parameters
    ----------
    train
        DataFrame with ``'distance'`` and ``'age'`` columns. The median of
        ``'age'`` within each distinct ``'distance'`` value is taken from it.

    Attributes
    ----------
    age_median_by_dist_group
        Series indexed by distance value holding the median age per group.
        It is available right after construction, so ``transform`` can be
        called without a prior ``fit``.
    '''

    def __init__(self, train):
        # Keep the constructor argument under its own name: BaseEstimator's
        # get_params()/clone()/repr read it back with getattr(self, 'train').
        self.train = train
        self.age_median_by_dist_group = self._median_age_by_distance(train)

    @staticmethod
    def _median_age_by_distance(frame):
        '''Return the median ``'age'`` per ``'distance'`` value of ``frame``.'''
        medians = frame.groupby('distance')['age'].median()
        medians.name = 'age_median_by_dist_group'
        return medians

    def fit(self, X=None, y=None):
        '''Recompute the group medians from ``self.train`` and return ``self``.

        ``X`` and ``y`` are ignored; the statistics always come from the
        ``train`` frame, so a ``set_params(train=...)`` takes effect here.
        '''
        self.age_median_by_dist_group = self._median_age_by_distance(self.train)
        return self

    def transform(self, X, y=None):
        '''Return a copy of ``X`` with an ``'age_median_by_dist_group'`` column.

        Rows whose ``'distance'`` value has no group median get NaN.
        '''
        # Work on a copy so the caller's DataFrame is never modified.
        result = X.copy()
        result['age_median_by_dist_group'] = result['distance'].map(
            self.age_median_by_dist_group
        )
        return result

In [12]:
# Instantiate the transformer; its per-distance median ages come from X_train.
test_pipe = AgeMedianByDistGroup(train=X_train)

In [13]:
# Build a transformer from X_train and apply it to the same frame,
# calling transform() directly without fit().
AgeMedianByDistGroup(X_train).transform(X_train)

Unnamed: 0,sex,age,distance,unit,age_median_by_dist_group
0,Female,15.0,5.0,km,39.0
1,Female,39.0,5.0,km,39.0
2,Male,46.0,5.0,km,39.0
3,Male,33.0,6.0,km,33.0
4,Female,35.0,4.0,km,35.0


### 4. Set up the final pipeline

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Numeric preprocessing chain: select numeric columns, fill NaNs with the
# column median, then min-max scale.
pipe = Pipeline(
    steps=[
        # ('add_new_col', AgeMedianByDistGroup(X_train)),  # step left disabled
        ('get_num_cols', NumColSelector()),
        ('fix_nan', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scale_data', MinMaxScaler()),
    ]
)

In [15]:
# Bare expression: shows the pipeline as the notebook cell's output.
pipe