## Advanced Pipelining


In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()
import sklearn

Let's start with ColumnTransformer on a toy example:

In [None]:
from sklearn.compose import ColumnTransformer

data1 = np.column_stack((np.random.uniform(-2,1,5), np.random.randint(0,10,5)))
print(data1)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

print(StandardScaler().fit_transform(data1))
print()
print(MinMaxScaler().fit_transform(data1))

In [None]:
# Let's perform Z-normalization on the zeroth column and MinMaxScaling on the first
c0 = StandardScaler().fit_transform(data1[:,0:1]) #data1[:,0:1] vs data1[:,0] ?
c1 = MinMaxScaler().fit_transform(data1[:,1:])

print(np.column_stack((c0,c1)))

In [None]:
ct = ColumnTransformer([ ('znorm', StandardScaler(), [0]), ('minmax', MinMaxScaler(), [1]) ])

data1ct =ct.fit_transform(data1)

print(data1ct)

In [None]:
ct

**What did we do?**  
We gave each transformer a name, specified which transformer to use, and listed the columns it should affect. Now it is your turn: let's do something similar below.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

data2 = [[0.5, 1.2, -2.3, -0.7, 3.7],
         ['a', 'b', None, 'b', 'c']]

y = [0, 1, 0, 0, 1]

# Put these into a pandas dataframe. Note that the rows of data2 should be the columns of the data!

df = pd.DataFrame(data2)
display(df)
df = df.T
display(df)

In [None]:
# Perform z-normalization on the first column
# Perform most frequent imputation to the second column followed by one-hot encoding. 
# Hint: ColumnTransformers can be used in a Pipeline and Pipelines can be used in ColumnTransformers 

#SimpleImputer + OneHotEncoder Pipeline
pp_pipe = Pipeline([('imp',SimpleImputer(strategy = "most_frequent")), 
                    ('ohe', OneHotEncoder(handle_unknown = "ignore"))])
display(pp_pipe.fit_transform(df.iloc[:,1:]).todense())

In [None]:
#ct = ColumnTransformer([ ('znorm', StandardScaler(), [0]), ('minmax', MinMaxScaler(),[1]) ])
ct = ColumnTransformer([ ('znorm', StandardScaler(), [0]), ('pipe', pp_pipe, [1]) ])

display(ct.fit_transform(df))

In [None]:
# Fit a decision tree classifier 
#dt = DecisionTreeClassifier().fit(ct.fit_transform(df))

#something.fit(df, y)
main_pipe = Pipeline([('pp_all',ct), ('dt_classifier', DecisionTreeClassifier())])
main_pipe.fit(df, y)
print(main_pipe.score(df,y))

In [None]:
main_pipe

In [None]:
df

In [None]:
np.arange(df.shape[1])

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

cv = StratifiedKFold(n_splits = 2 ,shuffle=True)

params_to_search = {'dt_classifier__max_depth':[2,3,4,None]}

gs = GridSearchCV(estimator = main_pipe, param_grid = params_to_search, cv = cv)
gs.fit(df,y)
gs.score(df,y)

Detour: Python Slice Feature  
`slice(start,stop,step)`

In [None]:
tmp = np.arange(10)
print(tmp)
print(tmp[slice(3)], tmp[:3])
print(tmp[slice(3,7)], tmp[3:7])
print(tmp[slice(1,7,2)], tmp[1:7:2])

What if we had more columns, some of which we do not want to touch, or so many that it is impractical to list them all?

In [None]:
np.set_printoptions(precision=3, suppress = True)

data3 = np.random.random((15,10))

pp1 = StandardScaler()
pp2 = MinMaxScaler((-1,0))

ct_more1 = ColumnTransformer([('znorm', pp1, [0,1]), ('minmax',pp2, slice(3,7))])

data3ct1 = ct_more1.fit_transform(data3)
print(data3ct1.shape)
print(data3ct1.mean(axis=0))
print(data3ct1.std(axis=0))
print(data3ct1.min(axis=0))
print(data3ct1.max(axis=0))

What to do with the remaining columns?
* Drop them (default behavior)
* Pass them as is
* Apply another transformer

In [None]:
ct_more2 = ColumnTransformer([('znorm', pp1, [0,1]), ('minmax',pp2, slice(3,7))], remainder='passthrough')

data3ct2 = ct_more2.fit_transform(data3)
print(data3ct2.shape)
print(data3ct2.mean(axis=0))
print(data3ct2.std(axis=0))
print(data3ct2.min(axis=0))
print(data3ct2.max(axis=0))

In [None]:
ct_more3 = ColumnTransformer([('znorm', pp1, [0,1]), ('minmax',pp2, slice(3,7))], remainder=MinMaxScaler((1,2)))

data3ct3 = ct_more3.fit_transform(data3)
print(data3ct3.shape)
print(data3ct3.mean(axis=0))
print(data3ct3.std(axis=0))
print(data3ct3.min(axis=0))
print(data3ct3.max(axis=0))

In [None]:
ct_more4 = ColumnTransformer([('znorm', pp1, [0,1]), ('dropped','drop', slice(3,7))], remainder=MinMaxScaler((1,2)))

data3ct4 = ct_more4.fit_transform(data3)
print(data3ct4.shape)

In [None]:
#Keeping the ordering
ct_more5 = ColumnTransformer([('znorm', pp1, [0,1]),  
                              ('keep','passthrough',[2]), 
                              ('minmax',pp2, slice(3,6))], 
                               remainder=MinMaxScaler((1,2)))

data3ct5 = ct_more5.fit_transform(data3)
print(data3ct5.shape)
print(data3ct5.mean(axis=0))
print(data3ct5.std(axis=0))
print(data3ct5.min(axis=0))
print(data3ct5.max(axis=0))

In [None]:
#Keeping the ordering
ct_more5 = ColumnTransformer([('znorm', pp1, [0,1]),  
                              ('keep','passthrough',[9]),
                              ('minmax',pp2, slice(3,6)),
                              ], 
                               remainder=MinMaxScaler((1,2)))
                              

data3ct5 = ct_more5.fit_transform(data3)
print(data3ct5.shape)
print(data3ct5.mean(axis=0))
print(data3ct5.std(axis=0))
print(data3ct5.min(axis=0))
print(data3ct5.max(axis=0))

If the input is a pandas DataFrame, we can use the column names (in the first level) as well!

In [None]:
df = pd.DataFrame(data2).T
df = df.rename({0:'numbers',1:'categories'},axis=1)
df

In [None]:
cat_pipe = Pipeline(steps = [('imp',SimpleImputer(strategy='most_frequent')),('enc',OneHotEncoder())])
ct_nodf = ColumnTransformer([('znorm',StandardScaler(),[0]),('cat', cat_pipe,[1])])
ct_nodf.fit_transform(df)

In [None]:
cat_pipe = Pipeline(steps = [('imp',SimpleImputer(strategy='most_frequent')),('enc',OneHotEncoder())])
ct_df = ColumnTransformer([('znorm',StandardScaler(),['numbers']),('cat', cat_pipe,['categories'])])
ct_df.fit_transform(df)

Let's do a more complicated example but still with a toy dataset.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# You need an internet connection
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# If the above doesn't work (loads it but with different column names)
#from catboost.datasets import titanic
#X, y = titanic()

display(X)

In [None]:
y

Features:
* sibsp: Number of Siblings/Spouses Aboard
* parch: Number of Parents/Children Aboard
* survival - Survival (0 = No; 1 = Yes)
* pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* ticket - Ticket Number
* fare - Passenger Fare
* cabin - Cabin
* embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
* boat - Lifeboat (if survived)
* body - Body number (if did not survive and body was recovered)
* home.dest - Home destination

In [None]:
X.info()

In [None]:
X.describe()

In [None]:
X.describe(include=object)

In [None]:
X.describe(include='category')

What should we do?
* Drop `name`, `ticket`, `boat`, `home.dest`, `cabin` and `body` (some of them e.g. cabin would be useful but we are making it easier) 
* `pclass`: Ordinal, may stay as is or may be treated as a categorical value
* `sex`: To binary
* `age` and `fare`: Standard scaler
* `sibsp` and `parch`: log1p (count values) and perhaps maxabsolute scaler (lots of 0s) 
* `embarked`: One-Hot 
* impute if necessary

These are the initial ideas, let's look at the data and see

In [None]:
X = X.drop(['name','ticket','boat','home.dest','cabin','body'],axis=1)

In [None]:
X.drop(['age','fare'],axis=1).boxplot()

In [None]:
plt.boxplot(np.log1p(X['pclass'].max()-X['pclass']))

In [None]:
plt.boxplot(np.log1p(X['sibsp']))

In [None]:
plt.boxplot(np.log1p(X['parch']))

In [None]:
plt.hist(X['age'])

In [None]:
plt.hist((np.log1p(X['age'])))

In [None]:
plt.hist(X['fare'])

In [None]:
plt.hist((np.log1p(X['fare'])))

In [None]:
import matplotlib.colors as mcolors

def countplot(pd_series, ax = None):
    """Draw a bar chart of how often each distinct value occurs in ``pd_series``.

    Parameters
    ----------
    pd_series : pandas.Series
        Values to count (counted with ``value_counts``, so most frequent first).
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, drawing goes through the pyplot
        state machine (i.e. onto the current axes).

    Returns
    -------
    pandas.Series
        The value counts that were plotted.
    """
    counts = pd_series.value_counts()
    positions = range(len(counts))

    # Draw on the supplied axes if there is one, otherwise through pyplot.
    target = ax if ax else plt
    target.bar(positions, counts.values, color = mcolors.TABLEAU_COLORS)
    if ax:
        # Make `ax` current so the pyplot tick call below applies to it.
        plt.sca(ax)

    # Label each bar with the value it counts.
    plt.xticks(positions, counts.keys())
    return counts
    

In [None]:
#sns.countplot(X['embarked'])
countplot(X['embarked'])

In [None]:
#sns.countplot(X['sex'])
countplot(X['sex'])

In [None]:
countplot(y)

In [None]:
# Missing data counts
X.isnull().sum()

**Exercise:** Look at survival statistics based on sex, age, fare, pclass etc. 

Check if they really did "women and children first"

Survived based on gender

In [None]:
ez_df = X.copy()
ez_df['survived'] = y

countplot(ez_df[ez_df['sex']=='male']['survived'])

In [None]:
countplot(ez_df[ez_df['sex']=='female']['survived'])

In [None]:
# Accuracy of the rule "every woman survives, every man dies":
# correctly classified passengers divided by the total number of passengers.
# The total is taken from the frame itself instead of a hard-coded row count,
# so the figure stays right if the data is filtered or reloaded.
female_survivors = (ez_df[ez_df['sex']=='female']['survived']=='1').sum()
male_casualties = (ez_df[ez_df['sex']=='male']['survived']=='0').sum()
baseline_accuracy = (female_survivors + male_casualties) / len(ez_df)
print(baseline_accuracy)

Women's survival rate is higher than men's. What about class?

In [None]:
fig, axs = plt.subplots(1, 3,figsize=(12,6))
for i in range(len(axs)):
    countplot(ez_df[ez_df['pclass']==i+1]['survived'], ax=axs[i])

The survival rate of passengers with a higher-class ticket is higher than that of the others.

In [None]:
fig, axs = plt.subplots(1, 3,figsize=(12,6))
for i in range(len(axs)):
    countplot(ez_df[(ez_df['pclass']==i+1) & (ez_df['sex']=='female')]['survived'], ax=axs[i])

In [None]:
fig, axs = plt.subplots(1, 3,figsize=(12,6))
for i in range(len(axs)):
    countplot(ez_df[(ez_df['pclass']==i+1) & (ez_df['sex']=='male')]['survived'], ax=axs[i])

What about age?

In [None]:
#age_bins = {'0-6':0,'7-12':7,'13-18':13,'18-25':19,'26-40':26,'41-55':41,'55-80':56,'80':80}
age_bins = {'0-12':0,'13-18':13,'18-25':19,'26-40':26,'41-80':41,'80':80}
nums,edges = np.histogram(ez_df['age'],list(age_bins.values()))
plt.bar(range(len(nums)),nums)
plt.xticks(range(len(nums)),list(age_bins.keys())[:-1])

In [None]:
def get_group(x, edges):
    """Return the index of the bin, delimited by ``edges``, that contains ``x``.

    Bin ``i`` covers ``edges[i] <= x < edges[i + 1]``.

    Parameters
    ----------
    x : number
        Value to classify.
    edges : sequence of numbers
        Ascending bin edges.

    Returns
    -------
    int
        ``-1`` if ``x`` lies below the first edge; the bin index otherwise.
        Values at or above the last edge map to the overflow index
        ``len(edges) - 1``, i.e. the index directly after the last real bin.
        NaN never compares below any edge, so it also lands in the overflow
        index.
    """
    for i, edge in enumerate(edges):
        if x < edge:
            # x is below edge i, so it belongs to the bin that starts at edge i-1.
            return i - 1
    # Not below any edge: overflow group, contiguous with the real bin indices.
    return len(edges) - 1
    
ez_df['age group'] = [get_group(x,np.array(list(age_bins.values()),dtype='float')) for x in ez_df['age']]

In [None]:
plt.plot([(ez_df[ez_df['age group']==group]['survived']=='1').sum()/len(ez_df[ez_df['age group']==group]['survived']) for group in range(len(age_bins)-1)])
plt.xticks(range(len(age_bins)-1),list(age_bins.keys())[:-1])
plt.title('Survival rate by age group')
plt.show()

In [None]:
500/1309

The suggested steps (we can change it during the lecture!)

**Lvl 1:**
* sex: to binary, labelencoder is fine
* embarked: missing value as the most frequent port (since we do not have many missing values)
* embarked: to-one-hot

**Lvl 2:**
* overall iterative imputation with random forest regressor

**Lvl 3:**
* pclass: leave as is or min max scaler
* sibsp and parch: log(1+x) (but we could easily remove parch)
* age: standard scaling 
* add feature: age <= 12:0, age >12:1
* fare: log(1+x) then standard scaling

**Classifier:** LogisticRegression, Compare Random Forest and SVM

Let's focus on the usage of pipelines and column transformers instead of the final accuracy for now:

In [None]:
# The first step
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8, stratify=y)

In [None]:
from sklearn.metrics import accuracy_score

def test(clf, Xtrain, Xtest, ytrain, ytest, name = None, refit = True):
    """Fit (optionally) an estimator and print its train and test accuracy.

    Parameters
    ----------
    clf : estimator
        Anything with ``fit`` and ``predict`` (a Pipeline, a search object, a
        bare classifier, ...).
    Xtrain, Xtest : array-like
        Training and test features.
    ytrain, ytest : array-like
        Training and test labels.
    name : str, optional
        Heading printed above the scores. When omitted (or empty), the name of
        the estimator's last pipeline step is used; estimators without
        ``steps`` fall back to their class name.
    refit : bool, default=True
        If True, ``clf`` is fitted on the training data first; pass False to
        score an already fitted estimator.
    """
    if refit:
        clf.fit(Xtrain, ytrain)
    ytrainPred = clf.predict(Xtrain)
    ytestPred = clf.predict(Xtest)
    if name:
        print(name)
    else:
        # Only Pipelines expose `steps`; the final step is the predictor.
        steps = getattr(clf, 'steps', None)
        print(steps[-1][0] if steps else type(clf).__name__)
    print('Train:',accuracy_score(ytrain,ytrainPred))
    print('Test:',accuracy_score(ytest,ytestPred))

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
# Needed since LabelEncoder does not play nicely with pipelines
from sklearn.preprocessing import LabelEncoder

class PipelineLabelEncoder(LabelEncoder):
    """Label-encode every column of a 2-D input, for use in a ColumnTransformer or Pipeline.

    A plain ``LabelEncoder`` works on a single 1-D sequence. Here each column
    of the 2-D input gets its own independent ``LabelEncoder``, so the classes
    learned for one column are never applied to another.

    Attributes
    ----------
    encoders_ : list of LabelEncoder
        One fitted encoder per input column, in column order.
    classes_ : ndarray
        Classes of the last column. Kept so code that reads ``classes_``
        keeps working; use ``encoders_[i].classes_`` for a specific column.
    """

    @staticmethod
    def _column(y, i):
        """Return column ``i`` of a DataFrame or 2-D array as a 1-D sequence."""
        # DataFrames need positional indexing; arrays support plain slicing.
        if isinstance(y, pd.DataFrame):
            return y.iloc[:, i]
        return y[:, i]

    def fit(self, y, *args, **kwargs):
        """Fit one encoder per column of ``y``; extra arguments are ignored."""
        self.encoders_ = [LabelEncoder().fit(self._column(y, i))
                          for i in range(y.shape[1])]
        if self.encoders_:
            self.classes_ = self.encoders_[-1].classes_
        return self

    def fit_transform(self, y, *args, **kwargs):
        """Fit the per-column encoders on ``y`` and return the encoded array."""
        return self.fit(y).transform(y)

    def transform(self, y, *args, **kwargs):
        """Encode each column of ``y`` with the encoder fitted for that column.

        Returns a float array with the same shape as ``y``. Raises
        ``ValueError`` if the number of columns differs from what was fitted.
        """
        if y.shape[1] != len(self.encoders_):
            raise ValueError('Expected %d column(s), got %d'
                             % (len(self.encoders_), y.shape[1]))
        X = np.empty(y.shape)
        for i, encoder in enumerate(self.encoders_):
            X[:,i] = encoder.transform(self._column(y, i))
        return X

In [None]:
# Baseline based on the EDA: Decision Tree CLassifier on just the sex and pclass
# Need to convert sex to binary
dt_ct = ColumnTransformer([('lenc',PipelineLabelEncoder(),['sex']),('pt','passthrough',['pclass'])],remainder='drop')
dt_pipe = Pipeline([('preprocesser',dt_ct),('dt_classifier', DecisionTreeClassifier())])

test(dt_pipe, Xtrain, Xtest, ytrain, ytest, name = 'Decision Tree', refit = True)

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(12,8))
plot_tree(dt_pipe.named_steps['dt_classifier'])
plt.show()

In [None]:
Xtrain.shape

In [None]:
Xtrain.columns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

"""
**Lvl 1:**
* sex: to binary, labelencoder is fine
* embarked: missing value as the most frequent port (since we do not have many missing values)
* embarked: to-one-hot
"""

genderLenc = ('GenderLabelEncoder', PipelineLabelEncoder(), ['sex'])
#genderLenc = ('GenderLabelEncoder', LabelEncoder(), ['sex']) uncomment to see

embarkedInit = ('EmbarkedInit', 
                Pipeline([('EmbarkedImpute', SimpleImputer(strategy='most_frequent')),
                          ('EmbarkedOneHot',OneHotEncoder(handle_unknown='ignore'))]),  ['embarked'])

# passthrough changes the order. Furthermore we lose the pandas dataframe. After this, 
# we will need to keep track of the indices. This is a trade-off of scikit-learn where they 
# value numpy array compatibility over pandas dataframe compatibility but work is being done

# order before: pclass, sex, age, sibsp, parch, fare, embarked
# order after the below: sex, embarked x 3 (1 hot), pclass, age, sibsp, parch, fare
firstLevelCT = ColumnTransformer([genderLenc, embarkedInit], remainder='passthrough')

"""
**Lvl 2:**
* overall iterative imputation
"""

# order after the below: sex, embarked x 3 (1hot), pclass, age, sibsp, parch, fare, i.e., no change
#allImputer = ('AllImpute', 
#               IterativeImputer(estimator=RandomForestRegressor(n_estimators=50), max_iter=10, tol=0.01), 
#               np.arange(9))
#secondLevelCT = ColumnTransformer([allImputer])
#secondLevel = Pipeline([('AllImpute',IterativeImputer(estimator=RandomForestRegressor(n_estimators=50), 
#                                                      max_iter=10, 
#                                                      tol=0.01))])

secondLevel = IterativeImputer(estimator=RandomForestRegressor(n_estimators=50), 
                                                      max_iter=10, 
                                                      tol=0.01)
"""
**Lvl 3:**
* pclass: leave as is or min max scaler
* sibsp: log(1+x)
* parch: drop
* age: standard scaling
* age to binary (<12) ?
* fare: log(1+x) then standard scaling
"""
# Column indices below refer to the output of the first two stages:
# 0: sex, 1-3: embarked (one-hot), 4: pclass, 5: age, 6: sibsp, 7: parch, 8: fare
Log1pTransformer = FunctionTransformer(np.log1p, validate=True)
sibsp = ('CountsLogT', Log1pTransformer, [6])
parch = ('Drop','drop',[7])
ageScaler = ('AgeScaler', StandardScaler(), [5])
fare = ('Fare', Pipeline([('FareLT',Log1pTransformer), ('FareSc',StandardScaler())]), [8])

# order after the below: sibsp, age, fare, sex, embarked x 3 (1hot), pclass
# (parch is dropped, so it does not appear in the output)
thirdLevelCT = ColumnTransformer([sibsp,parch,ageScaler,fare], remainder='passthrough')

MainPipeLine = Pipeline ([('first', firstLevelCT),
                          ('second', secondLevel),
                          ('third', thirdLevelCT)]) 

In [None]:
x1 = firstLevelCT.fit_transform(Xtrain)
#print(x1.shape)
#print(x1)
x2 = secondLevel.fit_transform(x1)
x3 = thirdLevelCT.fit_transform(x2)

print(x1.shape,x2.shape,x3.shape)

In [None]:
x123 = MainPipeLine.fit_transform(Xtrain)
print(x123.shape)
print((x3-x123).sum(axis=1))

# Discrepancy is due to RandomForestRegressor imputation since it introduces some randomness

We can save and load pipelines with the pickle module

In [None]:
import pickle

# A context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('tmp_pp.p','wb') as pickle_file:
    pickle.dump(MainPipeLine, pickle_file)

In [None]:
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The context manager closes the file handle as soon as loading is done.
with open('tmp_pp.p','rb') as pickle_file:
    tmp = pickle.load(pickle_file)
xtmp = tmp.transform(Xtrain)
# The reloaded pipeline is already fitted, so the difference should be zero.
print((xtmp-x123).sum())

In [None]:
LrPipeline  = Pipeline([('main', MainPipeLine), ('lr',  LogisticRegression())])
SvmPipeline = Pipeline([('main', MainPipeLine), ('svm', SVC())])
RfPipeline  = Pipeline([('main', MainPipeLine), ('rf',  RandomForestClassifier())])

# Warning:: MainPipeLine is fitted each time and our iterative imputer is a bit expensive
test(LrPipeline, Xtrain, Xtest, ytrain, ytest)
test(SvmPipeline, Xtrain, Xtest, ytrain, ytest)
test(RfPipeline, Xtrain, Xtest, ytrain, ytest)

In [None]:
# Faster Alternative if the preprocessing steps are the same:
XtrainPreProc = MainPipeLine.fit_transform(Xtrain)
XtestPreProc = MainPipeLine.transform(Xtest)

# Then call the LogisticRegression, SVC and RandomForestClassifier fit using XtrainPreProc
dt2 = DecisionTreeClassifier()

dt2.fit(XtrainPreProc, ytrain)
ytrainDt = dt2.predict(XtrainPreProc)
ytestDt = dt2.predict(XtestPreProc)
print('Train:',accuracy_score(ytrain,ytrainDt))
print('Test:',accuracy_score(ytest,ytestDt))

#Worse than the first baseline??

What about hyper parameter search?

In [None]:
LrPipeline.steps[0][1].steps

In [None]:
from sklearn.model_selection import StratifiedKFold
# With grid search, does not change the result all that much
cv = StratifiedKFold(n_splits=3, shuffle=True)

param_grid = {'main__second__estimator__n_estimators': [25,50,100],
              'lr__C':[1,5,10]}

gsLr = GridSearchCV(LrPipeline, param_grid, cv = cv)
test(gsLr, Xtrain, Xtest, ytrain, ytest,'lr with grid')

In [None]:
# Try the other pipelines at home


Let's switch to FeatureUnion with a familiar data

In [None]:
# Familiar data
xP = np.linspace(0.1,7,100)    

#Adding uniform noise
yP = np.log(xP) + np.sin(xP) + np.random.uniform(-0.5,0.5,len(xP))

yGT = np.log(xP) + np.sin(xP)
             
XP = xP[:,np.newaxis]

plt.plot(xP, yP,'.')

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

stepsPoly = [('poly', PolynomialFeatures(4)), 
             ('lr', LinearRegression()) ]

pipePoly = Pipeline(stepsPoly)

pipePoly.fit(XP,yP)
yPredP = pipePoly.predict(XP)

plt.plot(xP,yPredP,'r')
plt.plot(xP,yGT,'k--')
plt.scatter(xP,yP)
plt.legend(['Polynomial Fit','Noiseless GT','Data'])
plt.show()

In [None]:
pipePoly.named_steps['lr'].coef_

In [None]:
a = PolynomialFeatures(3)
a.fit_transform(XP).shape

In [None]:
a.transform([[1],[2],[3]])

In [None]:
# What if we want to add more features?

from sklearn.pipeline import FeatureUnion

featUn = FeatureUnion([('poly', PolynomialFeatures(3)),
                       ('log', FunctionTransformer(np.log))])
featUn.fit_transform(XP).shape

In [None]:
featUn.transform([[1],[2],[3]])

In [None]:
stepsfeatUn = [('featUn', featUn), 
               ('lr', LinearRegression()) ]

pipefeatUn = Pipeline(stepsfeatUn)

pipefeatUn.fit(XP,yP)
yPredF = pipefeatUn.predict(XP)

plt.plot(xP,yPredF,'r')
plt.plot(xP,yPredP,'b')
plt.plot(xP,yGT,'k--')
plt.scatter(xP,yP)
plt.legend(['Polynomial(3)+Log Fit','Polynomial(4) Fit','Noiseless GT','Data'])
plt.show()

In [None]:
featUn2 = FeatureUnion([('sin', FunctionTransformer(np.sin)),
                        ('log', FunctionTransformer(np.log))])
stepsfeatUn2 = [('featUn', featUn2), 
                ('lr', LinearRegression()) ]

pipefeatUn2 = Pipeline(stepsfeatUn2)

pipefeatUn2.fit(XP,yP)
yPredF2 = pipefeatUn2.predict(XP)

plt.plot(xP,yPredF2,'r')
plt.plot(xP,yPredF,'g')
plt.plot(xP,yPredP,'b')
plt.plot(xP,yGT,'k--')
plt.scatter(xP,yP)
plt.legend(['Sine+Log Fit','Polynomial(3)+Log Fit','Polynomial(4) Fit','Noiseless GT','Data'])
plt.show()


In [None]:
featUn2.transform([[1],[2],[3]])

In [None]:
pipefeatUn2.named_steps['lr'].coef_

In [None]:
pipefeatUn.named_steps['lr'].coef_

Why don't we combine multiple features and do feature selection with Lasso regression? Let's do this!

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# We are now extending the required classes
class GaussianRbfFeaturesPipeline(BaseEstimator,TransformerMixin):
    """Expand the input into Gaussian radial-basis-function (RBF) features.

    ``fit`` places ``num_centers`` centers evenly between the smallest and the
    largest value seen in the training data and derives one shared width from
    the spacing between neighbouring centers. ``transform`` returns one column
    per center.

    Parameters
    ----------
    num_centers : int, default=10
        Number of RBF centers (= number of output features).
    width_constant : float, default=1.0
        Multiplier applied to the center spacing to obtain the RBF width.

    Attributes
    ----------
    centers_ : ndarray of shape (num_centers,)
        Center locations chosen in ``fit``.
    widths_ : float
        Shared RBF width chosen in ``fit``.

    Notes
    -----
    The constructor arguments are stored under their own names, so the
    ``get_params`` / ``set_params`` inherited from ``BaseEstimator`` work as
    they are. That includes setting a single parameter, which is what a
    hyperparameter search does. ``fit_transform`` is inherited from
    ``TransformerMixin``.
    """
    def __init__(self, num_centers=10, width_constant=1.0):
        self.num_centers = num_centers
        self.width_constant = width_constant

    # Read-only aliases for the short names used previously.
    @property
    def k(self):
        return self.num_centers

    @property
    def h(self):
        return self.width_constant

    @staticmethod
    def _rbf(x,c,h):
        # x: (n_samples, n_features, 1), c: (n_centers,). Summing over axis 1
        # folds the feature dimension, leaving (n_samples, n_centers).
        return np.exp(-np.sum(((x-c)/h)**2, axis=1))

    def fit(self,X,y=None):
        """Choose the centers and the shared width from the range of ``X``."""
        X = np.asarray(X, dtype=float)
        self.centers_ = np.linspace(X.min(), X.max(), self.num_centers)
        # With a single center or constant data there is no spacing to use;
        # fall back to unit spacing instead of failing or dividing by zero.
        spacing = self.centers_[1]-self.centers_[0] if self.num_centers > 1 else 0.0
        self.widths_ = self.width_constant*(spacing if spacing > 0 else 1.0)
        return self

    def transform(self,X):
        """Return the RBF activations, shape ``(n_samples, num_centers)``."""
        X = np.asarray(X, dtype=float)
        return self._rbf(X[:, :, np.newaxis], self.centers_, self.widths_)

In [None]:
many_feats = FeatureUnion([('poly', PolynomialFeatures(3)),('rbf',GaussianRbfFeaturesPipeline(8,3))])

steps_many_feats = [('feats', many_feats), 
                    ('lr', LinearRegression())]

pipe_mf = Pipeline(steps_many_feats)
pipe_mf.fit(XP,yP)

yPred_mf = pipe_mf.predict(XP)

plt.plot(xP,yPred_mf,'r')
plt.plot(xP,yGT,'k--')
plt.scatter(xP,yP)
plt.legend(['Poly+RBF','Noiseless GT','Data'])
plt.show()

In [None]:
print(pipe_mf.named_steps['lr'].coef_.shape) #3 degree + 1 bias + 8 rbf = 12

print(pipe_mf.named_steps['lr'].coef_ )
print(pipe_mf.named_steps['lr'].intercept_ )

In [None]:
from sklearn.linear_model import Lasso

many_feats = FeatureUnion([('poly', PolynomialFeatures(3)),('rbf',GaussianRbfFeaturesPipeline(8,3))])
steps_many_feats = [('feats', many_feats), 
                    ('lasso', Lasso(alpha=0.001,max_iter=50000))]

pipe_mf = Pipeline(steps_many_feats)
pipe_mf.fit(XP,yP)

yPred_mf = pipe_mf.predict(XP)

plt.plot(xP,yPred_mf,'r')
plt.plot(xP,yGT,'k--')
plt.scatter(xP,yP)
plt.legend(['Poly+RBF','Noiseless GT','Data'])
plt.show()

In [None]:
print(pipe_mf.named_steps['lasso'].coef_.shape) #3 degree + 1 bias + 8 rbf = 12

print(pipe_mf.named_steps['lasso'].coef_ )
print(pipe_mf.named_steps['lasso'].intercept_ )

Note: We can use feature union, pipelines and column transformers within each other! (e.g. parallel pipelines). This is called "composing" and these features will be incorporated within the compose submodule of scikit-learn in the near future.

**Back to Titanic Dataset**

So let's add the age feature now

In [None]:
"""
**Lvl 3:**
* pclass: leave as is or min max scaler
* sibsp: log(1+x)
* parch: drop
* age: standard scaling
* age to binary (<12) (yes this time!)
* fare: log(1+x) then standard scaling
"""

def age_bin(x, thresh = 13):
    """Flag entries that are strictly below ``thresh``.

    Parameters
    ----------
    x : array-like
        Numeric values (e.g. ages) of any shape.
    thresh : number, default=13
        Exclusive upper bound for the flag.

    Returns
    -------
    ndarray of float
        Same shape as ``x``; ``1.0`` where ``x < thresh`` and ``0.0``
        elsewhere. NaN compares as not-below, so it maps to ``0.0``.
    """
    # One vectorized comparison replaces the per-column loop.
    return (np.asarray(x) < thresh).astype(float)

interMediateAgeFeature = FunctionTransformer(age_bin, validate=True)

interMediateFeatUn = FeatureUnion([('age_scaler',StandardScaler()),
                                   ('age_bin',interMediateAgeFeature)])


# Column indices below refer to the output of the first two stages:
# 0: sex, 1-3: embarked (one-hot), 4: pclass, 5: age, 6: sibsp, 7: parch, 8: fare
Log1pTransformer = FunctionTransformer(np.log1p, validate=True)
sibsp = ('CountsLogT', Log1pTransformer, [6])
parch = ('Drop','drop',[7])
fare = ('Fare', Pipeline([('FareLT',Log1pTransformer), ('FareSc',StandardScaler())]), [8])

# order after the below: age_scale, age_bin, sibsp, fare, sex, embarked x 3 (1hot), pclass
# (parch is dropped; age is replaced by its scaled and binary versions)
thirdLevelCT = ColumnTransformer([('Age Stuff', interMediateFeatUn, [5]),sibsp,parch,fare], remainder='passthrough')

MainPipeLine2 = Pipeline ([('first', firstLevelCT),
                          ('second', secondLevel),
                          ('third', thirdLevelCT)]) 

In [None]:
tmpPL = Pipeline ([('first', firstLevelCT), ('second', secondLevel)])

xtmp = tmpPL.fit_transform(Xtrain)

In [None]:
print(xtmp.shape)
# Apply only the age FeatureUnion to column 5 (age after the first two stages);
# the remaining columns are dropped, so the output is [scaled age, age_bin flag].
interMediateCT = ColumnTransformer([('Age Stuff', interMediateFeatUn, [5])])
xtmp2 = interMediateCT.fit_transform(xtmp)
print(xtmp2.shape)
# Column 1 is the binary "below threshold" age feature.
xtmp2[:,1]

In [None]:
LrPipeline2  = Pipeline([('main', MainPipeLine2), ('lr',  LogisticRegression())])

test(LrPipeline2, Xtrain, Xtest, ytrain, ytest)