In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer

In [2]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Select Features and Target

In [3]:
feature_cols = ['carat', 'depth', 'table']
target_col = 'price'

X = df[feature_cols]
y = df[target_col]

In [4]:
X.head()

Unnamed: 0,carat,depth,table
0,0.23,61.5,55.0
1,0.21,59.8,61.0
2,0.23,56.9,65.0
3,0.29,62.4,58.0
4,0.31,63.3,58.0


In [5]:
y[:5]

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

# Pipelines
## Preparation

## Linear Regression on original data (with three features)

In [6]:
# we can use make_pipeline to conveniently create pipelines
# the last stage of a pipeline must provide the fit AND predict methods
# intermediate stages of a pipeline must provide the fit AND tranfsorm methodes

# this is a pipeline consisting of only one stage
pipeline = make_pipeline(
    LinearRegression()
)

model = pipeline.fit(X, y) 
print("Predictions:")
print(model.predict(X))
print("Original values")
print(y.values)

model.score(X, y)

Predictions:
[-236.0805007  -762.99080215 -585.12110662 ... 2738.57048722 4477.14475352
 3744.61472164]
Original values
[ 326  326  327 ... 2757 2757 2757]


0.8536762884061408

## Linear Regression with pre-processing: standard-scaled features

In [7]:
feature_cols = ['carat', 'depth', 'table']
target_col = 'price'

pipeline = make_pipeline(
    StandardScaler(),  # preparatory steps require fit and transform
    LinearRegression()
)

model = pipeline.fit(X, y)

model.score(X, y)

0.8536762884061408

## Adding encoders to the pipeline
### Using a ColumnTransformer to apply the encoder only to categorical columns

In [8]:
feature_cols = ['carat', 'depth', 'table', 'clarity']  # 'clarity' is the new column
X = df[feature_cols]  # necessary, since we have a further feature

# for the column transformer we need to define which columns should use which encoder / scaler
# we want to apply a StandardScaler on all numerical columns and an OrdinalEncoder for labeling categorical columns
numerical_features = df[feature_cols].dtypes == 'float'
categorical_features = ~numerical_features

In [10]:
X.head()

Unnamed: 0,carat,depth,table,clarity
0,0.23,61.5,55.0,SI2
1,0.21,59.8,61.0,SI1
2,0.23,56.9,65.0,VS1
3,0.29,62.4,58.0,VS2
4,0.31,63.3,58.0,SI2


In [9]:
numerical_features

carat       True
depth       True
table       True
clarity    False
dtype: bool

In [11]:
pipeline = make_pipeline(
    make_column_transformer((categorical_features, OrdinalEncoder()),
                            (numerical_features, StandardScaler())
                           ),
    LinearRegression()
)

model = pipeline.fit(X, y)

model.score(X, y)

0.8690083427882824

### Create your own estimator (=stage in the pipeline)

#### ClarityEncoder: map "clarity" to correctly ordinally scaled data
The advantage of this solution is that the pipeline can be applied to any "raw" DataFrame and includes all necessary preprocessing steps.

##### Explanation of the Concept

In [12]:
# before starting to develop the transformer, we take a look at how to map
# numpy arrays using dictionaries!
np_array = np.array(['SI2', 'VS1', 'VS1', 'I1', 'IF', 'IF'])

# this is the mapping we want to use
mapping_clarity = {'SI2':6, 'SI1':5, 'VS1':3, 'VS2':4, 'VVS2':2, 'VVS1':1, 'I1':7, 'IF':0} # manually generated

# get returns the value for the provided key:
print(mapping_clarity.get('SI2'))
print(mapping_clarity.get('IF'))

# we cannot apply the get-method on the whole array in numpy
#mapping_clarity.get(np_array)  # does not work

# BUT: we can vectorize any function and prepare it to be applied to a numpy array
vectorized_get = np.vectorize(mapping_clarity.get)

# now we can call it on the array
vectorized_get(np_array)

# --> this is the idea: we want to develop a transformer that applies the mapping to the column clarity

NameError: name 'np' is not defined

In [13]:
# let's start with a transformer that does "nothin"
# we need to implement the fit and transform methods

class NothingHappensTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self  # an instance to the transformer itself has to be returned
    
    def transform(self, X):
        return X  # we do not change the incoming column X

In [14]:
# use the NothingHappensTransformer
feature_cols = ['carat', 'depth', 'table']
target_col = 'price'
X = df[feature_cols]  # necessary, since we have a further feature

pipeline = make_pipeline(
    StandardScaler(),
    NothingHappensTransformer(),
    LinearRegression()
)

model = pipeline.fit(X, y)

model.score(X, y)

0.8536762884061408

##### Actual ClarityEncoding

In [15]:
# This class is our own transformation encoder which creates the mapping of "clarity" (and back)
import numpy as np

class ClarityEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        mapping_clarity = {'SI2':6, 'SI1':5, 'VS1':3, 'VS2':4, 'VVS2':2, 'VVS1':1, 'I1':7, 'IF':0} # manually generated
        return np.vectorize(mapping_clarity.get)(X)            

#### DebugStage: just print the first 5 rows of the array in any stage of the pipeline (for debug purposes)

In [16]:
class DebugStage(BaseEstimator, TransformerMixin):
    """ Class for debug purposes - show the first five rows of the feature matrix in an
    arbitrary stage of a pipeline
    """

    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X):
        print(X[:1,:])  # print the first row
        return X

In [17]:
feature_cols = ['carat', 'depth', 'table', 'clarity']  
X = df[feature_cols]

# we need to distinguish which columns are float (--> should be standard-scaled)
numerical_features = df[feature_cols].dtypes == 'float'
categorical_features = ~numerical_features
numerical_features

carat       True
depth       True
table       True
clarity    False
dtype: bool

In [20]:
print(X.head(1))

pipeline = make_pipeline(
    make_column_transformer((categorical_features, ClarityEncoder()),  # encode the only feature column that is categorical
                            (numerical_features, FunctionTransformer())  # the FunctionTransformer without arguments does "nothing"
                           ),
    DebugStage(),
    StandardScaler(),
    DebugStage(),
    LinearRegression()
)

print("fit (uses the pipeline with fit in the end)")
pipeline.fit(X, y)

print("score (uses the pipeline with predict in the end)")
pipeline.score(X, y)

   carat  depth  table clarity
0   0.23   61.5   55.0     SI2
fit (uses the pipeline with fit in the end)
[[ 6.    0.23 61.5  55.  ]]
[[ 1.24521508 -1.19816781 -0.17409151 -1.09967199]]
score (uses the pipeline with predict in the end)
[[ 6.    0.23 61.5  55.  ]]
[[ 1.24521508 -1.19816781 -0.17409151 -1.09967199]]




0.8872302850614864

In [79]:
pipeline.predict(np.array([[0.7, 61.5, 56.0, 'SI1']]))

[['0.7' '61.5' '56.0' 'SI1']]
[['5' '0.7' '61.5' '56.0']]
[[ 0.63809506 -0.20662095 -0.17409151 -0.65213854]]




array([2716.33020603])

In [80]:
df.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.5
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.7,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
