In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer

In [2]:
# Fetch seaborn's "diamonds" example dataset and preview its first rows.
df = sns.load_dataset(name='diamonds')
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Select Features and Target

In [3]:
# Name of the response column and of the predictor columns.
target_col = 'price'
feature_cols = ['carat', 'depth', 'table']

# Split the frame into the feature matrix X and the target vector y.
X, y = df[feature_cols], df[target_col]

In [4]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,carat,depth,table
0,0.23,61.5,55.0
1,0.21,59.8,61.0
2,0.23,56.9,65.0
3,0.29,62.4,58.0
4,0.31,63.3,58.0


In [5]:
# Preview the first five target values.
y.head(5)

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

# Pipelines
## Preparation

## Linear Regression on original data (with three features)

In [7]:
# make_pipeline is a convenient shortcut for building a Pipeline.
# - The final stage of a pipeline must provide the fit AND predict methods.
# - Every intermediate stage must provide the fit AND transform methods.

# A pipeline with a single stage: the regressor itself.
pipeline = make_pipeline(LinearRegression())
model = pipeline.fit(X, y)

# Compare the fitted model's predictions with the true prices.
predictions = model.predict(X)
print("Predictions:")
print(predictions)
print("Original values")
print(y.values)

# Score of the fitted model, evaluated on the training data itself.
model.score(X, y)

Predictions:
[-236.0805007  -762.99080215 -585.12110662 ... 2738.57048722 4477.14475352
 3744.61472164]
Original values
[ 326  326  327 ... 2757 2757 2757]


0.8536762884061408

## Linear Regression with pre-processing: standard-scaled features

In [8]:
target_col = 'price'
feature_cols = ['carat', 'depth', 'table']

# Two stages: the scaler is a preparatory step (it needs fit and transform),
# the regressor is the final step.
pipeline = make_pipeline(StandardScaler(), LinearRegression())
model = pipeline.fit(X, y)

# Score of the fitted model, evaluated on the training data itself.
model.score(X, y)

0.8536762884061408

## Adding encoders to the pipeline
### Using a ColumnTransformer to apply the encoder only to categorical columns

In [10]:
# 'clarity' is added to the three predictors used so far.
feature_cols = ['carat', 'depth', 'table', 'clarity']
X = df[feature_cols]  # rebuild the feature matrix so it includes the new column

# The column transformer must be told which columns go to which encoder / scaler:
# columns whose dtype compares equal to 'float' get a StandardScaler, all
# remaining columns are treated as categorical and get an OrdinalEncoder.
numerical_features = X.dtypes == 'float'
categorical_features = ~numerical_features

In [11]:
# Display the boolean mask: True marks the columns whose dtype compared equal to 'float'.
numerical_features

carat       True
depth       True
table       True
clarity    False
dtype: bool

In [13]:
# Route each column group through its own preprocessing step, then fit the
# regression on the combined result.
# make_column_transformer expects (transformer, columns) tuples; the older
# (columns, transformer) order is rejected by current scikit-learn releases.
pipeline = make_pipeline(
    make_column_transformer(
        (OrdinalEncoder(), categorical_features),  # encode the categorical columns as integers
        (StandardScaler(), numerical_features),    # standardize the float columns
    ),
    LinearRegression()
)

model = pipeline.fit(X, y)

# Score of the fitted model, evaluated on the training data itself.
model.score(X, y)

0.8690083427882824

### Create your own estimator (=stage in the pipeline)

#### ClarityEncoder: map "clarity" to correctly ordered ordinal values
The advantage of this solution is that the pipeline can be applied to any "raw" DataFrame and includes all necessary preprocessing steps.

##### Explanation of the concept

In [17]:
# Before developing the transformer, look at how the values of a numpy array
# can be mapped through a dictionary.
# (numpy is imported as `np` in the import cell at the top of the notebook.)
np_array = np.array(['SI2', 'VS1', 'VS1', 'I1', 'IF', 'IF'])

# The mapping we want to use (written by hand): clarity grade -> integer code.
mapping_clarity = {
    'SI2': 6,
    'SI1': 5,
    'VS1': 3,
    'VS2': 4,
    'VVS2': 2,
    'VVS1': 1,
    'I1': 7,
    'IF': 0,
}

# dict.get returns the value stored for the given key:
print(mapping_clarity.get('SI2'))
print(mapping_clarity.get('IF'))

6
0
