# Playground for Data Leakage

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime

# Stamp the notebook with the wall-clock time of this run.
print(f"{datetime.datetime.now()}")

2021-11-11 19:42:56.379827


In [2]:
pip install category_encoders



In [3]:
import sklearn
# Record the installed scikit-learn version next to the outputs of this run.
sklearn.__version__

'0.22.2.post1'

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one, so a single cell can render several tables.
InteractiveShell.ast_node_interactivity = "all"

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook. Note this hides the warning only; it does not change how the
# assignments themselves behave.
pd.options.mode.chained_assignment = None 

In [5]:
# Scratch check: `+` on two lists concatenates them into one list.
[5, 6, 7] + [8, 9]

[5, 6, 7, 8, 9]

# Load Data

In [6]:
# Toy credit data set: 20 customers, each with a salary, a province and a
# binary default flag.
train_df = pd.DataFrame(
    data={
        'ID': list(range(1, 21)),
        'Salary': [
            1169.0, 5951, 2096, 7882, 4870,
            9055, 2835, 6948, 3059, 5234,
            6854, 2255, 7965, 7365, 2006,
            10365, 1006, 4532, 6395, 7551,
        ],
        'Province': ['ON'] * 5 + ['BC'] * 5 + ['QB'] * 5 + ['ON', 'ON', 'BC', 'BC', 'QB'],
        'Default': [
            0, 1, 1, 1, 1,
            0, 0, 1, 1, 1,
            0, 0, 0, 0, 1,
            0, 1, 0, 1, 1,
        ],
    }
)

# Features are every column except the target; the target is the default flag.
X = train_df.drop(columns=['Default'])
y = train_df['Default']
train_df

Unnamed: 0,ID,Salary,Province,Default
0,1,1169.0,ON,0
1,2,5951.0,ON,1
2,3,2096.0,ON,1
3,4,7882.0,ON,1
4,5,4870.0,ON,1
5,6,9055.0,BC,0
6,7,2835.0,BC,0
7,8,6948.0,BC,1
8,9,3059.0,BC,1
9,10,5234.0,BC,1


In [7]:
# Manually create a train/test split, just for illustration: the first
# N_TRAIN rows form the training set and the remaining rows form the test set.
#
# Each split is an explicit .copy() so that columns added to X_train / X_test
# later are written to independent frames rather than to slices of X (the
# pattern pandas reports as SettingWithCopyWarning).
N_TRAIN = 15

X_train = X.iloc[:N_TRAIN, :].copy()
y_train = y.iloc[:N_TRAIN].copy()

X_test = X.iloc[N_TRAIN:, :].copy()
y_test = y.iloc[N_TRAIN:].copy()

# Premature Featurization

## Standard Scaling - Prematurely

In [16]:
from sklearn.preprocessing import StandardScaler

# Deliberately leaky: the scaler is fitted on every row of X before any
# train/test split, so its mean and variance are computed from all rows.
scaler = StandardScaler()
X['Salary_Std'] = scaler.fit_transform(X[['Salary']])

X.loc[:, ['ID', 'Salary', 'Salary_Std']]

# ... and only now would the data be split:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.510705
1,2,5951.0,0.251014
2,3,2096.0,-1.169193
3,4,7882.0,0.962406
4,5,4870.0,-0.147234
5,6,9055.0,1.394547
6,7,2835.0,-0.89694
7,8,6948.0,0.618315
8,9,3059.0,-0.814417
9,10,5234.0,-0.013134


In [17]:
# Parameters held by the fitted scaler: the mean, then the standard deviation
# (square root of the fitted variance), one per line.
print(scaler.mean_, np.sqrt(scaler.var_), sep="\n")

[5269.65]
[2714.39411794]


## Standard Scaling - The Right Way

In [23]:
# Step 1 - split first ...
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2 - fit the scaler on the training rows only.
scaler = StandardScaler().fit(X_train[['Salary']])

# Step 3 - apply that one fitted scaler to both splits.
X_train['Salary_Std'] = scaler.transform(X_train[['Salary']])
X_test['Salary_Std'] = scaler.transform(X_test[['Salary']])

# Show the training rows, then the test rows.
X_train.loc[:, ['ID', 'Salary', 'Salary_Std']]
X_test.loc[:, ['ID', 'Salary', 'Salary_Std']]

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.533589
1,2,5951.0,0.362743
2,3,2096.0,-1.165981
3,4,7882.0,1.128494
4,5,4870.0,-0.065934
5,6,9055.0,1.593654
6,7,2835.0,-0.872926
7,8,6948.0,0.75811
8,9,3059.0,-0.784098
9,10,5234.0,0.078412


Unnamed: 0,ID,Salary,Salary_Std
15,16,10365.0,2.113143
16,17,1006.0,-1.598228
17,18,4532.0,-0.19997
18,19,6395.0,0.538814
19,20,7551.0,0.997234


In [19]:
# Mean and standard deviation (square root of the variance) held by the most
# recently fitted scaler, one per line.
print(scaler.mean_, np.sqrt(scaler.var_), sep="\n")

[5036.26666667]
[2521.70961761]


## Target Encoding - Prematurely

In [20]:
import category_encoders as ce

# Deliberately leaky: the encoder is fitted on all rows of X together with all
# of y, before any train/test split.
enc = ce.target_encoder.TargetEncoder(
    smoothing=0,
    min_samples_leaf=1,
    return_df=True,
).fit(X[['Province']], y)

X['Province_TE'] = enc.transform(X[['Province']])
X.loc[:, ['ID', 'Province', 'Province_TE']]

Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.714286
1,2,ON,0.714286
2,3,ON,0.714286
3,4,ON,0.714286
4,5,ON,0.714286
5,6,BC,0.571429
6,7,BC,0.571429
7,8,BC,0.571429
8,9,BC,0.571429
9,10,BC,0.571429


## Target Encoding - The Right Way

In [21]:
import category_encoders as ce

# Fit the encoder on the training rows and training target only ...
enc = ce.target_encoder.TargetEncoder(
    smoothing=0,
    min_samples_leaf=1,
    return_df=True,
).fit(X_train[['Province']], y_train)

# ... then apply that one fitted encoder to both splits.
X_train['Province_TE'] = enc.transform(X_train[['Province']])
X_test['Province_TE'] = enc.transform(X_test[['Province']])

# Show the training rows, then the test rows.
X_train.loc[:, ['ID', 'Province', 'Province_TE']]
X_test.loc[:, ['ID', 'Province', 'Province_TE']]

Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.8
1,2,ON,0.8
2,3,ON,0.8
3,4,ON,0.8
4,5,ON,0.8
5,6,BC,0.6
6,7,BC,0.6
7,8,BC,0.6
8,9,BC,0.6
9,10,BC,0.6


Unnamed: 0,ID,Province,Province_TE
15,16,ON,0.8
16,17,ON,0.8
17,18,BC,0.6
18,19,BC,0.6
19,20,QB,0.2
