# Playground for Data Leakage

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2022-07-18 20:53:16.305124


In [2]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import sklearn
sklearn.__version__

'1.0.2'

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# disable chained assignments
pd.options.mode.chained_assignment = None 

# Load Data

In [5]:
train_df = pd.DataFrame(
    data={
      'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + [16, 17, 18, 19, 20], 
      'Salary': [1169.,	5951,	2096,	7882,	4870,	9055,	2835,	6948,	3059,	5234, 6854, 2255, 7965, 7365, 2006] + [10365, 1006, 4532, 6395, 7551],
      'Province': ['ON', 'ON', 'ON', 'ON', 'ON', 'BC', 'BC', 'BC', 'BC', 'BC', 'QB', 'QB', 'QB', 'QB', 'QB'] + ['ON', 'ON', 'BC', 'BC', 'QB'],
      'Default': [0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1] + [0, 1, 0, 1, 1],
    })


X = train_df.drop(['Default'], axis=1)
y = train_df['Default']
train_df

Unnamed: 0,ID,Salary,Province,Default
0,1,1169.0,ON,0
1,2,5951.0,ON,1
2,3,2096.0,ON,1
3,4,7882.0,ON,1
4,5,4870.0,ON,1
5,6,9055.0,BC,0
6,7,2835.0,BC,0
7,8,6948.0,BC,1
8,9,3059.0,BC,1
9,10,5234.0,BC,1


In [6]:
# Manually create some train_test split, just for illustration
X_train = X.iloc[0:15,:]
y_train = y[:15]

X_test = X.iloc[15:,:]
y_test = y[15:]

# Premature Featurization

## Standard Scaling - Prematurely

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler=scaler.fit(X[['Salary']])

X['Salary_Std'] = scaler.transform(X[['Salary']])
X[['ID', 'Salary', 'Salary_Std']]

# Now  split ...
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.510705
1,2,5951.0,0.251014
2,3,2096.0,-1.169193
3,4,7882.0,0.962406
4,5,4870.0,-0.147234
5,6,9055.0,1.394547
6,7,2835.0,-0.89694
7,8,6948.0,0.618315
8,9,3059.0,-0.814417
9,10,5234.0,-0.013134


In [8]:
print(scaler.mean_)
print(np.sqrt(scaler.var_))

[5269.65]
[2714.39411794]


## Standard Scaling - The Right Way

In [9]:
# First  split ...
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler=scaler.fit(X_train[['Salary']])

X_train['Salary_Std'] = scaler.transform(X_train[['Salary']])
X_test['Salary_Std'] = scaler.transform(X_test[['Salary']])


X_train[['ID', 'Salary', 'Salary_Std']]
X_test[['ID', 'Salary', 'Salary_Std']]

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.533589
1,2,5951.0,0.362743
2,3,2096.0,-1.165981
3,4,7882.0,1.128494
4,5,4870.0,-0.065934
5,6,9055.0,1.593654
6,7,2835.0,-0.872926
7,8,6948.0,0.75811
8,9,3059.0,-0.784098
9,10,5234.0,0.078412


Unnamed: 0,ID,Salary,Salary_Std
15,16,10365.0,2.113143
16,17,1006.0,-1.598228
17,18,4532.0,-0.19997
18,19,6395.0,0.538814
19,20,7551.0,0.997234


In [10]:
print(scaler.mean_)
print(np.sqrt(scaler.var_))

[5036.26666667]
[2521.70961761]


## Target Encoding - Prematurely

In [11]:
import category_encoders as ce

enc = ce.target_encoder.TargetEncoder(min_samples_leaf=1, smoothing=0, return_df=True)
enc = enc.fit(X[['Province']], y)

X['Province_TE'] = enc.transform(X[['Province']])
X[['ID', 'Province', 'Province_TE']]

  import pandas.util.testing as tm


Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.714286
1,2,ON,0.714286
2,3,ON,0.714286
3,4,ON,0.714286
4,5,ON,0.714286
5,6,BC,0.571429
6,7,BC,0.571429
7,8,BC,0.571429
8,9,BC,0.571429
9,10,BC,0.571429


## Target Encoding - The Right Way

In [12]:
import category_encoders as ce

enc = ce.target_encoder.TargetEncoder(min_samples_leaf=1, smoothing=0, return_df=True)
enc = enc.fit(X_train[['Province']], y_train)

X_train['Province_TE'] = enc.transform(X_train[['Province']])
X_train[['ID', 'Province', 'Province_TE']]

X_test['Province_TE'] = enc.transform(X_test[['Province']])
X_test[['ID', 'Province', 'Province_TE']]



Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.8
1,2,ON,0.8
2,3,ON,0.8
3,4,ON,0.8
4,5,ON,0.8
5,6,BC,0.6
6,7,BC,0.6
7,8,BC,0.6
8,9,BC,0.6
9,10,BC,0.6


Unnamed: 0,ID,Province,Province_TE
15,16,ON,0.8
16,17,ON,0.8
17,18,BC,0.6
18,19,BC,0.6
19,20,QB,0.2


# Demonstration with Modeling

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
df = pd.read_csv('https://raw.githubusercontent.com/stepthom/869_course/main/data/bank.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [15]:
from sklearn.model_selection import train_test_split

X0 = df.drop('y', axis=1)
y0 = df['y']
y0 = y0.map({"no":0, "yes":1})

# To illustrate how bad an improper estimate is, let's hold out some data
# to use as "future, real, new" data, and call it "secret"

X, X_secret, y, y_secret = train_test_split(X0, y0, test_size=0.1, random_state=99)

In [16]:
y0

0       0
1       0
2       0
3       0
4       0
       ..
4516    0
4517    0
4518    0
4519    0
4520    0
Name: y, Length: 4521, dtype: int64

In [17]:
cat_cols = X.select_dtypes('object').columns.tolist()
cat_cols

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [18]:
num_cols = [c for c in X.columns.tolist() if c not in cat_cols]
num_cols

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Improper Way

In [39]:
# Make a copy of X for the improper version
_X = X.copy(deep=True)
_X_secret = X_secret.copy(deep=True)

In [40]:
_X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
143,31,management,single,tertiary,no,874,yes,no,cellular,18,nov,190,2,-1,0,unknown
457,25,services,single,secondary,no,402,no,yes,cellular,7,jul,264,1,-1,0,unknown
4298,26,admin.,single,secondary,no,3529,no,yes,cellular,7,sep,57,2,-1,0,unknown
1403,29,blue-collar,married,secondary,no,912,yes,no,cellular,13,may,785,1,-1,0,unknown
3744,26,admin.,single,secondary,no,321,yes,no,cellular,22,jul,242,7,-1,0,unknown


In [41]:
import category_encoders as ce

enc = ce.target_encoder.TargetEncoder(min_samples_leaf=10, smoothing=0.01, return_df=True)
enc = enc.fit(_X[cat_cols], y)

_X[cat_cols] = enc.transform(_X[cat_cols])

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler = scaler.fit(_X[num_cols])

_X[num_cols] = scaler.transform(_X[num_cols])

In [43]:
_X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
143,-0.961928,0.134638,0.135814,0.141089,0.114586,-0.183275,0.086303,0.124674,0.141711,0.261784,0.098592,-0.284965,-0.255032,-0.407969,-0.328076,0.090363
457,-1.530889,0.093583,0.135814,0.105441,0.114586,-0.338322,0.152386,0.058158,0.141711,-1.077911,0.078704,-0.000627,-0.572592,-0.407969,-0.328076,0.090363
4298,-1.436063,0.120092,0.135814,0.105441,0.114586,0.688866,0.152386,0.058158,0.141711,-1.077911,0.357143,-0.796005,-0.255032,-0.407969,-0.328076,0.090363
1403,-1.151582,0.067995,0.099002,0.105441,0.114586,-0.170792,0.086303,0.124674,0.141711,-0.347169,0.069323,2.001266,-0.572592,-0.407969,-0.328076,0.090363
3744,-1.436063,0.120092,0.135814,0.105441,0.114586,-0.36493,0.086303,0.124674,0.141711,0.748945,0.078704,-0.08516,1.33277,-0.407969,-0.328076,0.090363


In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, f1_score

clf = RandomForestClassifier(random_state=3, class_weight="balanced", n_estimators=1500, n_jobs=5)

cv_scores = cross_val_score(clf, _X, y, scoring="f1_macro", cv=10)
print(cv_scores)
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))

[0.66302634 0.61590597 0.63010166 0.63336479 0.58098833 0.56126482
 0.65813439 0.59569252 0.65717228 0.68981249]
Estimated Score: 0.629 +/- 0.04


In [25]:
clf.fit(_X, y)

RandomForestClassifier(class_weight='balanced', n_estimators=1500, n_jobs=5,
                       random_state=3)

In [26]:
_X_secret[cat_cols] = enc.transform(_X_secret[cat_cols])
_X_secret[num_cols] = scaler.transform(_X_secret[num_cols])

preds = clf.predict(_X_secret)
print(classification_report(y_secret, preds))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       398
           1       0.75      0.22      0.34        55

    accuracy                           0.90       453
   macro avg       0.83      0.60      0.64       453
weighted avg       0.88      0.90      0.87       453



In [27]:
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))
print("Actual Score:    {:0.3f}".format(f1_score(y_secret, preds, average="macro")))

Estimated Score: 0.629 +/- 0.04
Actual Score:    0.641


# Proper Way

In [28]:
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
143,31,management,single,tertiary,no,874,yes,no,cellular,18,nov,190,2,-1,0,unknown
457,25,services,single,secondary,no,402,no,yes,cellular,7,jul,264,1,-1,0,unknown
4298,26,admin.,single,secondary,no,3529,no,yes,cellular,7,sep,57,2,-1,0,unknown
1403,29,blue-collar,married,secondary,no,912,yes,no,cellular,13,may,785,1,-1,0,unknown
3744,26,admin.,single,secondary,no,321,yes,no,cellular,22,jul,242,7,-1,0,unknown


In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

clf = RandomForestClassifier(random_state=3, class_weight="balanced", n_estimators=1500, n_jobs=5)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ])

categorical_transformer = Pipeline(steps=[
      ('encoder', ce.target_encoder.TargetEncoder(min_samples_leaf=1, smoothing=0, return_df=True)),
      ])

preprocessor2 = Pipeline(steps=[
      ('ct', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols),
        ],
            remainder = 'passthrough', 
            sparse_threshold=0)),
    ])

pipe2 = Pipeline(steps=[('preprocessor', preprocessor2),  ('clf', clf)])

cv_scores = cross_val_score(pipe2, X, y, scoring="f1_macro", cv=10)
print(cv_scores)
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))



[0.6437117  0.63010166 0.59844715 0.62691667 0.61590597 0.56364522
 0.65813439 0.58098833 0.67389558 0.68567742]
Estimated Score: 0.628 +/- 0.04


In [30]:
pipe2 = pipe2.fit(X, y)



In [31]:
preds = pipe2.predict(X_secret)
print(classification_report(y_secret, preds))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       398
           1       0.71      0.18      0.29        55

    accuracy                           0.89       453
   macro avg       0.81      0.59      0.62       453
weighted avg       0.88      0.89      0.86       453



In [32]:
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))
print("Actual Score:    {:0.3f}".format(f1_score(y_secret, preds, average="macro")))

Estimated Score: 0.628 +/- 0.04
Actual Score:    0.616
