# Playground for Data Leakage

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime

# Stamp the notebook with the wall-clock time of this run.
run_started_at = datetime.datetime.now()
print(run_started_at)

2022-07-19 11:35:04.150636


In [2]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.3 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [3]:
import sklearn
# Show the installed scikit-learn version so the outputs below can be reproduced.
sklearn.__version__

'1.0.2'

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; several cells below rely on this to show two frames at once.
InteractiveShell.ast_node_interactivity = "all"

# Disable pandas' chained-assignment warning (this does not change assignment
# behaviour, it only silences the warning).
# NOTE(review): later cells add columns to frames obtained by slicing X with
# .iloc; this setting is what keeps those writes quiet.
pd.options.mode.chained_assignment = None 

# Load Data

In [5]:
# Toy credit data for 20 customers. The first 15 rows are later used as the
# training split and the last 5 as the test split.
customer_ids = list(range(1, 21))
salaries = [
    1169.0, 5951, 2096, 7882, 4870,
    9055, 2835, 6948, 3059, 5234,
    6854, 2255, 7965, 7365, 2006,
    10365, 1006, 4532, 6395, 7551,
]
provinces = ['ON'] * 5 + ['BC'] * 5 + ['QB'] * 5 + ['ON', 'ON', 'BC', 'BC', 'QB']
defaults = [
    0, 1, 1, 1, 1,
    0, 0, 1, 1, 1,
    0, 0, 0, 0, 1,
    0, 1, 0, 1, 1,
]

train_df = pd.DataFrame({
    'ID': customer_ids,
    'Salary': salaries,
    'Province': provinces,
    'Default': defaults,
})

# Features are every column except the target; the target is 'Default'.
X = train_df.drop(columns=['Default'])
y = train_df['Default']
train_df

Unnamed: 0,ID,Salary,Province,Default
0,1,1169.0,ON,0
1,2,5951.0,ON,1
2,3,2096.0,ON,1
3,4,7882.0,ON,1
4,5,4870.0,ON,1
5,6,9055.0,BC,0
6,7,2835.0,BC,0
7,8,6948.0,BC,1
8,9,3059.0,BC,1
9,10,5234.0,BC,1


In [6]:
# Manually create a train/test split by position, just for illustration:
# the first N_TRAIN rows are "train", the remaining rows are "test".
N_TRAIN = 15

# .copy() gives each split its own data. Later cells add columns to X_train and
# X_test; without the copy those writes target slices of X and are only silent
# because the chained-assignment warning is disabled in the config cell.
X_train = X.iloc[:N_TRAIN, :].copy()
y_train = y.iloc[:N_TRAIN].copy()

X_test = X.iloc[N_TRAIN:, :].copy()
y_test = y.iloc[N_TRAIN:].copy()

# Premature Featurization

## Standard Scaling - Prematurely

In [7]:
from sklearn.preprocessing import StandardScaler

# Deliberately the WRONG order: the scaler is fitted on every row of X, so the
# learned mean/variance already include the rows that will become the test set.
scaler = StandardScaler().fit(X[['Salary']])

salary_std_all_rows = scaler.transform(X[['Salary']])
X['Salary_Std'] = salary_std_all_rows
X[['ID', 'Salary', 'Salary_Std']]

# Only now would the data be split, e.g.:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.510705
1,2,5951.0,0.251014
2,3,2096.0,-1.169193
3,4,7882.0,0.962406
4,5,4870.0,-0.147234
5,6,9055.0,1.394547
6,7,2835.0,-0.89694
7,8,6948.0,0.618315
8,9,3059.0,-0.814417
9,10,5234.0,-0.013134


In [8]:
# Mean and standard deviation (square root of the variance) learned by the
# scaler that was fitted on all rows of X.
fitted_mean = scaler.mean_
fitted_std = np.sqrt(scaler.var_)
print(fitted_mean)
print(fitted_std)

[5269.65]
[2714.39411794]


## Standard Scaling - The Right Way

In [9]:
# Correct order: split first (done manually in an earlier cell), e.g.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ... then learn the scaling statistics from the training rows only.
scaler = StandardScaler().fit(X_train[['Salary']])

# Apply the train-fitted scaler to both splits.
train_salary_std = scaler.transform(X_train[['Salary']])
test_salary_std = scaler.transform(X_test[['Salary']])
X_train['Salary_Std'] = train_salary_std
X_test['Salary_Std'] = test_salary_std

X_train[['ID', 'Salary', 'Salary_Std']]
X_test[['ID', 'Salary', 'Salary_Std']]

Unnamed: 0,ID,Salary,Salary_Std
0,1,1169.0,-1.533589
1,2,5951.0,0.362743
2,3,2096.0,-1.165981
3,4,7882.0,1.128494
4,5,4870.0,-0.065934
5,6,9055.0,1.593654
6,7,2835.0,-0.872926
7,8,6948.0,0.75811
8,9,3059.0,-0.784098
9,10,5234.0,0.078412


Unnamed: 0,ID,Salary,Salary_Std
15,16,10365.0,2.113143
16,17,1006.0,-1.598228
17,18,4532.0,-0.19997
18,19,6395.0,0.538814
19,20,7551.0,0.997234


In [10]:
# Mean and standard deviation (square root of the variance) learned by the
# scaler that was fitted on X_train only.
fitted_mean = scaler.mean_
fitted_std = np.sqrt(scaler.var_)
print(fitted_mean)
print(fitted_std)

[5036.26666667]
[2521.70961761]


## Target Encoding - Prematurely

In [11]:
import category_encoders as ce

# Deliberately the WRONG order: the encoder is fitted on every row of X and y,
# so the per-province target statistics include the future test rows.
enc = ce.target_encoder.TargetEncoder(
    min_samples_leaf=1,
    smoothing=0,
    return_df=True,
).fit(X[['Province']], y)

province_te_all_rows = enc.transform(X[['Province']])
X['Province_TE'] = province_te_all_rows
X[['ID', 'Province', 'Province_TE']]

  import pandas.util.testing as tm


Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.714286
1,2,ON,0.714286
2,3,ON,0.714286
3,4,ON,0.714286
4,5,ON,0.714286
5,6,BC,0.571429
6,7,BC,0.571429
7,8,BC,0.571429
8,9,BC,0.571429
9,10,BC,0.571429


## Target Encoding - The Right Way

In [12]:
import category_encoders as ce

# Correct order: fit the encoder on the training rows and training target only.
enc = ce.target_encoder.TargetEncoder(
    min_samples_leaf=1,
    smoothing=0,
    return_df=True,
).fit(X_train[['Province']], y_train)

# Encode the training split with the train-fitted encoder ...
train_province_te = enc.transform(X_train[['Province']])
X_train['Province_TE'] = train_province_te
X_train[['ID', 'Province', 'Province_TE']]

# ... and reuse the same fitted encoder, unchanged, on the test split.
test_province_te = enc.transform(X_test[['Province']])
X_test['Province_TE'] = test_province_te
X_test[['ID', 'Province', 'Province_TE']]



Unnamed: 0,ID,Province,Province_TE
0,1,ON,0.8
1,2,ON,0.8
2,3,ON,0.8
3,4,ON,0.8
4,5,ON,0.8
5,6,BC,0.6
6,7,BC,0.6
7,8,BC,0.6
8,9,BC,0.6
9,10,BC,0.6


Unnamed: 0,ID,Province,Province_TE
15,16,ON,0.8
16,17,ON,0.8
17,18,BC,0.6
18,19,BC,0.6
19,20,QB,0.2


# Demonstration with Modeling

In [271]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [300]:
# Course dataset (generated_german.csv), read straight from the course repository.
GERMAN_CREDIT_URL = 'https://raw.githubusercontent.com/stepthom/869_course/main/data/generated_german.csv'

df = pd.read_csv(GERMAN_CREDIT_URL)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 58 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   UserID                                  1000 non-null   object 
 1   FirstName                               1000 non-null   object 
 2   LastName                                1000 non-null   object 
 3   DateOfBirth                             1000 non-null   object 
 4   Sex                                     1000 non-null   object 
 5   Street                                  1000 non-null   object 
 6   City                                    1000 non-null   object 
 7   LicensePlate                            1000 non-null   object 
 8   Married                                 1000 non-null   float64
 9   NumberPets                              1000 non-null   float64
 10  Duration                                1000 non-null   int64

Unnamed: 0,UserID,FirstName,LastName,DateOfBirth,Sex,Street,City,LicensePlate,Married,NumberPets,...,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified,EmploymentDuration,SavingsAccountBonds,BadCredit
0,218-84-8180,Christopher,Gray,1953-09-02,M,503 Linda Locks,North Judithbury,395C,0.0,0.0,...,0,1,0,0,0,1,0,7.0,281.0,0
1,643-21-6917,Jennifer,Rocha,1999-09-30,F,42388 Burgess Meadow Suite 532,East Jill,012 PCY,1.0,0.0,...,0,1,0,0,0,1,0,2.0,0.0,1
2,520-14-4890,Kyle,Cruz,1973-03-01,M,480 Erin Plain Suite 514,West Michael,7-F0482,0.0,2.0,...,0,1,0,0,1,0,0,4.0,0.0,0
3,081-11-7963,Ryan,Romero,1975-10-17,M,52880 Burns Creek,North Judithbury,30Z J39,0.0,1.0,...,0,0,1,0,0,1,0,4.0,0.0,0
4,463-16-4062,Robert,Spence,1969-05-15,M,78248 Brandt Plains,Ramirezstad,3-46578,0.0,0.0,...,0,0,1,0,0,1,0,2.0,0.0,1


In [301]:
# Each of these groups arrives as several indicator columns named
# '<Group>.<Level>' (e.g. 'Housing.Rent', 'Housing.Own'). Collapse every group
# back into one categorical column whose value is the label of the column
# holding the row-wise maximum.
ONE_HOT_GROUPS = [
    'Purpose', 'Property', 'Job', 'Housing', 'OtherDebtorsGuarantors',
    'CreditHistory', 'CheckingAccountStatus', 'OtherInstallmentPlans',
]

for group in ONE_HOT_GROUPS:
    # Match on the '<Group>.' prefix rather than on a substring: a substring
    # test can pick up unrelated columns and, when the cell is re-run, matches
    # the collapsed column itself.
    cols = [c for c in df.columns if c.startswith(group + '.')]
    if not cols:
        # Nothing left to collapse (group absent, or cell already executed).
        continue
    df[group] = df[cols].idxmax(axis=1)
    df = df.drop(columns=cols)

# Store these columns as object dtype so the later dtype-based column
# selection places them with the categorical features.
df = df.astype({'Married': 'object', 'OwnCar': 'object', 'ForeignWorker': 'object'})

df.head()

Unnamed: 0,UserID,FirstName,LastName,DateOfBirth,Sex,Street,City,LicensePlate,Married,NumberPets,...,SavingsAccountBonds,BadCredit,Purpose,Property,Job,Housing,OtherDebtorsGuarantors,CreditHistory,CheckingAccountStatus,OtherInstallmentPlans
0,218-84-8180,Christopher,Gray,1953-09-02,M,503 Linda Locks,North Judithbury,395C,0.0,0.0,...,281.0,0,Purpose.Radio.Television,Property.RealEstate,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.lt.0,OtherInstallmentPlans.None
1,643-21-6917,Jennifer,Rocha,1999-09-30,F,42388 Burgess Meadow Suite 532,East Jill,012 PCY,1.0,0.0,...,0.0,1,Purpose.Radio.Television,Property.RealEstate,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.0.to.200,OtherInstallmentPlans.None
2,520-14-4890,Kyle,Cruz,1973-03-01,M,480 Erin Plain Suite 514,West Michael,7-F0482,0.0,2.0,...,0.0,0,Purpose.Education,Property.RealEstate,Job.UnskilledResident,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.none,OtherInstallmentPlans.None
3,081-11-7963,Ryan,Romero,1975-10-17,M,52880 Burns Creek,North Judithbury,30Z J39,0.0,1.0,...,0.0,0,Purpose.Furniture.Equipment,Property.Insurance,Job.SkilledEmployee,Housing.ForFree,OtherDebtorsGuarantors.Guarantor,CreditHistory.PaidDuly,CheckingAccountStatus.lt.0,OtherInstallmentPlans.None
4,463-16-4062,Robert,Spence,1969-05-15,M,78248 Brandt Plains,Ramirezstad,3-46578,0.0,0.0,...,0.0,1,Purpose.NewCar,Property.Unknown,Job.SkilledEmployee,Housing.ForFree,OtherDebtorsGuarantors.None,CreditHistory.Delay,CheckingAccountStatus.lt.0,OtherInstallmentPlans.None


In [302]:
from sklearn.model_selection import train_test_split

# Separate the target (BadCredit) from the feature columns.
y0 = df['BadCredit']
X0 = df.drop(columns=['BadCredit'])

# To illustrate how bad an improper estimate is, hold out 10% of the rows
# (stratified on the target) to play the role of "future, real, new" data,
# and call it "secret".
# NOTE: X and y are rebound here; they no longer refer to the toy data above.
split_parts = train_test_split(X0, y0, test_size=0.1, stratify=y0, random_state=99)
X, X_secret, y, y_secret = split_parts

In [275]:
# Inspect the full target column (BadCredit), before any rows are held out.
y0

0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: BadCredit, Length: 1000, dtype: int64

In [276]:
# Class counts of the target over the whole dataset.
y0.value_counts()

0    700
1    300
Name: BadCredit, dtype: int64

In [277]:
# Class counts in the portion kept for modelling (the split was stratified on the target).
y.value_counts()

0    630
1    270
Name: BadCredit, dtype: int64

In [278]:
# Class counts in the held-out "secret" portion.
y_secret.value_counts()

0    70
1    30
Name: BadCredit, dtype: int64

In [283]:
# Every object-dtype column is a candidate categorical feature ...
all_cat_cols = list(X.select_dtypes('object').columns)

# ... except these identifier-like fields, which are excluded from modelling.
excluded_object_cols = ('UserID', 'FirstName', 'LastName', 'DateOfBirth', 'Street')
cat_cols = [col for col in all_cat_cols if col not in excluded_object_cols]
cat_cols

['Sex',
 'City',
 'LicensePlate',
 'Married',
 'OwnCar',
 'ForeignWorker',
 'Purpose',
 'Property',
 'Job',
 'Housing',
 'OtherDebtorsGuarantors',
 'CreditHistory',
 'CheckingAccountStatus',
 'OtherInstallmentPlans']

In [284]:
# Numeric features are all columns of X that are not object dtype
# (original column order is preserved).
object_col_lookup = set(all_cat_cols)
num_cols = [col for col in X.columns if col not in object_col_lookup]
num_cols

['NumberPets',
 'Duration',
 'Amount',
 'InstallmentRatePercentage',
 'ResidenceDuration',
 'NumberExistingCredits',
 'NumberPeopleMaintenance',
 'EmploymentDuration',
 'SavingsAccountBonds']

# Improper Way

In [285]:
# Work on independent deep copies for the improper version, so the original
# X / X_secret stay untouched for the proper version further down.
_X, _X_secret = (frame.copy(deep=True) for frame in (X, X_secret))

In [286]:
# Preview the copy that the improper workflow will transform in place.
_X.head()

Unnamed: 0,UserID,FirstName,LastName,DateOfBirth,Sex,Street,City,LicensePlate,Married,NumberPets,...,EmploymentDuration,SavingsAccountBonds,Purpose,Property,Job,Housing,OtherDebtorsGuarantors,CreditHistory,CheckingAccountStatus,OtherInstallmentPlans
385,243-64-8334,Amy,Woodward,1996-10-15,F,04615 Robert Islands,Ramirezstad,MD0 3399,1.0,2.0,...,3.0,0.0,Purpose.Radio.Television,Property.CarOther,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.none,OtherInstallmentPlans.None
714,841-61-8923,Robert,Forbes,1995-04-20,M,687 Jennifer Summit,New Roberttown,957JLC,0.0,0.0,...,6.0,0.0,Purpose.NewCar,Property.Unknown,Job.Management.SelfEmp.HighlyQualified,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.0.to.200,OtherInstallmentPlans.None
536,031-94-2608,Jamie,Ruiz,1945-11-03,F,884 Heidi Radial,East Jill,9LS 610,1.0,2.0,...,,42.0,Purpose.NewCar,Property.Insurance,Job.Management.SelfEmp.HighlyQualified,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.lt.0,OtherInstallmentPlans.None
94,207-69-4443,Evan,Bailey,1966-10-30,M,1986 Cardenas Trail Apt. 404,North Judithbury,473-CLO,0.0,2.0,...,15.0,1062.0,Purpose.NewCar,Property.RealEstate,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.0.to.200,OtherInstallmentPlans.None
517,564-11-5807,Tiffany,Ray,1997-08-28,F,71130 Lopez Burgs,East Jill,V41-QOL,1.0,0.0,...,0.0,0.0,Purpose.Furniture.Equipment,Property.Insurance,Job.SkilledEmployee,Housing.Rent,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.none,OtherInstallmentPlans.None


In [287]:
from sklearn.impute import SimpleImputer

# Improper on purpose: both imputers are fitted on ALL modelling rows before
# cross-validation, so every CV fold has already influenced the fill values.

# Numeric features: fill missing values with the column mean.
imp_num = SimpleImputer(strategy="mean").fit(_X[num_cols])
_X[num_cols] = imp_num.transform(_X[num_cols])

# Categorical features: fill missing values with the most frequent level.
imp_cat = SimpleImputer(strategy="most_frequent").fit(_X[cat_cols])
_X[cat_cols] = imp_cat.transform(_X[cat_cols])

In [288]:
import category_encoders as ce

# Improper on purpose: the encoder sees the target for ALL modelling rows
# before cross-validation, so each CV validation fold leaks into its features.
#
# The original cell first built a TargetEncoder and immediately overwrote it;
# only the WOEEncoder was ever used, so the dead assignment is removed.
# NOTE(review): the "Proper Way" pipeline uses TargetEncoder(min_samples_leaf=2,
# smoothing=0.01), so the two sections do not share an encoder - confirm
# whether that difference is intended.
enc = ce.woe.WOEEncoder(sigma=5.05, regularization=0.01, return_df=True)
_X[cat_cols] = enc.fit_transform(_X[cat_cols], y)

In [289]:
from sklearn.preprocessing import StandardScaler

# Improper on purpose: scaling statistics come from ALL modelling rows,
# computed before cross-validation splits them into folds.
scaler = StandardScaler().fit(_X[num_cols])
_X[num_cols] = scaler.transform(_X[num_cols])

In [290]:
# Columns that are neither a categorical nor a numeric feature are removed
# before modelling.
feature_col_lookup = set(cat_cols) | set(num_cols)
drop_cols = [col for col in X.columns if col not in feature_col_lookup]
drop_cols
_X = _X.drop(columns=drop_cols)

['UserID', 'FirstName', 'LastName', 'DateOfBirth', 'Street']

In [291]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, f1_score

# Model under evaluation. The original cell first built a
# RandomForestClassifier(random_state=3, class_weight="balanced",
# n_estimators=1500, n_jobs=5) and immediately overwrote it; only the decision
# tree was ever used, so the dead assignment is removed.
clf = DecisionTreeClassifier(random_state=4)

# 10-fold CV on data that was imputed, encoded and scaled using ALL rows:
# every validation fold has already leaked into the features, so this
# estimate is expected to be optimistic.
cv_scores = cross_val_score(clf, _X, y, scoring="f1_macro", cv=10)
print(cv_scores)
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))

[0.60205392 0.6484375  0.61224187 0.60205392 0.58397535 0.6
 0.6534018  0.66572575 0.71920963 0.72509091]
Estimated Score: 0.641 +/- 0.05


In [292]:
# Fit the final model on the entire preprocessed modelling frame.
clf.fit(_X, y)

DecisionTreeClassifier(random_state=4)

In [293]:
# Start from a fresh copy of the raw held-out rows so this cell can be re-run:
# transforming _X_secret in place a second time would re-apply every step to
# already-transformed values, and the final drop would fail on missing columns.
_X_secret = X_secret.copy(deep=True)

# Apply the already-fitted transformers in the same order used for _X:
# impute -> encode categoricals -> scale numerics -> drop unused columns.
_X_secret[num_cols] = imp_num.transform(_X_secret[num_cols])
_X_secret[cat_cols] = imp_cat.transform(_X_secret[cat_cols])
_X_secret[cat_cols] = enc.transform(_X_secret[cat_cols])
_X_secret[num_cols] = scaler.transform(_X_secret[num_cols])
_X_secret = _X_secret.drop(drop_cols, axis=1)

# Compare the cross-validated estimate with the score on truly unseen rows.
preds = clf.predict(_X_secret)
print(classification_report(y_secret, preds))
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))
print("Actual Score:    {:0.3f}".format(f1_score(y_secret, preds, average="macro")))

              precision    recall  f1-score   support

           0       0.70      0.73      0.71        70
           1       0.30      0.27      0.28        30

    accuracy                           0.59       100
   macro avg       0.50      0.50      0.50       100
weighted avg       0.58      0.59      0.58       100

Estimated Score: 0.641 +/- 0.05
Actual Score:    0.497


# Proper Way

In [294]:
# The proper workflow starts again from the untouched, untransformed X.
X.head()

Unnamed: 0,UserID,FirstName,LastName,DateOfBirth,Sex,Street,City,LicensePlate,Married,NumberPets,...,EmploymentDuration,SavingsAccountBonds,Purpose,Property,Job,Housing,OtherDebtorsGuarantors,CreditHistory,CheckingAccountStatus,OtherInstallmentPlans
385,243-64-8334,Amy,Woodward,1996-10-15,F,04615 Robert Islands,Ramirezstad,MD0 3399,1.0,2.0,...,3.0,0.0,Purpose.Radio.Television,Property.CarOther,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.none,OtherInstallmentPlans.None
714,841-61-8923,Robert,Forbes,1995-04-20,M,687 Jennifer Summit,New Roberttown,957JLC,0.0,0.0,...,6.0,0.0,Purpose.NewCar,Property.Unknown,Job.Management.SelfEmp.HighlyQualified,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.0.to.200,OtherInstallmentPlans.None
536,031-94-2608,Jamie,Ruiz,1945-11-03,F,884 Heidi Radial,East Jill,9LS 610,1.0,2.0,...,,42.0,Purpose.NewCar,Property.Insurance,Job.Management.SelfEmp.HighlyQualified,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.lt.0,OtherInstallmentPlans.None
94,207-69-4443,Evan,Bailey,1966-10-30,M,1986 Cardenas Trail Apt. 404,North Judithbury,473-CLO,0.0,2.0,...,15.0,1062.0,Purpose.NewCar,Property.RealEstate,Job.SkilledEmployee,Housing.Own,OtherDebtorsGuarantors.None,CreditHistory.PaidDuly,CheckingAccountStatus.0.to.200,OtherInstallmentPlans.None
517,564-11-5807,Tiffany,Ray,1997-08-28,F,71130 Lopez Burgs,East Jill,V41-QOL,1.0,0.0,...,0.0,0.0,Purpose.Furniture.Equipment,Property.Insurance,Job.SkilledEmployee,Housing.Rent,OtherDebtorsGuarantors.None,CreditHistory.Critical,CheckingAccountStatus.none,OtherInstallmentPlans.None


In [304]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model under evaluation. The original cell first built a
# RandomForestClassifier(random_state=3, class_weight="balanced",
# n_estimators=1500, n_jobs=5) and immediately overwrote it; only the decision
# tree was ever used, so the dead assignment is removed.
clf = DecisionTreeClassifier(random_state=4)

# Numeric features: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imp', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler()),
    ])

# Categorical features: fill with the most frequent level, then target-encode.
# NOTE(review): the "Improper Way" section encodes with WOEEncoder, not
# TargetEncoder - confirm whether that difference is intended.
categorical_transformer = Pipeline(steps=[
      ('imp', SimpleImputer(strategy="most_frequent")),
      ('encoder', ce.target_encoder.TargetEncoder(min_samples_leaf=2, smoothing=0.01, return_df=True)),
      ])

# Route each column group to its transformer; columns in neither list are dropped.
preprocessor2 = Pipeline(steps=[
      ('ct', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols),
        ],
            remainder = 'drop', 
            sparse_threshold=0)),
    ])

# Preprocessing and model live in ONE estimator, so cross_val_score refits every
# imputer / encoder / scaler on the training part of each fold only.
pipe2 = Pipeline(steps=[('preprocessor', preprocessor2),  ('clf', clf)])

cv_scores = cross_val_score(pipe2, X, y, scoring="f1_macro", cv=10)
print(cv_scores)
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(np.mean(cv_scores), np.std(cv_scores)))

[0.59299039 0.72956731 0.53354812 0.625      0.55876173 0.57671958
 0.64386659 0.72956731 0.61842849 0.66572575]
Estimated Score: 0.627 +/- 0.06


In [305]:
# Refit the whole pipeline (preprocessing steps and model together) on all of X, y.
pipe2 = pipe2.fit(X, y)

In [306]:
# The fitted pipeline takes the raw held-out rows directly and applies its own
# preprocessing before predicting.
preds = pipe2.predict(X_secret)
secret_report = classification_report(y_secret, preds)
print(secret_report)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80        70
           1       0.50      0.33      0.40        30

    accuracy                           0.70       100
   macro avg       0.62      0.60      0.60       100
weighted avg       0.68      0.70      0.68       100



In [307]:
# Cross-validated estimate versus the score on the held-out "secret" rows.
estimated_mean, estimated_spread = np.mean(cv_scores), np.std(cv_scores)
actual_macro_f1 = f1_score(y_secret, preds, average="macro")
print("Estimated Score: {:0.3f} +/- {:0.2f}".format(estimated_mean, estimated_spread))
print("Actual Score:    {:0.3f}".format(actual_macro_f1))

Estimated Score: 0.627 +/- 0.06
Actual Score:    0.600
