Data leakage happens when information from test data sneaks (or leaks) into your training data during data preparation steps. This often happens during routine data processing tasks without you noticing it. When this happens, the model learns from test data it wasn’t supposed to see, making the test results misleading.

Data leakage is a common problem in machine learning that occurs when data the model is not supposed to see (such as test data or future data) is accidentally used to train it. The resulting evaluation scores are overly optimistic, and the model may overfit and perform poorly on new, unseen data.

In [None]:
!pip install -U -q imbalanced-learn

#### Data Preprocessing + Classification (with leakage)

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Toy weather dataset: 28 rows, four features and the 'Play' label.
dataset_dict = {
    'Outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'rain', 'overcast', 'sunny', 'sunny', 'rain', 'sunny', 'overcast', 'overcast', 'rain', 'sunny', 'overcast', 'rain', 'sunny', 'sunny', 'rain', 'overcast', 'rain', 'sunny', 'overcast', 'sunny', 'overcast', 'rain', 'overcast'],
    'Temperature': [85.0, 80.0, 83.0, 70.0, 68.0, 65.0, 64.0, 72.0, 69.0, 75.0, 75.0, 72.0, 81.0, 71.0, 81.0, 74.0, 76.0, 78.0, 82.0, 67.0, 85.0, 73.0, 88.0, 77.0, 79.0, 80.0, 66.0, 84.0],
    'Humidity': [85.0, 90.0, 78.0, 96.0, 80.0, 70.0, 65.0, 95.0, 70.0, 80.0, 70.0, 90.0, 75.0, 80.0, 88.0, 92.0, 85.0, 75.0, 92.0, 90.0, 85.0, 88.0, 65.0, 70.0, 60.0, 95.0, 70.0, 78.0],
    'Wind': [False, True, False, False, False, True, True, False, False, False, True, True, False, True, True, False, False, True, False, True, True, False, True, False, False, True, False, False],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
}
# Build the frame and turn the boolean Wind flag into 0/1 integers.
df = pd.DataFrame(dataset_dict).astype({'Wind': int})
X = df.drop(columns='Play')
y = df['Play']


def _binned_numeric_steps():
    """Return a fresh mean-impute -> standardize -> 4-bin ordinal discretizer pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal')),
    ])


# Per-column transformers; Temperature and Humidity each get their own
# instance of the same impute/scale/bin recipe.
outlook_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
wind_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=False)),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('temp_transform', _binned_numeric_steps(), ['Temperature']),
    ('humid_transform', _binned_numeric_steps(), ['Humidity']),
    ('outlook_transform', outlook_encoder, ['Outlook']),
    ('wind_transform', wind_steps, ['Wind']),
])

# DELIBERATE LEAKAGE (this cell is the "what not to do" example):
# the preprocessor is fitted on every row and SMOTE resamples every row
# *before* the train/test split, so rows that end up in the test set have
# already influenced the fitted transformers and the oversampling.
X_transformed = preprocessor.fit_transform(X)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_transformed, y)

# Only now is the (already transformed and resampled) data split in half,
# without shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.5,
    shuffle=False,
)

# Train a decision tree on the contaminated training half.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print(f"Testing Accuracy (with leakage): {accuracy_score(y_test, clf.predict(X_test)):.2%}")

Testing Accuracy (with leakage): 77.78%


#### Data Preprocessing + Classification (without leakage)

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Toy weather dataset: 28 rows, four features and the 'Play' label.
dataset_dict = {
    'Outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'rain', 'overcast', 'sunny', 'sunny', 'rain', 'sunny', 'overcast', 'overcast', 'rain', 'sunny', 'overcast', 'rain', 'sunny', 'sunny', 'rain', 'overcast', 'rain', 'sunny', 'overcast', 'sunny', 'overcast', 'rain', 'overcast'],
    'Temperature': [85.0, 80.0, 83.0, 70.0, 68.0, 65.0, 64.0, 72.0, 69.0, 75.0, 75.0, 72.0, 81.0, 71.0, 81.0, 74.0, 76.0, 78.0, 82.0, 67.0, 85.0, 73.0, 88.0, 77.0, 79.0, 80.0, 66.0, 84.0],
    'Humidity': [85.0, 90.0, 78.0, 96.0, 80.0, 70.0, 65.0, 95.0, 70.0, 80.0, 70.0, 90.0, 75.0, 80.0, 88.0, 92.0, 85.0, 75.0, 92.0, 90.0, 85.0, 88.0, 65.0, 70.0, 60.0, 95.0, 70.0, 78.0],
    'Wind': [False, True, False, False, False, True, True, False, False, False, True, True, False, True, True, False, False, True, False, True, True, False, True, False, False, True, False, False],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
}
# Build the frame and turn the boolean Wind flag into 0/1 integers.
df = pd.DataFrame(dataset_dict).astype({'Wind': int})
X = df.drop(columns='Play')
y = df['Play']

# Split the RAW data first, in half and without shuffling, so nothing
# below is ever fitted on the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
)


def _binned_numeric_steps():
    """Return a fresh mean-impute -> standardize -> 4-bin ordinal discretizer pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal')),
    ])


# Per-column transformers; Temperature and Humidity each get their own
# instance of the same impute/scale/bin recipe.
outlook_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
wind_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=False)),
    ('scaler', StandardScaler()),
])
column_steps = [
    ('temp_transform', _binned_numeric_steps(), ['Temperature']),
    ('humid_transform', _binned_numeric_steps(), ['Humidity']),
    ('outlook_transform', outlook_encoder, ['Outlook']),
    ('wind_transform', wind_steps, ['Wind']),
]

# Preprocessing, SMOTE and the classifier are chained in one pipeline, so a
# single fit() call on the training split fits every step on that split only.
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(transformers=column_steps)),
    ('smote', SMOTE(random_state=42, k_neighbors=3)),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Fit pipeline on training data only
pipeline.fit(X_train, y_train)

print(f"Training Accuracy: {accuracy_score(y_train, pipeline.predict(X_train)):.2%}")
print(f"Testing Accuracy: {accuracy_score(y_test, pipeline.predict(X_test)):.2%}")

Training Accuracy: 100.00%
Testing Accuracy: 85.71%
