# How to Avoid Data Leakage When Performing Data Preparation

Author: Jason Brownlee

Article from [machinelearningmastery](https://machinelearningmastery.com/data-preparation-without-data-leakage/).

> Note: In this notebook, I am studying the article mentioned above. Some changes may have been made to the code during its implementation.

# Library

In [25]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Data preparation with train and test sets

## Define dataset

In [2]:
# Synthetic classification dataset: 1,000 rows and 20 features
# (15 informative, 5 redundant), generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

## Summarize the dataset

In [3]:
# Print the dimensions of the feature matrix and of the target vector.
dataset_shapes = (X.shape, y.shape)
print(*dataset_shapes)

(1000, 20) (1000,)


# Train-test evaluation with naive data preparation

## Normalize the dataset

In [5]:
# Naive approach (intentional for this demonstration): the scaler is fitted
# on every row before the train/test split, so the rows that later become
# the test set contribute to the min/max used for scaling.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

## Split into train and test sets

In [6]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

## Fit the model

In [7]:
# Train a logistic regression classifier on the training portion and show
# the fitted estimator as the cell output.
model = LogisticRegression().fit(X_train, y_train)
model

## Evaluate the model

In [8]:
# Predict labels for the held-out test rows.
yhat = model.predict(X=X_test)

## Evaluate predictions

In [9]:
# Compare predictions with the true test labels and report the score as a
# percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
accuracy_pct = accuracy * 100
print('Accuracy: %.3f' % accuracy_pct)

Accuracy: 84.848


# Train-test evaluation with correct data preparation

## Define dataset

In [10]:
# Regenerate the dataset with the same parameters and seed so this section
# starts again from unscaled data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

## Split into train and test sets

In [11]:
# Split first (33% test, fixed seed) so that scaling can be fitted on the
# training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

## Fit on the training dataset

In [12]:
# Create a fresh scaler for this section so the cell does not depend on the
# scaler object created in the naive section, then fit it on the training
# rows only so that no test-set values are used for the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(X_train)

## Scale the training dataset

In [13]:
# Rescale the training rows with the scaler fitted on the training data.
# NOTE: this overwrites X_train, so re-running this cell on its own would
# transform data that has already been scaled.
X_train = scaler.transform(X=X_train)

## Scale the test dataset

In [14]:
# Apply the training-fitted scaling to the test rows; the scaler is not
# refitted here.
# NOTE: this overwrites X_test, so re-running this cell on its own would
# transform data that has already been scaled.
X_test = scaler.transform(X=X_test)

## Fit the model

In [15]:
# Train a new logistic regression classifier on the scaled training rows and
# show the fitted estimator as the cell output.
model = LogisticRegression().fit(X_train, y_train)
model

## Evaluate the model

In [16]:
# Predict labels for the scaled test rows.
yhat = model.predict(X=X_test)

## Evaluate predictions

In [17]:
# Score the predictions against the true test labels and print the result
# as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
accuracy_pct = accuracy * 100
print('Accuracy: %.3f' % accuracy_pct)

Accuracy: 85.455


# Cross-validation evaluation with naive data preparation

## Normalize the dataset

In [20]:
# Naive approach for cross-validation (intentional for this demonstration):
# the whole dataset is scaled once, before any folds are created, so the
# rows held out in each fold have already influenced the scaling.
# A new scaler is created here so this section does not rely on the scaler
# object left over from the train-test sections.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## Define the evaluation procedure

In [21]:
# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

## Evaluate the model using cross-validation

In [23]:
# Define the estimator in this section instead of reusing the already-fitted
# `model` variable left over from the train-test section, so the cell works
# without that earlier state.
model = LogisticRegression()

# Accuracy of the model on each fold of the pre-scaled dataset.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
scores

array([0.86, 0.91, 0.88, 0.81, 0.83, 0.84, 0.81, 0.84, 0.88, 0.84, 0.84,
       0.86, 0.85, 0.83, 0.89, 0.87, 0.79, 0.97, 0.84, 0.84, 0.81, 0.88,
       0.8 , 0.85, 0.89, 0.88, 0.87, 0.83, 0.83, 0.87])

## Report performance

In [24]:
# Mean and standard deviation of the per-fold accuracies, as percentages.
mean_pct = mean(scores) * 100
std_pct = std(scores) * 100
print('Accuracy: %.3f (%.3f)' % (mean_pct, std_pct))

Accuracy: 85.300 (3.607)


# Cross-validation evaluation with correct data preparation

## Define dataset

In [26]:
# Regenerate the dataset with the same parameters and seed so the pipeline
# below receives unscaled data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

## Define the pipeline

In [28]:
# Chain the scaler and the classifier into a single estimator so that the
# scaling step is fitted together with the model on whatever data the
# pipeline is trained on.
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)
pipeline

## Define the evaluation procedure

In [30]:
# Same evaluation procedure as in the naive section: stratified 10-fold
# cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

## Evaluate the model using cross-validation

In [31]:
# Accuracy of the whole pipeline (scaler + model) on each fold of the
# unscaled dataset.
scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
scores

array([0.86, 0.91, 0.87, 0.81, 0.83, 0.84, 0.81, 0.84, 0.88, 0.84, 0.84,
       0.86, 0.85, 0.83, 0.89, 0.88, 0.8 , 0.97, 0.84, 0.84, 0.81, 0.88,
       0.81, 0.85, 0.89, 0.88, 0.87, 0.84, 0.84, 0.87])

## Report performance

In [32]:
# Mean and standard deviation of the per-fold accuracies, as percentages.
mean_pct = mean(scores) * 100
std_pct = std(scores) * 100
print('Accuracy: %.3f (%.3f)' % (mean_pct, std_pct))

Accuracy: 85.433 (3.471)
