# Data Leakage Test

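In this notebook, we test the effect of data leakage on model performance by comparing two setups: text transformers fitted outside the pipeline (before cross-validation) and the same transformers fitted inside the pipeline (within each cross-validation fold).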

In [1]:
import nltk
# Download the NLTK resources this notebook relies on: 'punkt' for
# word_tokenize and 'wordnet' for WordNetLemmatizer.
# NOTE(review): 'averaged_perceptron_tagger' is not referenced by any code
# visible in this notebook -- confirm whether it is still needed.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
# Pattern that tokenize() uses to find http/https URLs in a message.
# Written as a raw string so that `\(` and `\)` reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences, which newer
# Python versions warn about. The resulting pattern text is unchanged.
# NOTE(review): inside the character class, `$-_` is a range (from '$' to '_'),
# not three literal characters -- presumably intentional, confirm before editing.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def tokenize(text):
    """Convert a raw message into a list of normalised tokens.

    Every URL matched by the module-level ``url_regex`` is replaced with the
    literal token ``urlplaceholder``. The text is then split with
    ``word_tokenize`` and each token is lemmatized, lower-cased and stripped
    of surrounding whitespace.

    Parameters
    ----------
    text : str
        The message text to tokenize.

    Returns
    -------
    list of str
        The cleaned tokens, in the order they appear in ``text``.
    """
    # Substitute each match where it occurs. Replacing the matched strings
    # with str.replace would also rewrite the start of a longer URL that
    # merely begins with a shorter one (e.g. "http://a" inside "http://ab"),
    # leaving a stray suffix glued to the placeholder.
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

def load_data(path='corporate_messaging.csv', encoding='latin-1'):
    """Load the messaging dataset and return texts and category labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'corporate_messaging.csv'``,
        the file this notebook originally read.
    encoding : str, optional
        Text encoding passed to ``pandas.read_csv``. Defaults to ``'latin-1'``.

    Returns
    -------
    X : array
        Values of the ``text`` column for the retained rows.
    y : array
        Values of the ``category`` column for the same rows.
    """
    df = pd.read_csv(path, encoding=encoding)
    keep = (df["category:confidence"] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df.text.values
    y = df.category.values
    return X, y

def display_results(cv, y_test, y_pred):
    """Print label sets, the confusion matrix and accuracy for predictions.

    Parameters
    ----------
    cv : object
        Accepted for interface compatibility; not used by this function.
    y_test : array
        True labels.
    y_pred : array
        Predicted labels, aligned element-wise with ``y_test``.
    """
    predicted_labels = np.unique(y_pred)
    true_labels = np.unique(y_test)

    # The matrix rows/columns are restricted to the labels present in y_test.
    matrix = confusion_matrix(y_test, y_pred, labels=true_labels)
    # Fraction of positions where prediction and truth agree.
    hit_rate = (y_pred == y_test).mean()

    print("Predicted Labels:", predicted_labels)
    print("True Labels:", true_labels)
    print("Confusion Matrix:\n", matrix)
    print("Accuracy:", hit_rate)

### Defining two pipeline structures for testing


In [4]:
def pipeline_without_transformers():
    """Build a pipeline whose only step is the random-forest classifier.

    The pipeline contains no text-processing steps; in this notebook it is
    fed features that were vectorised and scaled beforehand.

    Returns
    -------
    Pipeline
        A pipeline with a single ``'clf'`` step.
    """
    steps = [('clf', RandomForestClassifier(random_state=42))]
    return Pipeline(steps)


def pipeline_with_transformers():
    """Build a pipeline that goes from raw text to a random-forest prediction.

    Steps, in order: token counts (using ``tokenize``), TF-IDF weighting,
    scaling without mean-centering, and the classifier.

    Returns
    -------
    Pipeline
        A pipeline with the steps ``'vect'``, ``'tfidf'``, ``'scaler'`` and
        ``'clf'``.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RandomForestClassifier(random_state=42)),
    ]
    return Pipeline(steps)

### Load the data

In [19]:
# X holds the message texts, y the matching category labels.
X, y = load_data()

In [23]:
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
# Number of training samples.
X_train.shape

(1610,)

In [25]:
# Number of test samples.
X_test.shape

(793,)

### Training the pipeline with transformers outside the pipeline

In [26]:
# Instantiate standalone transformer objects, configured the same way as the
# corresponding steps in pipeline_with_transformers().
countvectorizer = CountVectorizer(tokenizer=tokenize)
tfidftransfomer = TfidfTransformer()
stdscaler = StandardScaler(with_mean=False)

In [27]:
# Fit each transformer on the full training data and chain the outputs:
# raw text -> token counts -> TF-IDF weights -> scaled features.
X_train_countvectorizer = countvectorizer.fit_transform(X_train)
X_train_tfidftransfomer = tfidftransfomer.fit_transform(X_train_countvectorizer)
X_train_stdscaler = stdscaler.fit_transform(X_train_tfidftransfomer)

In [28]:
# Apply the already-fitted transformers to the test data. Only transform() is
# called here (no fitting), so nothing is learned from the test set.
X_test_countvectorizer = countvectorizer.transform(X_test)
X_test_tfidftransfomer = tfidftransfomer.transform(X_test_countvectorizer)
X_test_stdscaler = stdscaler.transform(X_test_tfidftransfomer)

In [29]:
# Instantiate the classifier-only pipeline and run a 10-fold cross-validated
# grid search on the pre-transformed training features.
# X_train_stdscaler was produced by transformers fitted on the whole of
# X_train, so every validation fold has already been seen by the vectorizer,
# TF-IDF and scaler: this is the leakage scenario under test.
model_without = pipeline_without_transformers()

# Single-value grid: the search only serves to run the cross-validation.
parameters = {'clf__n_estimators': [100]}

cv_without = GridSearchCV(model_without, parameters, cv=10, verbose=3)
cv_without.fit(X_train_stdscaler, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9259259259259259, total=   0.9s
[CV] clf__n_estimators=100 ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] .. clf__n_estimators=100, score=0.9320987654320988, total=   0.9s
[CV] clf__n_estimators=100 ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV] ... clf__n_estimators=100, score=0.937888198757764, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9192546583850931, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9130434782608695, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9254658385093167, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9130434782608695, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9254658385093167, total=   0.9s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9316770186335404, total=   1.0s
[CV] clf__n_estimators=100 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.9s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__n_estimators': [100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=3)

In [34]:
# Score of the fitted search on the data it was trained on.
cv_without.score(X_train_stdscaler, y_train)

1.0

### Testing the pipeline with transformers outside the pipeline 

In [30]:
# Score on the held-out test features, which were only transformed (never
# fitted on) by the external transformers.
cv_without.score(X_test_stdscaler, y_test)

0.94073139974779318

### Training the pipeline with transformers inside the pipeline 

In [31]:
# Instantiate the full pipeline (transformers + classifier) and run the same
# 10-fold cross-validated grid search, this time on the raw training text.
# Because the transformers are pipeline steps, they are re-fitted on the
# training portion of each fold and never see that fold's validation samples.
model_with = pipeline_with_transformers()

# Same single-value grid as for the classifier-only pipeline.
parameters = {'clf__n_estimators': [100]}

cv_with = GridSearchCV(model_with, parameters, cv=10, verbose=3)
cv_with.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9197530864197531, total=   1.3s
[CV] clf__n_estimators=100 ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV] .. clf__n_estimators=100, score=0.9567901234567902, total=   1.2s
[CV] clf__n_estimators=100 ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s


[CV] .. clf__n_estimators=100, score=0.9565217391304348, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] ... clf__n_estimators=100, score=0.937888198757764, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9192546583850931, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9316770186335404, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9316770186335404, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] .. clf__n_estimators=100, score=0.9130434782608695, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] ... clf__n_estimators=100, score=0.937888198757764, total=   1.2s
[CV] clf__n_estimators=100 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.2s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__n_estimators': [100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=3)

In [33]:
# Score of the fitted search on the raw training text it was trained on.
cv_with.score(X_train, y_train)

1.0

### Testing the pipeline with transformers inside the pipeline 

In [32]:
# Score on the held-out raw test text; the pipeline applies its fitted
# transformers before predicting.
cv_with.score(X_test, y_test)

0.94073139974779318

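We didn't find any change in the testing scores: they are exactly the same (0.9407) for both approaches. This is expected, and is not a consequence of the small dataset (only 1610 samples in the training set and 793 in the testing set). In both approaches the transformers behind the final model are fitted on `X_train` only, and the test set is only transformed. After `GridSearchCV` refits on the full training set, both runs therefore train the same random forest on the same features with the same `random_state`, and score identically on the test data. The leakage in the first approach happens inside cross-validation, where each validation fold had already been seen by the transformers fitted on the whole of `X_train`. Any effect of it shows up in the per-fold cross-validation scores printed above (which do differ between the two runs), not in the held-out test score.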