# TPS 11/21 - Handling Mislabeled Data

In this notebook, we will explore the [cleanlab](https://github.com/cleanlab/cleanlab) library, which provides functions for "finding, quantifying, and learning with label errors in datasets." In particular, we will use the `LearningWithNoisyLabels` wrapper with various scikit-learn compatible models to make predictions despite the mislabeled data.

In [1]:
# Global variables for testing changes to this notebook quickly
# RANDOM_SEED: passed as random_state to the fold splitter and to the models that accept one.
# NUM_FOLDS: number of stratified cross-validation folds used by the training functions.
RANDOM_SEED = 0
NUM_FOLDS = 6

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import os
import pyarrow
import gc

# cleanlab
import cleanlab
from cleanlab.classification import LearningWithNoisyLabels

# Model evaluation
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.feature_selection import mutual_info_classif

# Models
from sklearn.utils.extmath import softmax
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier


# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import Image

# Optuna 
import optuna

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Loading the Data

In [3]:
%%time

# Load data
train = pd.read_feather('../data/train.feather')
test = pd.read_feather('../data/test.feather')
submission = pd.read_csv('../data/sample_submission.csv')

# Every model cell below writes a submission file into ../output; create the
# directory up front so a long training run cannot fail at the final to_csv.
os.makedirs('../output', exist_ok=True)

# Get feature columns (everything except the row id and the label)
features = [x for x in train.columns if x not in ['id', 'target']]

Wall time: 477 ms


# Feature Engineering

We follow the feature engineering from this [kaggle notebook](https://www.kaggle.com/javiervallejos/simple-nn-with-good-results-tps-nov-21) by computing some row statistics on the skewed and bimodal variables.

In [4]:
# Row-statistic columns created by this cell, in creation order.
engineered_cols = ['median_h', 'var_h', 'mean_l', 'std_l', 'median_l', 'skew_l', 'max_l', 'var_l']

# Work from the raw features only. `features` is overwritten at the end of this
# cell, so without this filter a second run would feed the engineered columns
# back into the skewness split and change the result.
raw_features = [x for x in features if x not in engineered_cols]

# Column-wise skewness of the training data, computed once and reused for both groups.
feature_skew = train[raw_features].skew()
h_skew = feature_skew[feature_skew >= 2].index  # highly skewed features
l_skew = feature_skew[feature_skew < 2].index   # remaining features (treated as bimodal)

# (new column, source column group, row-wise DataFrame method)
row_stats = [
    # Skewed distributions
    ('median_h', h_skew, 'median'),
    ('var_h', h_skew, 'var'),
    # Bimodal distributions
    ('mean_l', l_skew, 'mean'),
    ('std_l', l_skew, 'std'),
    ('median_l', l_skew, 'median'),
    ('skew_l', l_skew, 'skew'),
    ('max_l', l_skew, 'max'),
    ('var_l', l_skew, 'var'),
]

# Apply every statistic to train and test alike; the column groups always come
# from the training data so both frames get the same definition.
for new_col, source_cols, stat in row_stats:
    for frame in (train, test):
        frame[new_col] = getattr(frame[source_cols], stat)(axis=1)

# Update feature columns
features = [x for x in train.columns if x not in ['id', 'target']]

# Scoring Function

The following functions accept a scikit-learn compatible model or pipeline with `fit`, `predict` and `predict_proba` methods and return, in this order, the per-fold AUC scores, the test set predictions (averaged over the folds) and the out-of-fold predictions, for the vanilla models and the wrapped models, respectively.

In [5]:
# Scoring/Training Baseline Function
def train_model(sklearn_model):
    """Cross-validate a scikit-learn compatible classifier on the global data.

    Reads the module-level ``train``, ``test``, ``features``, ``NUM_FOLDS`` and
    ``RANDOM_SEED``. A fresh clone of ``sklearn_model`` is fitted on every fold,
    so the object passed in is never fitted itself.

    Parameters
    ----------
    sklearn_model : estimator or pipeline
        Unfitted model exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    scores : np.ndarray, shape (NUM_FOLDS,)
        Validation AUC of each fold.
    test_preds : np.ndarray, shape (len(test),)
        Positive-class probabilities for the test set, averaged over the folds.
    oof_preds : np.ndarray, shape (len(train),)
        Out-of-fold positive-class probabilities for the training set.
    """
    # Materialise the arrays once: they do not change between folds. The test
    # set is converted as well so that fit and predict see the same input type.
    X_all = train[features].to_numpy()
    y_all = train['target'].to_numpy()
    X_test = test[features].to_numpy()

    # Store the holdout predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):

        # Training and Validation Sets
        X_train, y_train = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # Create model
        model = clone(sklearn_model)

        # Time the fit only, so the reported "Training Time" excludes
        # prediction and scoring.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        fit_seconds = time.perf_counter() - start

        # validation and test predictions (column 1 = positive class)
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_idx] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(fit_seconds,2)}s.')
        scores[fold] = fold_auc
        times[fold] = fit_seconds

        # NOTE(review): short pause kept from the original; presumably there to
        # let output flush between folds - confirm whether it is still needed.
        time.sleep(0.5)

    print("\nAverage AUC:", round(scores.mean(), 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores, test_preds, oof_preds

In [6]:
# Scoring/Training function for LearningWithNoisyLabels
def train_noisy_model(sklearn_model):
    """Cross-validate a classifier wrapped in cleanlab's LearningWithNoisyLabels.

    Reads the module-level ``train``, ``test``, ``features``, ``NUM_FOLDS`` and
    ``RANDOM_SEED``. On every fold a fresh clone of ``sklearn_model`` is wrapped
    in ``LearningWithNoisyLabels`` and fitted, so the object passed in is never
    fitted itself.

    Parameters
    ----------
    sklearn_model : estimator or pipeline
        Unfitted model exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    scores : np.ndarray, shape (NUM_FOLDS,)
        Validation AUC of each fold.
    test_preds : np.ndarray, shape (len(test),)
        Positive-class probabilities for the test set, averaged over the folds.
    oof_preds : np.ndarray, shape (len(train),)
        Out-of-fold positive-class probabilities for the training set.
    """
    # Materialise the arrays once: they do not change between folds. The test
    # set is converted as well so that fit and predict see the same input type.
    X_all = train[features].to_numpy()
    y_all = train['target'].to_numpy()
    X_test = test[features].to_numpy()

    # Store the holdout predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):

        # Training and Validation Sets
        X_train, y_train = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # Create model. The wrapper runs its own internal cross-validation;
        # passing the seed makes those splits repeatable between runs
        # (see the cleanlab documentation for the `seed` argument).
        model = LearningWithNoisyLabels(
            clf = clone(sklearn_model),
            seed = RANDOM_SEED,
        )

        # Time the fit only, so the reported "Training Time" excludes
        # prediction and scoring.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        fit_seconds = time.perf_counter() - start

        # validation and test predictions (column 1 = positive class)
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_idx] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(fit_seconds,2)}s.')
        scores[fold] = fold_auc
        times[fold] = fit_seconds

        # NOTE(review): short pause kept from the original; presumably there to
        # let output flush between folds - confirm whether it is still needed.
        time.sleep(0.5)

    print("\nAverage AUC:", round(scores.mean(), 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores, test_preds, oof_preds

# Logistic Regression

In [7]:
# Logistic regression on robustly scaled features.
# The small C means strong regularisation; NOTE(review): the exact value is
# presumably the result of an earlier hyper-parameter search - confirm.
logit_pipeline = make_pipeline(
    RobustScaler(),
    LogisticRegression(
        C=0.00093730740668689,
        solver='saga',
        max_iter=200,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    ),
)

In [8]:
# Logistic Regression Baseline
# Cross-validate the plain pipeline; train_model returns the fold AUCs, the
# fold-averaged test predictions and the out-of-fold predictions.
logit_scores, logit_preds, logit_oof = train_model(logit_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = logit_preds
submission.to_csv('../output/logit_submission.csv', index=False)


Fold 0 (AUC): 0.74915 in 119.43s.
Fold 1 (AUC): 0.74864 in 120.18s.
Fold 2 (AUC): 0.74984 in 119.35s.
Fold 3 (AUC): 0.74989 in 128.84s.
Fold 4 (AUC): 0.74917 in 117.17s.
Fold 5 (AUC): 0.74772 in 116.77s.

Average AUC: 0.74907
Training Time: 721.75s


In [9]:
# Logistic Regression w/ Wrapper
# Same pipeline, but each fold's clone is wrapped in LearningWithNoisyLabels
# by train_noisy_model before fitting.
noisy_logit_scores, noisy_logit_preds, noisy_logit_oof = train_noisy_model(logit_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_logit_preds
submission.to_csv('../output/noisy_logit_submission.csv', index=False)


Fold 0 (AUC): 0.74924 in 552.16s.
Fold 1 (AUC): 0.74877 in 551.88s.
Fold 2 (AUC): 0.74985 in 545.86s.
Fold 3 (AUC): 0.7498 in 549.98s.
Fold 4 (AUC): 0.7491 in 544.48s.
Fold 5 (AUC): 0.74786 in 544.65s.

Average AUC: 0.7491
Training Time: 3289.0s


# Ridge Regression

The wrapper function expects an estimator with a `predict_proba` method, so we hack together an equivalent using the softmax function:

In [10]:
# Class extending Ridge Regression
class ExtendedRidgeClassifier(RidgeClassifier):
    """RidgeClassifier with a softmax-based ``predict_proba``.

    The values are a monotone transform of the decision scores rather than
    calibrated probabilities, so ranking metrics such as AUC are unaffected.
    """

    def predict_proba(self, X):
        """Return softmax-normalised decision scores, one column per class.

        Parameters
        ----------
        X : array-like
            Samples to score; forwarded to ``decision_function``.

        Returns
        -------
        np.ndarray, shape (n_samples, n_classes)
            Rows sum to 1. For a binary problem column 1 is the positive class.
        """
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # Binary case: decision_function yields a single score per sample,
            # so build the two-column [negative, positive] layout explicitly.
            scores = np.c_[-scores, scores]
        # Multiclass case already has one column per class.
        return softmax(scores)
    
# Ridge classifier (with the softmax predict_proba shim) on robustly scaled features.
# NOTE(review): alpha is presumably the result of an earlier hyper-parameter search - confirm.
ridge_pipeline = make_pipeline(
    RobustScaler(),
    ExtendedRidgeClassifier(
        solver='saga',
        alpha=2.5553397058054763,
        random_state=RANDOM_SEED,
    ),
)

In [11]:
# Ridge Regression Baseline
# Cross-validate the plain pipeline; train_model returns the fold AUCs, the
# fold-averaged test predictions and the out-of-fold predictions.
ridge_scores, ridge_preds, ridge_oof = train_model(ridge_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = ridge_preds
submission.to_csv('../output/ridge_submission.csv', index=False)


Fold 0 (AUC): 0.74906 in 8.92s.
Fold 1 (AUC): 0.74859 in 8.25s.
Fold 2 (AUC): 0.74975 in 9.75s.
Fold 3 (AUC): 0.74979 in 9.84s.
Fold 4 (AUC): 0.74914 in 8.71s.
Fold 5 (AUC): 0.74758 in 9.27s.

Average AUC: 0.74898
Training Time: 54.74s


In [12]:
# Ridge Regression w/ Wrapper
# Same pipeline, but each fold's clone is wrapped in LearningWithNoisyLabels
# by train_noisy_model before fitting.
noisy_ridge_scores, noisy_ridge_preds, noisy_ridge_oof = train_noisy_model(ridge_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_ridge_preds
submission.to_csv('../output/noisy_ridge_submission.csv', index=False)


Fold 0 (AUC): 0.74891 in 59.58s.
Fold 1 (AUC): 0.74857 in 54.6s.
Fold 2 (AUC): 0.74968 in 61.91s.
Fold 3 (AUC): 0.74957 in 58.01s.
Fold 4 (AUC): 0.7491 in 68.36s.
Fold 5 (AUC): 0.74747 in 60.98s.

Average AUC: 0.74888
Training Time: 363.44s


# Linear Discriminant Analysis

In [13]:
# LDA on standardised features, using the eigen solver with a fixed shrinkage.
# NOTE(review): the shrinkage value is presumably the result of an earlier search - confirm.
lda_pipeline = make_pipeline(
    StandardScaler(),
    LinearDiscriminantAnalysis(
        shrinkage=0.17788226997464066,
        solver='eigen',
    ),
)

In [14]:
# LDA baseline: cross-validate the plain pipeline; train_model returns the fold
# AUCs, the fold-averaged test predictions and the out-of-fold predictions.
lda_scores, lda_preds, lda_oof = train_model(lda_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = lda_preds
submission.to_csv('../output/lda_submission.csv', index=False)


Fold 0 (AUC): 0.74906 in 4.87s.
Fold 1 (AUC): 0.74863 in 4.89s.
Fold 2 (AUC): 0.74971 in 4.9s.
Fold 3 (AUC): 0.74971 in 4.92s.
Fold 4 (AUC): 0.74914 in 4.9s.
Fold 5 (AUC): 0.74763 in 4.89s.

Average AUC: 0.74898
Training Time: 29.36s


In [15]:
# LDA w/ wrapper: each fold's clone is wrapped in LearningWithNoisyLabels by
# train_noisy_model before fitting.
noisy_lda_scores, noisy_lda_preds, noisy_lda_oof = train_noisy_model(lda_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_lda_preds
submission.to_csv('../output/noisy_lda_submission.csv', index=False)


Fold 0 (AUC): 0.74887 in 23.79s.
Fold 1 (AUC): 0.74855 in 23.9s.
Fold 2 (AUC): 0.74957 in 23.97s.
Fold 3 (AUC): 0.74941 in 23.87s.
Fold 4 (AUC): 0.74904 in 23.73s.
Fold 5 (AUC): 0.74747 in 23.9s.

Average AUC: 0.74882
Training Time: 143.16s


# SGDClassifier

Again, since the wrapper function expects an estimator with a `predict_proba` method, we create an equivalent using softmax:

In [16]:
# Extended SGDClassifier
class ExtendedSGDClassifier(SGDClassifier):
    """SGDClassifier with a softmax-based ``predict_proba``.

    The values are a monotone transform of the decision scores rather than
    calibrated probabilities, so ranking metrics such as AUC are unaffected.
    """

    def predict_proba(self, X):
        """Return softmax-normalised decision scores, one column per class.

        Parameters
        ----------
        X : array-like
            Samples to score; forwarded to ``decision_function``.

        Returns
        -------
        np.ndarray, shape (n_samples, n_classes)
            Rows sum to 1. For a binary problem column 1 is the positive class.
        """
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # Binary case: decision_function yields a single score per sample,
            # so build the two-column [negative, positive] layout explicitly.
            scores = np.c_[-scores, scores]
        # Multiclass case already has one column per class.
        return softmax(scores)

# SGD-trained linear classifier (with the softmax predict_proba shim) on
# robustly scaled features.
# NOTE(review): alpha and eta0 are presumably the result of an earlier search - confirm.
sgd_pipeline = make_pipeline(
    RobustScaler(),
    ExtendedSGDClassifier(
        penalty='l2',
        alpha=0.0064925580312465685,
        learning_rate='adaptive',
        eta0=0.00018074654973375143,
        random_state=RANDOM_SEED,
    ),
)

In [17]:
# SGD baseline: cross-validate the plain pipeline; train_model returns the fold
# AUCs, the fold-averaged test predictions and the out-of-fold predictions.
sgd_scores, sgd_preds, sgd_oof = train_model(sgd_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = sgd_preds
submission.to_csv('../output/sgd_submission.csv', index=False)


Fold 0 (AUC): 0.74921 in 6.74s.
Fold 1 (AUC): 0.74877 in 6.63s.
Fold 2 (AUC): 0.74994 in 6.8s.
Fold 3 (AUC): 0.74994 in 6.67s.
Fold 4 (AUC): 0.74922 in 6.66s.
Fold 5 (AUC): 0.74779 in 6.64s.

Average AUC: 0.74914
Training Time: 40.14s


In [18]:
# SGD w/ wrapper: each fold's clone is wrapped in LearningWithNoisyLabels by
# train_noisy_model before fitting.
noisy_sgd_scores, noisy_sgd_preds, noisy_sgd_oof = train_noisy_model(sgd_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_sgd_preds
submission.to_csv('../output/noisy_sgd_submission.csv', index=False)


Fold 0 (AUC): 0.74919 in 40.55s.
Fold 1 (AUC): 0.74875 in 40.58s.
Fold 2 (AUC): 0.74992 in 40.49s.
Fold 3 (AUC): 0.74982 in 40.53s.
Fold 4 (AUC): 0.74913 in 41.1s.
Fold 5 (AUC): 0.74785 in 40.56s.

Average AUC: 0.74911
Training Time: 243.8s


# Naive Bayes Classifier

In [19]:
# Multinomial naive Bayes on features standardised and then rescaled with MinMaxScaler.
# NOTE(review): both scalers are per-feature affine maps, so the StandardScaler
# step looks redundant ahead of MinMaxScaler - confirm before removing it.
nb_steps = (StandardScaler(), MinMaxScaler(), MultinomialNB())
nb_pipeline = make_pipeline(*nb_steps)

In [20]:
# Naive Bayes baseline: cross-validate the plain pipeline; train_model returns
# the fold AUCs, the fold-averaged test predictions and the out-of-fold predictions.
nb_scores, nb_preds, nb_oof = train_model(nb_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = nb_preds
submission.to_csv('../output/nb_submission.csv', index=False)


Fold 0 (AUC): 0.72464 in 2.03s.
Fold 1 (AUC): 0.72559 in 2.01s.
Fold 2 (AUC): 0.72667 in 1.98s.
Fold 3 (AUC): 0.72574 in 2.04s.
Fold 4 (AUC): 0.72654 in 2.0s.
Fold 5 (AUC): 0.72384 in 2.04s.

Average AUC: 0.7255
Training Time: 12.1s


In [21]:
# Naive Bayes w/ wrapper: each fold's clone is wrapped in LearningWithNoisyLabels
# by train_noisy_model before fitting.
noisy_nb_scores, noisy_nb_preds, noisy_nb_oof = train_noisy_model(nb_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_nb_preds
submission.to_csv('../output/noisy_nb_submission.csv', index=False)


Fold 0 (AUC): 0.72113 in 9.81s.
Fold 1 (AUC): 0.72219 in 9.8s.
Fold 2 (AUC): 0.72345 in 9.82s.
Fold 3 (AUC): 0.72247 in 9.87s.
Fold 4 (AUC): 0.72345 in 9.9s.
Fold 5 (AUC): 0.72098 in 9.89s.

Average AUC: 0.72228
Training Time: 59.1s


# Multi-Layer Perceptron Classifier

In [22]:
# Two-hidden-layer MLP (128 -> 64 units) on standardised features.
# Early stopping holds out 20% of each training set and stops after
# 5 iterations without improvement.
mlp_pipeline = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(128, 64),
        batch_size=256,
        random_state=RANDOM_SEED,
        early_stopping=True,
        n_iter_no_change=5,
        validation_fraction=0.2,
    ),
)

In [23]:
# MLP baseline: cross-validate the plain pipeline; train_model returns the fold
# AUCs, the fold-averaged test predictions and the out-of-fold predictions.
mlp_scores, mlp_preds, mlp_oof = train_model(mlp_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = mlp_preds
submission.to_csv('../output/mlp_submission.csv', index=False)


Fold 0 (AUC): 0.7526 in 28.65s.
Fold 1 (AUC): 0.75186 in 22.55s.
Fold 2 (AUC): 0.75366 in 30.8s.
Fold 3 (AUC): 0.75401 in 28.9s.
Fold 4 (AUC): 0.75352 in 26.8s.
Fold 5 (AUC): 0.75279 in 26.77s.

Average AUC: 0.75307
Training Time: 164.48s


In [24]:
# MLP w/ wrapper: each fold's clone is wrapped in LearningWithNoisyLabels by
# train_noisy_model before fitting.
noisy_mlp_scores, noisy_mlp_preds, noisy_mlp_oof = train_noisy_model(mlp_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_mlp_preds
submission.to_csv('../output/noisy_mlp_submission.csv', index=False)


Fold 0 (AUC): 0.74878 in 124.57s.
Fold 1 (AUC): 0.74928 in 139.05s.
Fold 2 (AUC): 0.7496 in 125.93s.
Fold 3 (AUC): 0.74939 in 124.82s.
Fold 4 (AUC): 0.74871 in 121.33s.
Fold 5 (AUC): 0.74677 in 129.83s.

Average AUC: 0.74876
Training Time: 765.52s


# XGBoost with Linear Models

In [25]:
# XGBoost with the linear booster on standardised features.
# 'lambda' is a Python keyword, so that setting has to be passed through a
# dict unpack instead of a normal keyword argument.
# NOTE(review): alpha / lambda are presumably the result of an earlier search - confirm.
xgb_regularisation = {'lambda': 0.008014767952226397}
xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        booster='gblinear',
        eval_metric='auc',
        random_state=RANDOM_SEED,
        alpha=1.6282976774133507e-08,
        **xgb_regularisation
    ),
)

In [26]:
# XGBoost baseline: cross-validate the plain pipeline; train_model returns the
# fold AUCs, the fold-averaged test predictions and the out-of-fold predictions.
xgb_scores, xgb_preds, xgb_oof = train_model(xgb_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = xgb_preds
submission.to_csv('../output/xgb_submission.csv', index=False)


Fold 0 (AUC): 0.74914 in 9.28s.
Fold 1 (AUC): 0.74864 in 9.13s.
Fold 2 (AUC): 0.74983 in 9.11s.
Fold 3 (AUC): 0.74987 in 9.17s.
Fold 4 (AUC): 0.74918 in 9.13s.
Fold 5 (AUC): 0.74769 in 9.19s.

Average AUC: 0.74906
Training Time: 55.02s


In [27]:
# XGBoost w/ wrapper: each fold's clone is wrapped in LearningWithNoisyLabels by
# train_noisy_model before fitting.
noisy_xgb_scores, noisy_xgb_preds, noisy_xgb_oof = train_noisy_model(xgb_pipeline)

# Overwrite the shared submission frame's target column and save it.
submission['target'] = noisy_xgb_preds
submission.to_csv('../output/noisy_xgb_submission.csv', index=False)


Fold 0 (AUC): 0.74922 in 43.26s.
Fold 1 (AUC): 0.74881 in 44.37s.
Fold 2 (AUC): 0.74998 in 43.83s.
Fold 3 (AUC): 0.74988 in 43.29s.
Fold 4 (AUC): 0.74933 in 43.6s.
Fold 5 (AUC): 0.74784 in 43.42s.

Average AUC: 0.74918
Training Time: 261.77s
