# TPS 02-2022: Linear Classifiers

We test out several linear classifiers (mostly from `sklearn.linear_model`) with various preprocessing steps to see which give the best results on a small subset of the training data.

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0
NUM_FOLDS = 3
NUM_SAMPLES = 50000

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import re
import math
import os
import pyarrow
import gc

# Model evaluation
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_selection import mutual_info_classif


# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import Image

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
def sample_data(samples):
    
    train = pd.read_feather('../data/train.feather')
    train, _ = train_test_split(train, train_size = samples, stratify = train['target'])
    return train

In [4]:
%%time
from sklearn.preprocessing import LabelEncoder

train = sample_data(NUM_SAMPLES)
features = [x for x in train.columns if x not in ['row_id', 'target']]

# Label encoding
encoder = LabelEncoder()
train['target'] = encoder.fit_transform(train['target'])

print(f'Training Samples: {len(train)}')

Training Samples: 50000
Wall time: 603 ms


# Scoring Function

In [5]:
# Scoring/Training Baseline Function
def score_model(sklearn_model):
    
    # Store the holdout predictions
    oof_preds = np.zeros((train.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')
    
    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):
        
        # Training and Validation Sets
        X_train, y_train = train[features].iloc[train_idx].to_numpy(), train['target'].iloc[train_idx].to_numpy()
        X_valid, y_valid = train[features].iloc[valid_idx].to_numpy(), train['target'].iloc[valid_idx].to_numpy()
        
        # Create model
        model = clone(sklearn_model)
            
        start = time.time()

        model.fit(X_train, y_train)
        
        end = time.time()
        
        # validation
        try:
            valid_preds = model.predict_proba(X_valid)
            scores[fold] = accuracy_score(y_valid, np.argmax(valid_preds, axis = 1))
        except:
            scores[fold] = accuracy_score(y_valid, model.predict(X_valid))
        end = time.time()
        print(f'Fold {fold}: {round(scores[fold], 5)} accuracy in {round(end-start,2)}s.')
        times[fold] = end-start
        
        
    print("\nAverage Accuracy:", round(scores.mean(), 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

# 1. Logistic Regression

We test out several logistic regression pipelines involving different preprocessing steps such as feature scaling and PCA-based dimension reduction as well as regularization.

In [6]:
# Model
from sklearn.linear_model import LogisticRegression

## 1.1 Solver

We test the `sag` and `saga` solvers to see which give better results in speed and accuracy. 

In [7]:
# SAG solver
score_model(
    LogisticRegression(
        penalty = 'none', 
        solver = 'sag', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
)


Fold 0: 0.80884 accuracy in 27.82s.
Fold 1: 0.80914 accuracy in 26.55s.
Fold 2: 0.80019 accuracy in 26.39s.

Average Accuracy: 0.80606
Training Time: 80.76s


In [8]:
# SAGA solver
score_model(
    LogisticRegression(
        penalty = 'none', 
        solver = 'saga', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
)


Fold 0: 0.78868 accuracy in 41.38s.
Fold 1: 0.78688 accuracy in 41.51s.
Fold 2: 0.77739 accuracy in 41.38s.

Average Accuracy: 0.78432
Training Time: 124.27s


## 1.2 Scaling

In [9]:
# Preprocessing 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [10]:
# Standard scaler
score_model(make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty = 'none', 
        solver = 'sag', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.83452 accuracy in 26.61s.
Fold 1: 0.83356 accuracy in 26.62s.
Fold 2: 0.83667 accuracy in 27.27s.

Average Accuracy: 0.83492
Training Time: 80.5s


In [11]:
# Robust scaler
score_model(make_pipeline(
    RobustScaler(),
    LogisticRegression(
        penalty = 'none', 
        solver = 'sag', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.74435 accuracy in 26.64s.
Fold 1: 0.75388 accuracy in 26.65s.
Fold 2: 0.75117 accuracy in 26.6s.

Average Accuracy: 0.7498
Training Time: 79.9s


In [12]:
# MinMax scaler
score_model(make_pipeline(
    MinMaxScaler(),
    LogisticRegression(
        penalty = 'none', 
        solver = 'sag', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.85672 accuracy in 34.74s.
Fold 1: 0.85666 accuracy in 35.05s.
Fold 2: 0.85251 accuracy in 35.24s.

Average Accuracy: 0.8553
Training Time: 105.03s


## 1.3 Regularization

In [13]:
# L1 Regularization
score_model(make_pipeline(
    MinMaxScaler(),
    LogisticRegression(
        penalty = 'l1', 
        solver = 'saga', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.83872 accuracy in 107.51s.
Fold 1: 0.83686 accuracy in 107.11s.
Fold 2: 0.83325 accuracy in 108.23s.

Average Accuracy: 0.83628
Training Time: 322.85s


In [14]:
# L2 Regularization
score_model(make_pipeline(
    MinMaxScaler(),
    LogisticRegression(
        penalty = 'l2', 
        solver = 'sag', 
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.8203 accuracy in 11.7s.
Fold 1: 0.82498 accuracy in 11.22s.
Fold 2: 0.81393 accuracy in 11.17s.

Average Accuracy: 0.81974
Training Time: 34.1s


In [15]:
# Elasticnet Regularization
score_model(make_pipeline(
    MinMaxScaler(),
    LogisticRegression(
        penalty = 'elasticnet', 
        solver = 'saga', 
        l1_ratio = 0.15,
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.8215 accuracy in 93.65s.
Fold 1: 0.82648 accuracy in 91.95s.
Fold 2: 0.81723 accuracy in 95.6s.

Average Accuracy: 0.82174
Training Time: 281.2s


## 1.2 Dimension Reduction

In [16]:
from sklearn.decomposition import PCA

In [17]:
# Dimension reduction
score_model(make_pipeline(
    MinMaxScaler(),
    PCA(0.99), 
    LogisticRegression(penalty = 'none', solver = 'sag', n_jobs = -1)
))


Fold 0: 0.8482 accuracy in 35.19s.
Fold 1: 0.8464 accuracy in 35.36s.
Fold 2: 0.84273 accuracy in 35.43s.

Average Accuracy: 0.84578
Training Time: 105.98s


In [18]:
# Dimension reduction
score_model(make_pipeline(
    MinMaxScaler(),
    PCA(0.95), 
    LogisticRegression(penalty = 'none', solver = 'sag', n_jobs = -1)
))


Fold 0: 0.82636 accuracy in 29.02s.
Fold 1: 0.82792 accuracy in 28.97s.
Fold 2: 0.82599 accuracy in 29.36s.

Average Accuracy: 0.82676
Training Time: 87.35s


In [19]:
# Dimension reduction
score_model(make_pipeline(
    MinMaxScaler(),
    PCA(0.9), 
    LogisticRegression(penalty = 'none', solver = 'sag', n_jobs = -1)
))


Fold 0: 0.82036 accuracy in 23.91s.
Fold 1: 0.81898 accuracy in 23.99s.
Fold 2: 0.81363 accuracy in 23.79s.

Average Accuracy: 0.81766
Training Time: 71.69s


In [20]:
# Dimension reduction
score_model(make_pipeline(
    MinMaxScaler(),
    PCA(0.85), 
    LogisticRegression(penalty = 'none', solver = 'sag', n_jobs = -1)
))


Fold 0: 0.80956 accuracy in 19.72s.
Fold 1: 0.81304 accuracy in 19.88s.
Fold 2: 0.79797 accuracy in 19.56s.

Average Accuracy: 0.80686
Training Time: 59.16s


# 2. SGDClassifier

In [21]:
# Model
from sklearn.linear_model import SGDClassifier

## 2.1 Scaling

In [22]:
# Standard
score_model(make_pipeline(
    StandardScaler(),
    SGDClassifier(
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.7978 accuracy in 11.69s.
Fold 1: 0.79648 accuracy in 12.05s.
Fold 2: 0.79605 accuracy in 11.75s.

Average Accuracy: 0.79678
Training Time: 35.49s


In [23]:
# Robust
score_model(make_pipeline(
    RobustScaler(),
    SGDClassifier(
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.69701 accuracy in 3.51s.
Fold 1: 0.72983 accuracy in 4.49s.
Fold 2: 0.72015 accuracy in 5.23s.

Average Accuracy: 0.71566
Training Time: 13.23s


In [24]:
# MinMax
score_model(make_pipeline(
    MinMaxScaler(),
    SGDClassifier(
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.73475 accuracy in 0.85s.
Fold 1: 0.74771 accuracy in 0.85s.
Fold 2: 0.77091 accuracy in 0.8s.

Average Accuracy: 0.75112
Training Time: 2.5s


## 2.2 Regularization

In [25]:
# None
score_model(make_pipeline(
    StandardScaler(),
    SGDClassifier(
        penalty = None,
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.7933 accuracy in 30.89s.
Fold 1: 0.79516 accuracy in 31.22s.
Fold 2: 0.79515 accuracy in 31.48s.

Average Accuracy: 0.79454
Training Time: 93.59s


In [26]:
# L1 Regularization
score_model(make_pipeline(
    StandardScaler(),
    SGDClassifier(
        penalty = 'l1',
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.79012 accuracy in 50.25s.
Fold 1: 0.78628 accuracy in 51.23s.
Fold 2: 0.77727 accuracy in 51.58s.

Average Accuracy: 0.78456
Training Time: 153.06s


In [27]:
# L2 Regularization
score_model(make_pipeline(
    StandardScaler(),
    SGDClassifier(
        penalty = 'l2',
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.7978 accuracy in 11.75s.
Fold 1: 0.79648 accuracy in 11.95s.
Fold 2: 0.79605 accuracy in 11.7s.

Average Accuracy: 0.79678
Training Time: 35.39s


In [28]:
# Elasticnet
score_model(make_pipeline(
    StandardScaler(),
    SGDClassifier(
        penalty = 'l2',
        n_jobs = -1,
        random_state = RANDOM_SEED,
    )
))


Fold 0: 0.7978 accuracy in 11.73s.
Fold 1: 0.79648 accuracy in 12.04s.
Fold 2: 0.79605 accuracy in 11.7s.

Average Accuracy: 0.79678
Training Time: 35.46s


# 3. Linear SVC

The default `SGDClassifier` is very similar to `LinearSVC`, but we check it separately since it's likely to outperform it.

In [29]:
from sklearn.svm import LinearSVC

## 3.1 Scaling

In [30]:
# Standard
score_model(make_pipeline(
    StandardScaler(),
    LinearSVC(
        random_state = RANDOM_SEED,
        dual = False
    )
))


Fold 0: 0.80386 accuracy in 195.61s.
Fold 1: 0.80098 accuracy in 168.43s.
Fold 2: 0.79041 accuracy in 289.99s.

Average Accuracy: 0.79842
Training Time: 654.03s


In [31]:
# Robust
score_model(make_pipeline(
    RobustScaler(),
    LinearSVC(
        random_state = RANDOM_SEED,
        dual = False
    )
))


Fold 0: 0.79936 accuracy in 71.6s.
Fold 1: 0.79732 accuracy in 56.58s.
Fold 2: 0.78765 accuracy in 52.05s.

Average Accuracy: 0.79478
Training Time: 180.22s


In [32]:
# MinMax
score_model(make_pipeline(
    MinMaxScaler(),
    LinearSVC(
        random_state = RANDOM_SEED,
        dual = False
    )
))


Fold 0: 0.8014 accuracy in 15.39s.
Fold 1: 0.80716 accuracy in 15.22s.
Fold 2: 0.79659 accuracy in 14.29s.

Average Accuracy: 0.80172
Training Time: 44.91s
