# Imbalanced Data Workflow
Imbalanced data is one of the most common (and most frustrating) issues encountered in machine learning, and I have developed a number of data augmentation techniques to help address it.

Always the first step is to understand the data, but once exploration is complete we have to roll up our sleeves and get down to business.

In [90]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append('./*'); sys.path.append('..')
import torch
import torch.nn as nn
from PIL import Image
from torchvision.transforms import v2
import matplotlib.pyplot as plt
from os import path
from torch.utils.data import DataLoader, Dataset, random_split, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from trainer import Trainer
import pandas as pd
import numpy as np
import sklearn

device = 'cuda' if torch.cuda.is_available() else 'cpu'

d = '../../../../ml_data/metal_sheet_data'
dataset_d = path.join(d, 'preprocessed_training_set')

from datasets.ClassificationSet import ClassificationSet
from models.SheetClassifier import SheetClassifier


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Create Artificially Imbalanced Dataset

In [63]:
df = pd.read_csv(f'{path.join(d, 'preprocessed_training_set')}/metadata.csv')
bubbles = df[df.target==2]; print('Pre-Cut Bubbles Len:', len(bubbles))
bubbles = bubbles.iloc[:int(len(bubbles)*.2)]

print('Post-Cut Bubbles Len:',len(bubbles))
print('Number of bubbles compared to line:', round(len(bubbles)/len(df[df.target==1]),2))
print('Number of bubbles compared to no defect:', round(len(bubbles)/len(df[df.target==0]),2))
df = pd.concat([df[df.target != 2], bubbles])

Pre-Cut Bubbles Len: 1693
Post-Cut Bubbles Len: 338
Number of bubbles compared to line: 0.2
Number of bubbles compared to no defect: 0.21


Alright, so now our dataset is highly skewed against bubbles. Given that let's see how training goes without stratifying.

In [76]:
trainer = Trainer()
data = ClassificationSet(dataset_d, df)
model = SheetClassifier()

trainer.run_experiment(model, data, epochs=20, criterion=nn.CrossEntropyLoss, train_shuffle=False, show=True)
trainer.evaluate_model(model, show=True)

Epoch [1/10], Train Loss: 7.36, Accuracy: 0.46
Epoch [6/10], Train Loss: 0.27, Accuracy: 0.92
Experiment Complete
[Evaluation over 8 Batches], Test Loss: 3.48, Accuracy: 0.50


SheetClassifier(
  (model): Sequential(
    (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (lin1): Linear(in_features=16384, out_features=3, bias=True)
)

In [68]:
def stratified_ds_split(df:pd.DataFrame, root_dir:str, dataset_class:type, target_column_names:list, reporting:bool = False) -> tuple:
  ''' Stratifies the dataset by the given list of target columns, attempting to ensure consistent distribution of classes therein.
      Allows for multiple columns to form the strata key (so if you want to group by age, sex, and whatever else, you can).
  '''

  # Ensures consistent distribution of the given classes
  if len(target_column_names) == 1: stratification_key = target_column_names[0]
  else:
    stratification_key = 'strat_key'
    df[stratification_key] = df[target_column_names].astype(str).sum(axis=1)

  # Split the complete df into training and test
  training_features, testing_features, training_targets, testing_targets = train_test_split(df, df[stratification_key], test_size=0.2, stratify=df[stratification_key])

  # Generate weights for the classes based on their representation in the target series
  target = np.array(training_features[stratification_key])
  target_classes = np.unique(target)
  weights = {t: len(np.where(target == t)[0])/len(training_features) for t in target_classes}
  if reporting:
    whole_df_target = np.array(df[stratification_key])
    whole_df_weights = {t: len(np.where(whole_df_target == t)[0])/len(df) for t in target_classes}
    print('Our target classes are as follows:',target_classes,'\n','-'*50,
                      '\nTheir representation in the dataset as a whole are:\n',{str(item[0]):round(float(item[1]),5) for item in whole_df_weights.items()},
                      '\nTheir representation in our training dataloader are:\n',{str(item[0]):round(float(item[1]),5) for item in weights.items()})

  weights_to_samples = np.array([weights[target[row_i]] for row_i in range(len(target))])
  weights_to_samples = torch.from_numpy(weights_to_samples)
  training_sampler = WeightedRandomSampler(weights_to_samples, len(weights_to_samples))

  training_features['target'], testing_features['target'] = training_targets, testing_targets


  return dataset_class(root_dir, training_features), dataset_class(root_dir, testing_features), training_sampler

In [74]:
# training_set, testing_set, train_y, test_y = train_test_split(df, df.target, test_size=0.2, stratify=df.target)
# training_set.target = train_y; testing_set.target = test_y

In [79]:
training_set, testing_set, sampler = stratified_ds_split(df, dataset_d, ClassificationSet, ['target'], True)

Our target classes are as follows: [0 1 2] 
 -------------------------------------------------- 
Their representation in the dataset as a whole are:
 {'0': 0.43978, '1': 0.46749, '2': 0.09273} 
Their representation in our training dataloader are:
 {'0': 0.43964, '1': 0.46742, '2': 0.09294}


In [80]:
trainer = Trainer()
model = SheetClassifier()

trainer.run_experiment(model, training_dataset=training_set, testing_dataset=testing_set, epochs=20, criterion=nn.CrossEntropyLoss,sampler=sampler, train_shuffle=False, show=True)
trainer.evaluate_model(model, show=True)

Epoch [1/20], Train Loss: 4.47, Accuracy: 0.59
Epoch [6/20], Train Loss: 0.27, Accuracy: 0.93
Epoch [11/20], Train Loss: 0.09, Accuracy: 0.98
Epoch [16/20], Train Loss: 0.10, Accuracy: 0.98
Experiment Complete
[Evaluation over 8 Batches], Test Loss: 1.93, Accuracy: 0.59


SheetClassifier(
  (model): Sequential(
    (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (lin1): Linear(in_features=16384, out_features=3, bias=True)
)

### Results
As can be seen, the stratified set performed about 9% better than the unstratified. These results are fairly underwhelming, so let's combine them with some data augmentation and then evaluate the resultant model on our original data.