# Training with Data Imbalance using Cost-Sensitive Learning

This notebook trains a multi-label image classifier with cost-sensitive learning (per-class positive weights in the loss function) to address the class-imbalance issue.

## Import Libraries

In [1]:
# Append the parent directory to Python's module search path.
# Presumably this is where the project-local packages imported in the next
# cell (models, data, tools) live -- verify against the repository layout.
import sys
sys.path.append("../")

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

from models.popular_models import PopularModels
from torch.utils.data import DataLoader
from torchvision import transforms
from data.dataset import MultiLabelDataset
from tqdm import tqdm
from tools.tools import get_data, load_data, remove_class
from tools.metrics import get_f1_score
from typing import List
# from sklearn.model_selection import train_test_split

## Import Data

In [3]:
# read each CSV and pass it straight through load_data, which (per the
# original note) performs text cleaning and yields a pandas DataFrame
train_data = load_data(get_data("../dataset/train.csv"))

# the test split is loaded with has_label=False
test_data = load_data(get_data("../dataset/test.csv"), has_label=False)

# report the size of both splits
n_train_rows = train_data.shape[0]
n_test_rows = test_data.shape[0]
print(f"Number of training instances: {n_train_rows}")
print(f"Number of testing instances:  {n_test_rows}")

Number of training instances: 30000
Number of testing instances:  10000


## Remove the Imbalanced Class (Class 1)

In [4]:
# filter the training frame with remove_class for the imbalanced class (class 1),
# then report how many rows are left
train_data = remove_class(train_data, class_no=1)
n_remaining_rows = train_data.shape[0]
print(f"Number of training instances: {n_remaining_rows}")

Number of training instances: 15925


## Preprocess Images (Transformation)

In [5]:
# image preprocessing pipeline (originally chosen to follow resnet18):
# resize -> center crop to 224 -> tensor -> per-channel normalisation
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize((232, 232)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),  # converts images to [0, 1]
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
transform = transforms.Compose(preprocessing_steps)

## Define Model, Optimizer & Loss Function

### Create a function to count each class

In [6]:
def count_class(train_data: pd.DataFrame, n_classes: int = 19) -> dict[str, int]:
	"""
	Count the frequency of each class.

	A row counts towards a class when its 'class <k>' column equals 1.

	Arguments:
		train_data (pd.DataFrame): the csv data; must contain the columns
			'class 1' ... 'class <n_classes>'.
		n_classes (int): number of class columns to count. Defaults to 19,
			the value that used to be hard-coded.

	Returns:
		dict[str, int]: maps each column name ('class 1', 'class 2', ...)
			to the number of rows in which that column equals 1.

	Raises:
		KeyError: if one of the expected 'class <k>' columns is missing.
	"""
	counter = {}
	for class_no in range(1, n_classes + 1):
		column = f"class {class_no}"
		# summing the boolean mask counts the rows equal to 1; int() turns the
		# numpy integer into a plain int, matching the annotated return type
		counter[column] = int((train_data[column] == 1.).sum())
	return counter

In [7]:
def calculate_pos_weights(class_counts: List[int], data) -> torch.Tensor:
	"""
	Compute one positive weight per class as (negatives / positives).
	Reference: https://stackoverflow.com/questions/57021620/how-to-calculate-unbalanced-weights-for-bcewithlogitsloss-in-pytorch

	Arguments:
		class_counts (List[int]): number of positive samples of each class.
		data: the dataset the counts were taken from; only len(data) is used.

	Returns:
		torch.Tensor: float32 tensor with one weight per entry of class_counts.
	"""
	total = len(data)
	weights = []
	for positives in class_counts:
		if positives == 0:
			# a class without any positive sample gets weight 0
			weights.append(0)
		else:
			# the small epsilon is kept in the denominator as in the reference
			weights.append((total - positives) / (positives + 1e-5))
	as_array = np.array(weights, dtype=np.float64)
	return torch.from_numpy(as_array).float()

# per-class positive counts -> positive weights for the loss function
class_counts = [*count_class(train_data).values()]
pos_weights = calculate_pos_weights(class_counts, train_data)

# keep the weights on the GPU when one is available
pos_weights = pos_weights.to('cuda') if torch.cuda.is_available() else pos_weights

In [8]:
# define output dimension
n_out = 19

# build the model wrapper, then take the underlying network from it
model_options = dict(
    choice='regnet_x_1_6gf',
    pretrained=True,
    freeze=True,
    n_out=n_out,
)
c_pm = PopularModels(**model_options)
model = c_pm.get_model()

# cost-sensitive loss: pos_weight rescales the positive term of every class
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# utilise GPU
if torch.cuda.is_available():
    print('using GPU')
    model = model.to('cuda')

using GPU


## Create Datasets and DataLoaders

In [10]:
BATCH_SIZE=16

# options shared by the training and the test dataset
dataset_options = dict(
    root_dir='../dataset/data/',
    vectorizer=None,
    transform=transform,
)

# initialize the datasets
train_dataset = MultiLabelDataset(csv_file=train_data, is_test=False, **dataset_options)
test_dataset = MultiLabelDataset(csv_file=test_data, is_test=True, **dataset_options)

# load the datasets into batches: only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Perform Training

In [11]:
# define hyperparameters
EPOCHS = 20
THRESHOLD = 0.5
N_CLASSES = 19

# the availability of a GPU does not change during training, so check it once
use_gpu = torch.cuda.is_available()
n_rows = train_data.shape[0]

# train the model, recording the mean batch loss of every epoch
train_losses = []
for epoch in range(EPOCHS):

	train_loss = 0.
	cursor = 0 # first free row of y_true / y_pred
	y_true = np.zeros((n_rows, N_CLASSES))
	y_pred = np.zeros((n_rows, N_CLASSES))
	model.train()
	for _, images, _, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training: "):

		if use_gpu:
			images = images.to('cuda')
			labels = labels.to('cuda')

		# forward
		optimizer.zero_grad()
		outputs = model(images)

		# backward, then update the weights
		loss = loss_fn(outputs, labels)
		loss.backward()
		optimizer.step()
		train_loss += loss.item()

		# multi-hot encoded predictions
		predicted = (F.sigmoid(outputs) > THRESHOLD).int()

		# store targets and predictions of this batch for the epoch metrics
		batch_len = images.shape[0]
		stop = cursor + batch_len
		y_true[cursor:stop] = labels.cpu().numpy()
		y_pred[cursor:stop] = predicted.cpu().numpy()
		cursor = stop

	epoch_loss = train_loss / len(train_dataloader)
	train_losses.append(epoch_loss)
	print("Epoch {:d}, Train Loss: {:.9f}".format(epoch+1, epoch_loss))

	# show the metrics
	get_f1_score(y_true, y_pred, display=True)

  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 1 Training: 100%|██████████| 996/996 [01:04<00:00, 15.48it/s]


Epoch 1, Train Loss: 0.901551606
F1 Score (Micro):    45.0%
F1 Score (Macro):    38.6%
F1 Score (Weighted): 52.7%
F1 Score (Sample):   47.3%


  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 2 Training: 100%|██████████| 996/996 [01:02<00:00, 15.95it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Train Loss: 0.815753995
F1 Score (Micro):    49.5%
F1 Score (Macro):    42.5%
F1 Score (Weighted): 56.0%
F1 Score (Sample):   52.3%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 3 Training: 100%|██████████| 996/996 [01:04<00:00, 15.50it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3, Train Loss: 0.817497700
F1 Score (Micro):    50.1%
F1 Score (Macro):    43.3%
F1 Score (Weighted): 56.5%
F1 Score (Sample):   52.8%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 4 Training: 100%|██████████| 996/996 [01:03<00:00, 15.77it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4, Train Loss: 0.816952223
F1 Score (Micro):    50.5%
F1 Score (Macro):    43.6%
F1 Score (Weighted): 56.7%
F1 Score (Sample):   53.2%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 5 Training: 100%|██████████| 996/996 [01:03<00:00, 15.74it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5, Train Loss: 0.795229766
F1 Score (Micro):    51.4%
F1 Score (Macro):    44.4%
F1 Score (Weighted): 57.3%
F1 Score (Sample):   54.1%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 6 Training: 100%|██████████| 996/996 [01:03<00:00, 15.61it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6, Train Loss: 0.810457282
F1 Score (Micro):    51.5%
F1 Score (Macro):    44.4%
F1 Score (Weighted): 57.4%
F1 Score (Sample):   54.2%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 7 Training: 100%|██████████| 996/996 [01:03<00:00, 15.65it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7, Train Loss: 0.802493653
F1 Score (Micro):    51.6%
F1 Score (Macro):    44.8%
F1 Score (Weighted): 57.4%
F1 Score (Sample):   54.2%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 8 Training: 100%|██████████| 996/996 [01:03<00:00, 15.71it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 8, Train Loss: 0.801504653
F1 Score (Micro):    51.7%
F1 Score (Macro):    44.8%
F1 Score (Weighted): 57.4%
F1 Score (Sample):   54.5%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 9 Training: 100%|██████████| 996/996 [01:04<00:00, 15.51it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 9, Train Loss: 0.807439914
F1 Score (Micro):    52.0%
F1 Score (Macro):    45.0%
F1 Score (Weighted): 57.6%
F1 Score (Sample):   54.4%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 10 Training: 100%|██████████| 996/996 [01:04<00:00, 15.56it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 10, Train Loss: 0.801733895
F1 Score (Micro):    52.2%
F1 Score (Macro):    45.3%
F1 Score (Weighted): 57.7%
F1 Score (Sample):   54.8%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 11 Training: 100%|██████████| 996/996 [01:03<00:00, 15.65it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 11, Train Loss: 0.787789459
F1 Score (Micro):    52.7%
F1 Score (Macro):    45.8%
F1 Score (Weighted): 58.1%
F1 Score (Sample):   55.3%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 12 Training: 100%|██████████| 996/996 [01:03<00:00, 15.75it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 12, Train Loss: 0.797108424
F1 Score (Micro):    52.8%
F1 Score (Macro):    45.7%
F1 Score (Weighted): 58.1%
F1 Score (Sample):   55.3%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 13 Training: 100%|██████████| 996/996 [01:02<00:00, 15.86it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 13, Train Loss: 0.793405984
F1 Score (Micro):    52.7%
F1 Score (Macro):    45.8%
F1 Score (Weighted): 58.1%
F1 Score (Sample):   55.2%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 14 Training: 100%|██████████| 996/996 [01:02<00:00, 15.90it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 14, Train Loss: 0.800184739
F1 Score (Micro):    52.5%
F1 Score (Macro):    45.6%
F1 Score (Weighted): 57.9%
F1 Score (Sample):   55.0%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 15 Training: 100%|██████████| 996/996 [01:03<00:00, 15.81it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 15, Train Loss: 0.824680950
F1 Score (Micro):    52.4%
F1 Score (Macro):    45.6%
F1 Score (Weighted): 58.0%
F1 Score (Sample):   55.0%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 16 Training: 100%|██████████| 996/996 [01:02<00:00, 15.85it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 16, Train Loss: 0.813551928
F1 Score (Micro):    52.5%
F1 Score (Macro):    45.7%
F1 Score (Weighted): 57.9%
F1 Score (Sample):   55.0%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 17 Training: 100%|██████████| 996/996 [01:03<00:00, 15.69it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 17, Train Loss: 0.804795247
F1 Score (Micro):    52.8%
F1 Score (Macro):    45.8%
F1 Score (Weighted): 58.2%
F1 Score (Sample):   55.3%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 18 Training: 100%|██████████| 996/996 [01:03<00:00, 15.66it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 18, Train Loss: 0.809111725
F1 Score (Micro):    52.6%
F1 Score (Macro):    45.7%
F1 Score (Weighted): 58.0%
F1 Score (Sample):   55.2%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 19 Training: 100%|██████████| 996/996 [01:02<00:00, 15.88it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 19, Train Loss: 0.818967652
F1 Score (Micro):    52.3%
F1 Score (Macro):    45.4%
F1 Score (Weighted): 57.7%
F1 Score (Sample):   54.8%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 20 Training: 100%|██████████| 996/996 [01:02<00:00, 15.90it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 20, Train Loss: 0.802912372
F1 Score (Micro):    53.1%
F1 Score (Macro):    46.1%
F1 Score (Weighted): 58.4%
F1 Score (Sample):   55.3%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Draw the graphs

### Import Library

In [None]:
import matplotlib.pyplot as plt

In [None]:
# plot the mean training loss of every epoch
# x-axis: epoch number (1-based), y-axis: loss -- the labels used to be swapped
epoch_numbers = range(1, len(train_losses) + 1)
plt.title("Training Losses vs Epoch")
plt.plot(epoch_numbers, train_losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()

## Others (Saving the Model, Creating a Submission File, Loading the Model)

In [13]:
# Persist only the model's state_dict (not the module object itself) to a
# .pth file in the parent directory; a later cell reloads it from this path.
torch.save(model.state_dict(), '../regnet_x_1_6gf_True_True_19.pth')

In [14]:
# create a file for test submission
model.eval()

# the context manager closes the file even when inference raises
with open('../submission.csv', "w") as f:
	f.write("ImageID,Labels\n")

	# inference only: no gradients are needed, so skip building the autograd graph
	with torch.no_grad():
		for image_names, images, _ in tqdm(test_dataloader, desc="Testing: "):

			if torch.cuda.is_available():
				images = images.to('cuda')

			outputs = model(images)
			probabilities = torch.sigmoid(outputs)
			predicted = (probabilities > THRESHOLD).int()

			# write the output
			for i, predicted_label in enumerate(predicted):
				# indices of the classes whose probability exceeds the threshold
				label = predicted_label.nonzero().flatten()

				# When no class exceeds the threshold, fall back to the single most
				# probable class. The previous comparison against the row maximum
				# selected every class in that case (max == 0 matches all entries).
				if label.numel() == 0:
					label = probabilities[i].argmax().reshape(1)

				# NOTE: add 1 to the output of predicted! (class numbers are 1-based)
				label = (label + 1).tolist()
				label = " ".join(str(x) for x in label)

				f.write(image_names[i].split("/")[-1] + "," + label + "\n")

Testing: 100%|██████████| 625/625 [00:37<00:00, 16.50it/s]


In [16]:
# rebuild the same architecture that was used for training
model = PopularModels(
    choice="regnet_x_1_6gf",
    pretrained=True,
    freeze=True,
    n_out=19
).get_model()

# map_location='cpu' lets a checkpoint saved from a GPU be restored on a machine
# without CUDA; load_state_dict then copies the values into the freshly built
# model. The call stays the last expression so the key-matching report is shown.
model.load_state_dict(
    torch.load('../regnet_x_1_6gf_True_True_19.pth', map_location='cpu')
)

<All keys matched successfully>

: 