# Text Classification - Embedding Finetune

----

## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - Loading our data from pandas
* Dataset - Create PyTorch Dataset
* Model - Create PyTorch Vanilla Model
* Helper - Training helper functions
* Training - Training Loop


## $\color{blue}{Preamble:}$

This note book will create and train a classification model based on the vanilla embeddings. The notebook will establish hyper-parameter tuning technique that can be recycled for other classifiers.

## $\color{blue}{Admin:}$

In [1]:
from google.colab import drive

In [2]:
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'


Mounted at /content/drive
/content/drive/MyDrive


In [3]:
%%capture
!pip install torch
!pip install dill

In [4]:
import torch
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
import os
from getpass import getpass
from huggingface_hub import login

# Prompt for your Hugging Face token securely
token = getpass("Please enter your Hugging Face token: ")

Please enter your Hugging Face token: ··········


In [6]:
# Use the token for Hugging Face login
if token:
    print("HuggingFace token has been successfully entered.")
    login(token=token)
else:
    print("Continuing without Hugging Face login")

HuggingFace token has been successfully entered.
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## $\color{blue}{Load:}$

In [7]:
import pandas as pd
path = "class/datasets/"
df_train = pd.read_pickle(path + "df_train")
df_dev = pd.read_pickle(path + "df_dev")
df_test = pd.read_pickle(path + "df_test")

df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)
df_test= df_test.reset_index(drop=True)

In [8]:
df_train.head()

Unnamed: 0,index,master,book_idx,book,chapter_idx,chapter,author,content,vanilla_embedding,vanilla_embedding.1,ft_embedding,ft_embedding_pal
0,8114,Dubliners,3,Dubliners,31,GRACE,Joyce,“Is it John of Tuam?” “Are you sure of that ...,"[-0.012913608, -0.026916211, 0.0023321153, -0....","[tensor(-0.0129), tensor(-0.0269), tensor(0.00...","[-0.033624846, -0.028869793, 0.015241957, -0.0...","[0.05582258, -0.052688017, 0.02023214, -0.0299..."
1,4951,Ulysses,2,Nostos,15,Eumaeus,Joyce,sibly there were several others. He personally...,"[-0.019626686, -0.035692617, -0.034875672, 0.0...","[tensor(-0.0196), tensor(-0.0357), tensor(-0.0...","[-0.018538317, 0.021703975, -0.015190964, 0.03...","[-0.008198347, -0.038665075, -0.109002225, 0.0..."
2,4629,Ulysses,2,Nostos,15,Eumaeus,Joyce,"Stephen, who was trying his dead best to yawn ...","[0.015934143, -0.0034991587, 0.0035751674, 0.0...","[tensor(0.0159), tensor(-0.0035), tensor(0.003...","[-0.017514218, 0.023529347, -0.013798427, 0.03...","[0.07348006, 0.02775626, -0.009280508, 0.00855..."
3,11556,Dracula,4,Dracula,59,CHAPTER XXVII: MINA HARKER’S JOURNAL,Bram Stoker,"Now to the historical, for as Madam Mina write...","[-4.009433e-05, -0.0041142944, 0.026873538, -0...","[tensor(-4.0125e-05), tensor(-0.0041), tensor(...","[-0.0038039144, -0.0067709954, 0.011352386, -0...","[0.006440056, -0.00080459623, -0.012438459, -0..."
4,12262,Republic,5,Republic,62,Book III,Plato,The harmonies which you mean are the mixed or ...,"[0.0048890463, -0.0060007297, 0.0054147574, -0...","[tensor(0.0049), tensor(-0.0060), tensor(0.005...","[0.06594565, 0.068737105, -0.015259049, -0.030...","[-0.016349742, -0.04981008, -0.005950525, 0.02..."


## $\color{blue}{Dataset:}$

In [9]:
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm import tqdm
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")

# get the texts and tokenize
train_texts = [df_train.loc[i]['content'] for i in range(df_train.shape[0])]
dev_texts = [df_dev.loc[i]['content'] for i in range(df_dev.shape[0])]
test_texts = [df_test.loc[i]['content'] for i in range(df_test.shape[0])]

train_tokens = []
train_attention_masks = []
dev_tokens = []
dev_attention_masks = []
test_tokens = []
test_attention_masks = []

for sent in tqdm(train_texts):
  encoding = tokenizer(sent, truncation=True, padding='max_length', return_tensors='pt')
  train_tokens.append(encoding['input_ids'].squeeze(0))
  train_attention_masks.append(encoding['attention_mask'].squeeze(0))

for sent in tqdm(dev_texts):
  encoding = tokenizer(sent, truncation=True, padding='max_length', return_tensors='pt')
  dev_tokens.append(encoding['input_ids'].squeeze(0))
  dev_attention_masks.append(encoding['attention_mask'].squeeze(0))

for sent in tqdm(test_texts):
  encoding = tokenizer(sent, truncation=True, padding='max_length', return_tensors='pt')
  test_tokens.append(encoding['input_ids'].squeeze(0))
  test_attention_masks.append(encoding['attention_mask'].squeeze(0))


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 12000/12000 [00:08<00:00, 1429.98it/s]
100%|██████████| 964/964 [00:00<00:00, 1445.94it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1386.96it/s]


In [None]:
import random
from collections import defaultdict
import torch
from torch.utils.data import Dataset, DataLoader, Sampler, SequentialSampler

class CustomTripletDataset(Dataset):
    def __init__(self, tokens, attention_masks, labels):
        self.tokens = tokens
        self.attention_masks = attention_masks
        self.labels = torch.Tensor(labels)
        self.label_dict = defaultdict(list)


        for i in range(len(tokens)):
            self.label_dict[int(self.labels[i])].append(i)
        self.unique_classes = list(self.label_dict.keys())

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, index):
        ids = self.tokens[index].to(device)
        ams = self.attention_masks[index].to(device)
        y = self.labels[index].to(device)
        return ids, ams, y


In [None]:
class CustomBatchSampler(SequentialSampler):
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.unique_classes = sorted(dataset.unique_classes)
        self.label_dict = dataset.label_dict
        self.num_batches = len(self.dataset) // self.batch_size
        self.class_size = self.batch_size // 4

    def __iter__(self):
        total_samples_used = 0
        weights = np.repeat(1, len(self.unique_classes))

        while total_samples_used < len(self.dataset):
            batch = []
            classes = []
            for _ in range(4):
                next_selected_class = self._select_class(weights)
                while next_selected_class in classes:
                  next_selected_class = self._select_class(weights)
                weights[next_selected_class] += 1
                classes.append(next_selected_class)
                new_choices = self.label_dict[next_selected_class]
                remaining_samples = list(np.random.choice(new_choices, min(self.class_size, len(new_choices)), replace=False))
                batch.extend(remaining_samples)

            total_samples_used += len(batch)

            yield batch

    def _select_class(self, weights):
        dist = 1/weights
        dist = dist/np.sum(dist)
        selected = int(np.random.choice(self.unique_classes, p=dist))
        return selected

    def __len__(self):
        return self.num_batches


In [None]:
class CustomValidationSampler(SequentialSampler):
    def __init__(self, batches):
        self.batches = batches

    def __iter__(self):
      return iter(self.batches)

    def __len__(self):
      return len(self.batches)

In [None]:
train_labels = [df_train.loc[i]['chapter_idx'] for i in range(df_train.shape[0])]
dev_labels = [df_dev.loc[i]['chapter_idx'] for i in range(df_dev.shape[0])]
test_labels = [df_test.loc[i]['chapter_idx'] for i in range(df_test.shape[0])]

train_dataset = CustomTripletDataset(train_tokens, train_attention_masks, train_labels)
dev_dataset = CustomTripletDataset(dev_tokens, dev_attention_masks, dev_labels)
test_dataset = CustomTripletDataset(test_tokens, test_attention_masks, test_labels)

cannot enfore deterministic behavior in custom batch sampler for reproductability of batches and consistent metrics, building a solution to generate batches beforehand

In [None]:
import dill

def save_object(obj, filename):
  with open(filename, 'wb') as file:
    dill.dump(obj, file)
  print(f"Object saved to {filename}")

def load_object(filename):
  with open(filename, 'rb') as file:
    obj = dill.load(file)
  print(f"Object loaded from {filename}")
  return obj

In [None]:
def generate_batches(dataset):
  batches = []
  np.random.seed(42)
  total_samples = 0
  while total_samples < len(dataset):
    batch = []
    for i in range(4):
      cat = int(np.random.choice(dataset.unique_classes))
      batch.extend(dataset.label_dict[cat][-8:])
    batches.append(batch)
    total_samples += len(batch)
  return batches

In [None]:
# path = 'class/misc/'
# dev_batches = generate_batches(dev_dataset)
# test_batches = generate_batches(test_dataset)
# save_object(dev_batches, path + 'dev_batches')
# save_object(test_batches, path + 'test_batches')

Object saved to class/misc/dev_batches
Object saved to class/misc/test_batches


In [None]:
path = 'class/misc/'
dev_batches = load_object(path + 'dev_batches')
test_batches = load_object(path + 'test_batches')

Object loaded from class/misc/dev_batches
Object loaded from class/misc/test_batches


In [None]:
# Create the custom batch sampler
batch_size = 32
train_batch_sampler = CustomBatchSampler(train_dataset, batch_size)
dev_batch_sampler = CustomValidationSampler(dev_batches)
test_batch_sampler = CustomValidationSampler(test_batches)

In [None]:
# Create DataLoader with the custom batch sampler
train_loader = DataLoader(train_dataset, batch_sampler=train_batch_sampler)
dev_loader = DataLoader(dev_dataset, batch_sampler=dev_batch_sampler)
test_loader = DataLoader(test_dataset, batch_sampler=test_batch_sampler)

In [None]:
# for batch_idx, (toks, ams, ys) in enumerate(dev_loader):
#     print(ys)

## $\color{blue}{Model:}$

In [None]:
import torch.nn.functional as F
import torch.nn as nn

class EmbeddingModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def average_pool(self, last_hidden_states, attention_mask):
        # average the token embeddings
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = self.average_pool(last_hidden_state, attention_mask)
        normalized_output = F.normalize(pooled_output, p=2, dim=1)
        return normalized_output

In [None]:
base_model = AutoModel.from_pretrained("thenlper/gte-base")
model = EmbeddingModel(base_model)
sum(p.numel() for p in model.parameters() if p.requires_grad)

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

109482240

## $\color{blue}{Helper:}$

In [None]:
import numpy as np

def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    epoch_train_losses = []

    for idx, (ids, attention_masks, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        embeddings = model(ids, attention_masks)

        distance_matrix = torch.cdist(embeddings, embeddings, p=2) # Create square distance matrix

        anchors = []
        positives = []
        negatives = []


        for i in range(len(labels)):

            anchor_label = labels[i].item()
            anchor_distance = distance_matrix[i] # distance between anchor and all other points

            # Hardest positive (farthest in the same class)
            hardest_positive_idx = (labels == anchor_label).nonzero(as_tuple=True)[0] # all same class indices
            hardest_positive_idx = hardest_positive_idx[hardest_positive_idx != i] # disclude own label
            hardest_positive = hardest_positive_idx[anchor_distance[hardest_positive_idx].argmax()] # label of furthest same class

            # Hardest negative (closest from different class)
            hardest_negative_idx = (labels != anchor_label).nonzero(as_tuple=True)[0] # all diff class indices
            hardest_negative = hardest_negative_idx[anchor_distance[hardest_negative_idx].argmin()] # label of closest different class

            # load selected
            anchors.append(embeddings[i])
            positives.append(embeddings[hardest_positive])
            negatives.append(embeddings[hardest_negative])

        # Convert lists to tensors
        anchors = torch.stack(anchors)
        positives = torch.stack(positives)
        negatives = torch.stack(negatives)

        # Calculate loss
        loss = criterion(anchors, positives, negatives)
        epoch_train_losses.append(loss.item())

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        # Update Learning Rate
        scheduler.step()

    return np.mean(epoch_train_losses)

In [None]:
def validate(model, dev_loader, criterion):
    model.eval()
    epoch_dev_losses = []

    with torch.no_grad():
        for idx, (ids, attention_masks, labels) in enumerate(train_loader):

            embeddings = model(ids, attention_masks)

            distance_matrix = torch.cdist(embeddings, embeddings, p=2) # Create square distance matrix

            anchors = []
            positives = []
            negatives = []

            for i in range(len(labels)):

                anchor_label = labels[i].item()
                anchor_distance = distance_matrix[i] # distance between anchor and all other points

                # Hardest positive (farthest in the same class)
                hardest_positive_idx = (labels == anchor_label).nonzero(as_tuple=True)[0] # all same class indices
                hardest_positive_idx = hardest_positive_idx[hardest_positive_idx != i] # disclude own label
                hardest_positive = hardest_positive_idx[anchor_distance[hardest_positive_idx].argmax()] # label of furthest same class

                # Hardest negative (closest from different class)
                hardest_negative_idx = (labels != anchor_label).nonzero(as_tuple=True)[0] # all diff class indices
                hardest_negative = hardest_negative_idx[anchor_distance[hardest_negative_idx].argmin()] # label of closest different class

                # load selected
                anchors.append(embeddings[i])
                positives.append(embeddings[hardest_positive])
                negatives.append(embeddings[hardest_negative])

            # Convert lists to tensors
            anchors = torch.stack(anchors)
            positives = torch.stack(positives)
            negatives = torch.stack(negatives)

            # Calculate loss
            loss = criterion(anchors, positives, negatives)
            epoch_dev_losses.append(loss.item())

    return np.mean(epoch_dev_losses)

In [None]:
from collections import namedtuple
Stats = namedtuple('Stats', [
    'train_loss',
    'dev_loss',
    'epoch',
    'lr',
    'alpha'
])

In [None]:
def search_stats(results):
  best_stats = None
  min_dev_loss = 1e7
  for i in range(len(results)):
    loss = results[i].dev_loss
    if loss < min_dev_loss:
      best_stats = results[i]
      min_dev_loss = loss
  return best_stats

## $\color{blue}{Training:}$

In [None]:
def tv_run(epochs, model, lr, alpha, min_loss, path, verbose = 0):
  """
  Runs a training setup
  verbose == 1 - print model results
  verbose == 2 -> print epoch and model results
  """

  # Prepare data loaders
  train_loader = DataLoader(train_dataset, batch_sampler=train_batch_sampler)
  dev_loader = DataLoader(dev_dataset, batch_sampler=dev_batch_sampler)

  # Set up new model
  criterion = nn.TripletMarginLoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=alpha)

  total_steps = len(train_loader) * epochs
  warmups = total_steps // 12 # 8%

  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmups,  # Proportion of the training to perform learning rate warmup
    num_training_steps=total_steps
  )



  # Hold epoch stats
  train_losses = []
  dev_losses = []
  epoch_holder = []

  # Break if no improvement
  current_best = 1e7
  no_improvement = 0

  # Run epochs
  for epoch in range(epochs):

    # break out of epochs
    if no_improvement >= 5:
      break

    # call training and validation functions
    train_loss = train(model, train_loader, criterion, optimizer, scheduler)
    dev_loss = validate(model, dev_loader, criterion)

    # Store epoch stats
    train_losses.append(train_loss)
    dev_losses.append(dev_loss)
    epoch_holder.append(epoch + 1)

    # check for improvement
    if dev_loss < current_best:
      current_best = dev_loss
      no_improvement = 0
    else:
      no_improvement += 1

    # save best model
    if dev_loss < min_loss:
      torch.save(model.state_dict(), path)
      min_loss = dev_loss

    # optionally print epoch results
    if verbose == 2:
      print(f'\n --------- \nEpoch: {epoch + 1}\n')
      print(f'Epoch {epoch + 1} train loss: {train_loss:.4f}')
      print(f'Epoch {epoch + 1} dev loss: {dev_loss:.4f}')

  # save best results
  min_ind = np.argmin(dev_loss)

  stats = Stats(
      train_losses[min_ind],
      dev_losses[min_ind],
      epoch_holder[min_ind],
      lr,
      alpha,
  )

  # optionally print model results
  if verbose in [1,2]:
    print('\n ######## \n')
    print(f'lr:{stats.lr}, alpha:{stats.alpha} @ epoch {stats.epoch}.')
    print(f'TL:{stats.train_loss}')
    print(f'DL:{stats.dev_loss}')

  return stats

In [None]:
"""
Main Admin
"""
epochs = 15
min_loss = 1e7
path = "class/models/embedding_ft.1.pt"
results = []

for lr in [0.00005]:
    for alpha in [0.01]:


      # define model
      model = EmbeddingModel(base_model)
      model = model.to(device)

      # run training
      res = tv_run(epochs, model, lr, alpha, min_loss, path, verbose = 2)
      min_loss = res.dev_loss
      results.append(res)

      # get best result of the round or even so far
      stats = search_stats(results)
      print(stats) # debug


 --------- 
Epoch: 1

Epoch 1 train loss: 1.0435
Epoch 1 dev loss: 1.0247

 --------- 
Epoch: 2

Epoch 2 train loss: 1.0089
Epoch 2 dev loss: 1.0077

 --------- 
Epoch: 3

Epoch 3 train loss: 1.0050
Epoch 3 dev loss: 1.0023

 --------- 
Epoch: 4

Epoch 4 train loss: 1.0056
Epoch 4 dev loss: 1.0059

 --------- 
Epoch: 5

Epoch 5 train loss: 1.0041
Epoch 5 dev loss: 1.0004

 --------- 
Epoch: 6

Epoch 6 train loss: 0.9802
Epoch 6 dev loss: 0.9407

 --------- 
Epoch: 7

Epoch 7 train loss: 0.8536
Epoch 7 dev loss: 0.7224

 --------- 
Epoch: 8

Epoch 8 train loss: 0.7114
Epoch 8 dev loss: 0.7864

 --------- 
Epoch: 9

Epoch 9 train loss: 0.6596
Epoch 9 dev loss: 0.6985

 --------- 
Epoch: 10

Epoch 10 train loss: 0.6744
Epoch 10 dev loss: 0.7250

 --------- 
Epoch: 11

Epoch 11 train loss: 0.6462
Epoch 11 dev loss: 0.6312

 --------- 
Epoch: 12

Epoch 12 train loss: 0.6558
Epoch 12 dev loss: 0.6186

 --------- 
Epoch: 13

Epoch 13 train loss: 0.6388
Epoch 13 dev loss: 0.5955

 --------- 


In [None]:
import dill
def save_results_to_file(namedtuples, filename):
    """Saves a list of namedtuples to a specified file using dill."""
    with open(filename, 'wb') as f:
        dill.dump(namedtuples, f)

def load_results_from_file(filename):
    """Loads a list of namedtuples from a specified file using dill."""
    with open(filename, 'rb') as f:
        return dill.load(f)

In [None]:
path = 'class/results/'
save_results_to_file(results, path + 'embedding_ft.1.pk')