# Text Classification - Embedding PAL FT - classify

----

## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - Loading our pickled data with pandas
* Dataset - Create PyTorch Dataset
* Model - Create PyTorch Vanilla model
* Helper - Training helper functions
* Training - Training Loop


## $\color{blue}{Preamble:}$

This notebook creates and trains a classification model on top of the Embedding FT (batch-hard triplet loss) embeddings.

## $\color{blue}{Admin:}$

In [None]:
from google.colab import drive

In [None]:
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'


Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
!pip install torch
!pip install dill

In [None]:
import torch
import numpy as np
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## $\color{blue}{Load:}$

In [None]:
import pandas as pd
# Directory that holds the pickled train / dev / test DataFrames.
path = "class/datasets/"

# Read the three splits in train, dev, test order.
# NOTE: read_pickle unpickles the files, so only point it at trusted data.
df_train, df_dev, df_test = (
    pd.read_pickle(path + split_name)
    for split_name in ("df_train", "df_dev", "df_test")
)

In [None]:
# Preview the first rows of the training DataFrame.
df_train.head()

Unnamed: 0,index,master,book_idx,book,chapter_idx,chapter,author,content,vanilla_embedding,vanilla_embedding.1,ft_embedding
0,8114,Dubliners,3,Dubliners,31,GRACE,Joyce,“Is it John of Tuam?” “Are you sure of that ...,"[-0.012913608, -0.026916211, 0.0023321153, -0....","[tensor(-0.0129), tensor(-0.0269), tensor(0.00...","[-0.03558476, -0.032069266, 0.016694317, -0.01..."
1,4951,Ulysses,2,Nostos,15,Eumaeus,Joyce,sibly there were several others. He personally...,"[-0.019626686, -0.035692617, -0.034875672, 0.0...","[tensor(-0.0196), tensor(-0.0357), tensor(-0.0...","[-0.020282326, 0.019139778, -0.013562409, 0.03..."
2,4629,Ulysses,2,Nostos,15,Eumaeus,Joyce,"Stephen, who was trying his dead best to yawn ...","[0.015934143, -0.0034991587, 0.0035751674, 0.0...","[tensor(0.0159), tensor(-0.0035), tensor(0.003...","[-0.018985722, 0.021503495, -0.012215637, 0.03..."
3,11556,Dracula,4,Dracula,59,CHAPTER XXVII: MINA HARKER’S JOURNAL,Bram Stoker,"Now to the historical, for as Madam Mina write...","[-4.009433e-05, -0.0041142944, 0.026873538, -0...","[tensor(-4.0125e-05), tensor(-0.0041), tensor(...","[0.005192333, -0.0079266345, 0.0034984224, -0...."
4,12262,Republic,5,Republic,62,Book III,Plato,The harmonies which you mean are the mixed or ...,"[0.0048890463, -0.0060007297, 0.0054147574, -0...","[tensor(0.0049), tensor(-0.0060), tensor(0.005...","[0.06559832, 0.06647674, -0.011808932, -0.0327..."


## $\color{blue}{Dataset:}$

In [None]:
# For every split: turn each entry of the 'ft_embedding' column into a tensor,
# stack them into one 2-D tensor (one row per sample) and move it to `device`.
train_embeddings = list(map(torch.tensor, df_train['ft_embedding']))
train_x = torch.stack(train_embeddings).to(device)

dev_embeddings = list(map(torch.tensor, df_dev['ft_embedding']))
dev_x = torch.stack(dev_embeddings).to(device)

test_embeddings = list(map(torch.tensor, df_test['ft_embedding']))
test_x = torch.stack(test_embeddings).to(device)

In [None]:
# Labels are the chapter indices, stored as LongTensors on `device`.
# To classify by book instead, read the 'book_idx' column here.
train_y, dev_y, test_y = (
    torch.LongTensor(list(frame['chapter_idx'])).to(device)
    for frame in (df_train, df_dev, df_test)
)

In [None]:
from torch.utils.data import Dataset, DataLoader
# Assumes x and y are already tensors and already on the target device.
class VanillaDataset(Dataset):
  """Map-style dataset that pairs each feature row with its label.

  Both containers are kept by reference and indexed as-is; nothing is copied,
  converted or moved between devices here.
  """

  def __init__(self, x, y):
    # x: indexable features, y: indexable labels in the same order.
    self.x, self.y = x, y

  def __len__(self):
    # The label container defines the number of samples.
    return len(self.y)

  def __getitem__(self, index):
    # (features, label) pair stored at `index`.
    return self.x[index], self.y[index]


In [None]:
# Wrap every (features, labels) pair in a VanillaDataset, one per split.
train_dataset, dev_dataset, test_dataset = (
    VanillaDataset(features, labels)
    for features, labels in (
        (train_x, train_y),
        (dev_x, dev_y),
        (test_x, test_y),
    )
)

In [None]:
# Sanity check: size of a single feature vector from the training set.
train_dataset[0][0].size()

torch.Size([768])

## $\color{blue}{Model:}$

In [None]:
import torch.nn as nn

class DenseBlock(nn.Module):
    """Fully connected block: Linear -> BatchNorm1d -> ReLU -> Dropout."""

    def __init__(self, input_size, output_size, dropout_rate):
        super().__init__()
        # Sub-modules are registered in the order forward() applies them.
        self.linear = nn.Linear(input_size, output_size)
        self.batch_norm = nn.BatchNorm1d(output_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Project and normalise first, then apply the non-linearity and dropout.
        normalised = self.batch_norm(self.linear(x))
        return self.dropout(self.activation(normalised))

class VanillaModel(nn.Module):
    """MLP classifier: two DenseBlocks followed by a linear output layer.

    Args:
        dropout_rate: dropout probability used inside both dense blocks.
        input_size: length of each input vector (default 768).
        hidden_sizes: output widths of the two dense blocks, as a pair
            (default (400, 200)).
        num_classes: number of logits produced per sample (default 70).

    The defaults reproduce the original fixed 768 -> 400 -> 200 -> 70 layout,
    so ``VanillaModel(dropout_rate)`` builds the same network as before and
    previously saved ``state_dict`` files still load.
    """

    def __init__(self, dropout_rate, input_size=768, hidden_sizes=(400, 200),
                 num_classes=70):
        super().__init__()
        first_hidden, second_hidden = hidden_sizes

        # Define the dense blocks
        self.block1 = DenseBlock(input_size, first_hidden, dropout_rate)
        self.block2 = DenseBlock(first_hidden, second_hidden, dropout_rate)
        self.final_layer = nn.Linear(second_hidden, num_classes)

        self.initialize_weights()

    def forward(self, x):
        """Return the raw class logits for a batch ``x`` of input vectors."""
        x = self.block1(x)  # B x input_size   -> B x hidden_sizes[0]
        x = self.block2(x)  # B x hidden_sizes[0] -> B x hidden_sizes[1]
        x = self.final_layer(x)  # B x hidden_sizes[1] -> B x num_classes

        return x

    def initialize_weights(self):
        """Xavier-initialise every nn.Linear weight and zero its bias.

        ``self.modules()`` walks the whole module tree, so the linear layers
        inside the dense blocks are covered as well as ``final_layer``.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)


## $\color{blue}{Helper:}$

In [None]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals its label.

    ``outputs`` holds one row of class scores per sample; ``labels`` holds the
    matching class indices.
    """
    # Index of the largest score along dim 1 is the predicted class.
    predicted = torch.max(outputs, 1).indices

    # Number of predictions that match the labels, as a Python int.
    hits = (predicted == labels).sum().item()

    # Divide by the number of samples in the batch.
    return hits / labels.size(0)

In [None]:
import numpy as np

def train(model, train_loader, criterion, optimizer):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: network to optimise; it is switched to training mode.
        train_loader: iterable yielding ``(x, y)`` batches.
        criterion: loss function called as ``criterion(model(x), y)``.
            Assumed to return a per-sample mean (the default reduction of
            ``nn.CrossEntropyLoss``) - confirm if a different loss is used.
        optimizer: optimiser whose gradients are reset and stepped per batch.

    Returns:
        Epoch loss and accuracy, each averaged over the batches with every
        batch weighted by its number of samples. The weighting keeps a
        smaller final batch from counting as much as a full one. Both values
        are NaN when the loader yields no samples.
    """
    model.train()
    batch_losses = []
    batch_accuracies = []
    batch_sizes = []

    for x, y in train_loader:
        optimizer.zero_grad()

        out = model(x)
        train_loss = criterion(out, y)

        # Record the metrics before the weights are updated for this batch.
        batch_losses.append(train_loss.item())
        batch_accuracies.append(accuracy(out, y))
        batch_sizes.append(y.size(0))

        # Backpropagation and optimization
        train_loss.backward()
        optimizer.step()

    if sum(batch_sizes) == 0:
        # Nothing was seen; np.average would divide by a zero weight sum.
        return np.nan, np.nan

    return (
        np.average(batch_losses, weights=batch_sizes),
        np.average(batch_accuracies, weights=batch_sizes),
    )

In [None]:
def validate(model, dev_loader, criterion):
    """Evaluate ``model`` on ``dev_loader`` and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: network to evaluate; it is switched to eval mode.
        dev_loader: iterable yielding ``(x, y)`` batches.
        criterion: loss function called as ``criterion(model(x), y)``.
            Assumed to return a per-sample mean (the default reduction of
            ``nn.CrossEntropyLoss``) - confirm if a different loss is used.

    Returns:
        Loss and accuracy averaged over the batches, each batch weighted by
        its number of samples so that a smaller final batch does not skew the
        result. Both values are NaN when the loader yields no samples.
    """
    model.eval()
    batch_losses = []
    batch_accuracies = []
    batch_sizes = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dev_loader:
            out = model(x)

            batch_losses.append(criterion(out, y).item())
            batch_accuracies.append(accuracy(out, y))
            batch_sizes.append(y.size(0))

    if sum(batch_sizes) == 0:
        # Nothing was seen; np.average would divide by a zero weight sum.
        return np.nan, np.nan

    return (
        np.average(batch_losses, weights=batch_sizes),
        np.average(batch_accuracies, weights=batch_sizes),
    )

In [None]:
from collections import namedtuple
# Immutable record describing one training run.
Stats = namedtuple(
    typename='Stats',
    field_names=(
        'train_loss', 'train_accuracy',  # training metrics of the reported epoch
        'dev_loss', 'dev_accuracy',      # dev metrics of the reported epoch
        'epoch',                         # 1-based number of the reported epoch
        'bs', 'lr', 'alpha',             # batch size, learning rate, weight decay
        'max_accuracy',                  # best dev accuracy over all runs so far
    ),
)

In [None]:
def _round_sig(value, digits=3):
  """Round ``value`` to ``digits`` significant figures."""
  return float(f'{value:.{digits}g}')


def gen_config(lr_low, lr_high, alpha_low, alpha_high, b_size, b_step):
  """Draw one random hyperparameter configuration.

  Args:
    lr_low, lr_high: base-10 exponent bounds for the learning rate.
    alpha_low, alpha_high: base-10 exponent bounds for the weight decay.
    b_size: log2 of the centre batch size.
    b_step: distance (in log2 units) to the two neighbouring batch sizes.

  Returns:
    ``(lr, alpha, bs)``. ``lr`` and ``alpha`` are sampled log-uniformly and
    rounded to 3 significant figures; ``bs`` is ``2**k`` with ``k`` picked
    from ``{b_size - b_step, b_size, b_size + b_step}``.

  Rounding uses significant figures rather than a fixed number of decimals:
  ``round(x, 6)`` keeps only one or two digits of values near 1e-5 and turns
  anything below about 5e-7 into 0.0, which breaks a later ``log10``.
  """
  bs_exponents = [b_size - b_step, b_size, b_size + b_step]
  bs = int(2**np.random.choice(bs_exponents))
  lr = _round_sig(10**float(np.random.uniform(lr_low, lr_high)))
  alpha = _round_sig(10**float(np.random.uniform(alpha_low, alpha_high)))
  return lr, alpha, bs

In [None]:
def gen_ranges(lr, lr_range, alpha, alpha_range, b_size, iteration):
  """Build the next search window centred on the given hyperparameters.

  ``lr`` and ``alpha`` are the window centres and ``lr_range`` /
  ``alpha_range`` the full window widths. The batch-size step is
  ``2 - iteration``.

  Returns ``(lr_low, lr_high, alpha_low, alpha_high, b_size, b_step)``.
  """
  half_lr = lr_range/2
  half_alpha = alpha_range/2

  window = (lr - half_lr, lr + half_lr, alpha - half_alpha, alpha + half_alpha)

  return window + (b_size, 2 - iteration)

In [None]:
def search_stats(results):
  """Return the entry of ``results`` with the highest ``dev_accuracy``.

  Ties are resolved in favour of the earliest entry. ``None`` is returned
  only when there is no usable entry (empty input, or every accuracy is NaN).
  The first usable entry always becomes the initial best, so a set of runs
  that all scored 0 still yields a result instead of ``None``.
  """
  best_stats = None
  for candidate in results:
    acc = candidate.dev_accuracy
    if acc != acc:
      # NaN is the only value not equal to itself; it cannot be ranked.
      continue
    if best_stats is None or acc > best_stats.dev_accuracy:
      best_stats = candidate
  return best_stats

## $\color{blue}{Training:}$

In [None]:
def tv_run(epochs, model, bs, lr, alpha, max_accuracy, path, verbose = 0,
           patience = 4, min_bs = 16):
  """
  Train ``model`` for up to ``epochs`` epochs and report its best dev epoch.

  Args:
    epochs: maximum number of epochs to run (must be at least 1).
    model: network to train; it is moved to the global ``device``.
    bs: requested batch size; values below ``min_bs`` are raised to ``min_bs``.
    lr: learning rate passed to AdamW.
    alpha: weight decay passed to AdamW.
    max_accuracy: best dev accuracy reached by earlier runs; the model
      weights are written to ``path`` whenever an epoch beats it.
    path: file the best ``state_dict`` is saved to.
    verbose: 0 prints nothing, 1 prints the run summary, 2 additionally
      prints the metrics of every epoch.
    patience: number of consecutive epochs without a new best dev accuracy
      (within this run) after which training stops early.
    min_bs: smallest batch size that is actually used.

  Returns:
    A ``Stats`` record with the metrics of the epoch that had the highest dev
    accuracy in this run, the hyperparameters used, and the (possibly
    updated) ``max_accuracy``.

  Raises:
    ValueError: if ``epochs`` is smaller than 1, because no epoch would be
      available to report.
  """
  if epochs < 1:
    raise ValueError(f'epochs must be at least 1, got {epochs}')

  bs = max(bs, min_bs)

  # Set up loss and optimiser for the model that was passed in
  model = model.to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=alpha)

  # Prepare data loaders (only the training data is shuffled)
  train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
  dev_loader = DataLoader(dev_dataset, batch_size=bs)

  # Hold epoch stats
  train_losses = []
  train_accuracy = []
  dev_losses = []
  dev_accuracy = []
  epoch_holder = []

  # Early-stopping state for this run
  current_best = 0
  no_improvement = 0

  # Run epochs
  for epoch in range(epochs):

    # stop once the dev accuracy has stalled for `patience` epochs
    if no_improvement >= patience:
      break

    # call training and validation functions
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    dev_loss, dev_acc = validate(model, dev_loader, criterion)

    # Store epoch stats
    train_losses.append(train_loss)
    train_accuracy.append(train_acc)
    dev_losses.append(dev_loss)
    dev_accuracy.append(dev_acc)
    epoch_holder.append(epoch + 1)

    # check for improvement within this run
    if dev_acc > current_best:
      current_best = dev_acc
      no_improvement = 0
    else:
      no_improvement += 1

    # save the weights only when they beat every earlier run as well
    if dev_acc > max_accuracy:
      torch.save(model.state_dict(), path)
      max_accuracy = dev_acc

    # optionally print epoch results
    if verbose == 2:
      print(f'\n --------- \nEpoch: {epoch + 1}\n')
      print(f'Epoch {epoch + 1} train loss: {train_loss:.4f}')
      print(f'Epoch {epoch + 1} train accuracy: {train_acc:.4f}')
      print(f'Epoch {epoch + 1} dev loss: {dev_loss:.4f}')
      print(f'Epoch {epoch + 1} dev accuracy: {dev_acc:.4f}')

  # report the epoch with the highest dev accuracy (first one on ties)
  max_ind = np.argmax(dev_accuracy)

  stats = Stats(
      train_losses[max_ind],
      train_accuracy[max_ind],
      dev_losses[max_ind],
      dev_accuracy[max_ind],
      epoch_holder[max_ind],
      bs, lr, alpha,
      max_accuracy
  )

  # optionally print model results
  if verbose in [1,2]:
    print('\n ######## \n')
    print(f'bs:{stats.bs}, lr:{stats.lr}, alpha:{stats.alpha} @ epoch {stats.epoch}.')
    print(f'TL:{stats.train_loss}, TA:{stats.train_accuracy}.')
    print(f'DL:{stats.dev_loss}, DA:{stats.dev_accuracy}')

  return stats

In [None]:
# ---------------------------------------------------------------------------
# Main admin
# ---------------------------------------------------------------------------
epochs = 40
max_accuracy = 0
path = "class/models/embedding_ft_classify.pt"
results = []

# Initial random-search space. lr and alpha are sampled log-uniformly, so the
# bounds below are base-10 exponents:
#   lr    in [10^-5, 10^-1]
#   alpha in [10^-5, 10^-1]
# The batch size is 2**k with k drawn from {b_size - b_step, b_size,
# b_size + b_step}, i.e. {8, 32, 128} here.
# NOTE: tv_run raises any batch size below 16 to 16, so a sampled 8 is
# actually trained with 16.
lr_low = -5
lr_high = -1
lr_range = lr_high - lr_low

alpha_low = -5
alpha_high = -1
alpha_range = alpha_high - alpha_low

b_size = 5
b_step = 2

count = 0

# ---------------------------------------------------------------------------
# Hyperparameter search: 3 rounds of 27 random configurations. After every
# round the ranges are re-centred on the best run so far and shrunk to a third.
# ---------------------------------------------------------------------------
for i in range(3):
  # debug
  print(f'round: {i}')
  print(f'lr_low{lr_low}, lr_high{lr_high}, lr_range{lr_range}')
  print(f'alpha_low{alpha_low}, alpha_high{alpha_high}, alpha_range{alpha_range}')
  print(f'b_size{b_size}')
  print(f'b_step{b_step}')
  print('max', max_accuracy)

  for j in range(27):
    count += 1
    print(count)

    # get config
    lr, alpha, bs = gen_config(lr_low, lr_high, alpha_low, alpha_high, b_size, b_step)

    # define model
    model = VanillaModel(.1) # model with dropout
    model = model.to(device)

    # run training
    res = tv_run(epochs, model, bs, lr, alpha, max_accuracy, path, verbose = 0)
    max_accuracy = res.max_accuracy
    results.append(res)

  # get best result of the round or even so far
  stats = search_stats(results)
  if stats is None:
    # Without a best run there is no centre to narrow the ranges around.
    raise RuntimeError('search_stats found no best run; cannot narrow the search ranges')

  print(stats) # debug

  # reconfigure the new hypers: centre on the best run (in log space) and
  # shrink each window to a third of its previous width
  lr = np.log10(stats.lr)
  lr_range = lr_range / 3

  alpha = np.log10(stats.alpha)
  alpha_range = alpha_range / 3

  bs = np.log2(stats.bs)

  config = gen_ranges(lr, lr_range, alpha, alpha_range, bs, i + 1)
  lr_low, lr_high, alpha_low, alpha_high, b_size, b_step = config
  lr_range = lr_high - lr_low
  alpha_range = alpha_high - alpha_low


round: 0
lr_low-5, lr_high-1, lr_range4
alpha_low-5, lr_high-1, lr_range4
b_size5
b_step2
max 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
Stats(train_loss=1.9579376748402912, train_accuracy=0.3785, dev_loss=2.103779028673641, dev_accuracy=0.3780737704918033, epoch=25, bs=16, lr=5.3e-05, alpha=1.4e-05, max_accuracy=0.3780737704918033)
round: 1
lr_low-4.942390797065878, lr_high-3.6090574637325443, lr_range1.3333333333333335
alpha_low-5.520538630988429, lr_high-4.187205297655095, lr_range1.333333333333334
b_size4.0
b_step1
max 0.3780737704918033
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
Stats(train_loss=1.9579376748402912, train_accuracy=0.3785, dev_loss=2.103779028673641, dev_accuracy=0.3780737704918033, epoch=25, bs=16, lr=5.3e-05, alpha=1.4e-05, max_accuracy=0.3780737704918033)
round: 2
lr_low-4.497946352621433, lr_high-4.0535019081769885, lr_range0.44444444444444464
alpha_low-5.076094186543984, lr_high-4.631649742099

In [None]:
import dill
def save_results_to_file(namedtuples, filename):
    """Serialise ``namedtuples`` with dill into ``filename``.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(filename, 'wb') as sink:
        dill.dump(namedtuples, sink)

def load_results_from_file(filename):
    """Read back the object that was stored in ``filename`` with dill.

    NOTE: dill, like pickle, can run arbitrary code while loading; only open
    files that come from a trusted source.
    """
    with open(filename, 'rb') as source:
        loaded = dill.load(source)
    return loaded

In [None]:
# Store the Stats records collected during the hyperparameter search.
# NOTE: this rebinds `path`, which held the model checkpoint file until now.
path = 'class/results/'
save_results_to_file(results, path + 'embedding_ft_classify.pk')