In [1]:
from argparse import Namespace
# Central run configuration; the later cells read their settings from `args`.
args = Namespace(**{
  # --- data and path information ---
  "dataset_path": "/content/drive/MyDrive/thesis/dataset/",
  "model_state_file": "model_subject_random_numbers.pth",
  "save_dir": "/content/drive/MyDrive/thesis/models/",  # models are saved here
  "no_classes": 2285,  # subject 2285, chapter 389, volume 47
  "class_name": 'subject',
  # --- training hyper-parameters ---
  "seed": 1338,
  "num_epochs": 15,
  "early_stopping_criteria": 2,
  "learning_rate": 0.00002,
  "batch_size": 8,
  # --- runtime options ---
  "cuda": True,
  "reload_from_files": False,  # continue training from a checkpoint or evaluate a model
  "reload_name": "model_chapter_8.pth",
  "expand_filepaths_to_save_dir": True,
  "run_training": False,  # when False only the test set is run
})

### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths configured
# in `args` live under that mount point. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# from collections import Counter
import string
import re
import random

import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

!pip install torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
print('Loading torchmetrics lib...')
# # https://torchmetrics.readthedocs.io/en/latest/index.html
# !pip install torchmetrics
# from torchmetrics import MetricCollection, Accuracy, Precision, Recall, F1
from sklearn.metrics import f1_score, recall_score, precision_score

# reading json files
import json
from os import listdir
from os.path import isfile, join
import os

# huggingface lib bert
print('Loading transformers lib...')
!pip install transformers
from transformers import AutoTokenizer, AutoModel

Collecting torch
  Downloading torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 52 kB/s 
[?25hCollecting typing-extensions
  Downloading typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, torch
Successfully installed torch-1.9.0 typing-extensions-3.10.0.0
Loading torchmetrics lib...
Loading transformers lib...
Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 237 kB/s 
[?25hCollecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 2.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |███████████████████████████████

In [3]:
# Use the GPU only when it was requested AND a CUDA device is actually available
args.cuda = False if not torch.cuda.is_available() else args.cuda
args.device = torch.device("cuda") if args.cuda else torch.device("cpu")
print("Using CUDA: {}".format(args.cuda))
if args.cuda:
  # report which GPU (device index 0) is going to be used
  print("GPU: {}".format(torch.cuda.get_device_name(0)))

Using CUDA: False


### Utils

In [None]:
def set_seed_everywhere(seed, cuda):
  """Seed every random number generator used here for reproducible results.

  Args:
      seed (int): value given to the NumPy, PyTorch and `random` generators
      cuda (bool): when True, all CUDA devices are explicitly seeded as well
  """
  np.random.seed(seed)
  torch.manual_seed(seed)
  random.seed(seed)
  # NOTE: the Hugging Face initialisation seed may need to be set here as well
  if cuda:
    # seeded once, behind the flag (the original also called this unconditionally)
    torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
  """Create `dirpath` (and any missing parent directories) if it does not exist.

  Args:
      dirpath (str): directory to create
  Raises:
      FileExistsError: if `dirpath` exists but is not a directory
  """
  # exist_ok avoids the check-then-create race of a separate os.path.exists() test
  os.makedirs(dirpath, exist_ok=True)

### The Vectorizer

In [None]:
class LegalVectorizer(object):
  """Wraps the Greek-BERT tokenizer: encodes text into BERT inputs and decodes ids back to tokens."""
  def __init__(self):
    print('Loading BERT tokenizer...')
    self.tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1", 
                                                    model_max_length=512, use_fast=True)

  def vectorize(self, text):
    """Tokenize a batch of texts.

    Args:
        text (list of str): the texts to encode
    Returns:
        dictionary: "vector" is a tensor with the encoded texts, padded to the longest
                    text of the batch, ready for import to BERT
                    "mask" is a tensor with the matching attention masks
    """
    encoded_dict = self.tokenizer(
                    text,
                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    padding = True, # pad to longest in batch
                    truncation = True, # truncate to the tokenizer's max length (512)
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',     # Return pytorch tensors.
                )
    return {"vector" : encoded_dict['input_ids'],
            "mask" : encoded_dict['attention_mask']}

  def get_pad_token(self):
    """Return the id of the tokenizer's padding token."""
    return self.tokenizer.pad_token_id

  def get_pad_tocken(self):
    """Misspelled alias of `get_pad_token`, kept so existing callers keep working."""
    return self.get_pad_token()

  def to_words(self, vector, remove_pads=True):
    """Decode encoded texts back to tokens.

    Args:
        vector (tensor): batch of encoded texts (one row of token ids per text)
        remove_pads (bool): drop the padding tokens before decoding
    Returns:
        list of lists of tokens, one inner list per row of `vector`
    """
    pad_id = self.tokenizer.pad_token_id
    words = []
    for row in vector:
      ids = torch.as_tensor(row)
      if remove_pads and pad_id is not None:
        # filter on the tokenizer's real pad id instead of assuming it is 0
        ids = ids[ids != pad_id]
      # hand the tokenizer a flat list of python ints
      words.append(self.tokenizer.convert_ids_to_tokens(ids.tolist()))
    return words
      

### The Dataset

In [None]:
class LegalDataset(Dataset):
  """Train/validation/test splits of the legal corpus, served as raw (not yet tokenized) text.

  The constructor loads the three pickled dataframes, encodes the class column
  (`args.class_name`) as integer ids, merges 'header' and 'articles' into one text
  column with every number replaced by random digits, and counts the training
  samples of each class (`class_counts`).
  """
  # Expected number of rows per split; a mismatch usually means Google Drive
  # created duplicate files on upload (ex. "123 (1).json").
  EXPECTED_SIZES = {'val': 9511, 'train': 28536, 'test': 9516}
  # Metadata columns that are dropped after loading.
  UNUSED_COLUMNS = ['title', 'type', 'year', 'law_id', 'leg_uri']

  def __init__(self):
    # NOTE(review): read_pickle can execute code embedded in the file; only load trusted dataset files.
    print("loading validation set...")
    self.val_df = pd.read_pickle(args.dataset_path + "dev.pkl")
    self.validation_size = len(self.val_df)
    print("loading training set...")
    self.train_df = pd.read_pickle(args.dataset_path + "train.pkl")
    self.train_size = len(self.train_df)
    print("loading test set...")
    self.test_df = pd.read_pickle(args.dataset_path + "test.pkl")
    self.test_size = len(self.test_df)

    actual_sizes = {'val': self.validation_size,
                    'train': self.train_size,
                    'test': self.test_size}
    if actual_sizes != self.EXPECTED_SIZES:
      # raise instead of exit(): exit() would shut the notebook kernel down
      raise ValueError("!! ERROR dataset size !! expected {}, got {}".format(
          self.EXPECTED_SIZES, actual_sizes))

    print("Processing dataset...")
    # replace class names with 0...n numbers (ids follow first appearance in val, train, test)
    class_namess = pd.concat([ self.val_df[args.class_name],
                             self.train_df[args.class_name],
                             self.test_df[args.class_name] ]).unique()
    self.class_names = dict(zip(class_namess, range(len(class_namess))))

    # The split order (val, train, test) is kept so that the random digits drawn
    # for a given seed stay the same as before.
    self.val_df = self._prepare_split(self.val_df)
    self.train_df = self._prepare_split(self.train_df)
    self.test_df = self._prepare_split(self.test_df)

    self._vectorizer = LegalVectorizer()
    self._lookup_dict = {'train': (self.train_df, self.train_size),
                          'val': (self.val_df, self.validation_size),
                          'test': (self.test_df, self.test_size)}

    self.set_split('train')

    print("Calculating frequences...")
    # class_counts[c] = number of training samples of class id c (0 when the class is absent)
    self.class_counts = (self.train_df[args.class_name]
                         .value_counts()
                         .reindex(range(args.no_classes), fill_value=0)
                         .tolist())
    # self.class_weights = 10000.0 / torch.tensor(self.class_counts, dtype=torch.float32) 

  @staticmethod
  def _randomize_digits(text):
    """Replace every run of digits in `text` with random digits of the same length.

    The first digit of each replacement is never 0.
    """
    def random_digits(match):
      # draw until the leading digit is non-zero
      while True:
        start = random.choice(string.digits)
        if int(start) != 0:
          break
      return start + ''.join(random.choice(string.digits)
                             for _ in range(len(match.group()) - 1))
    return re.sub(r"\d+", random_digits, text)

  def _prepare_split(self, df):
    """Return a processed copy of one split: encoded labels, unused columns dropped,
    and 'header' replaced by 'header' + ' ' + 'articles' with randomized digits."""
    prepared = df.drop(columns=self.UNUSED_COLUMNS)
    prepared[args.class_name] = prepared[args.class_name].map(self.class_names)
    prepared['header'] = [self._randomize_digits(header + ' ' + articles)
                          for header, articles in zip(prepared['header'], prepared['articles'])]
    return prepared

  def get_vectorizer(self):
    """ returns the vectorizer """
    return self._vectorizer

  def set_split(self, split):
    """ selects the active split ('train', 'val' or 'test') using _lookup_dict """
    self._target_df, self._target_size = self._lookup_dict[split]

  def __len__(self):
    """ number of samples in the active split """
    return self._target_size

  def __getitem__(self, index):
    """the primary entry point method for PyTorch datasets
    Args:
        index (int): the index to the data point 
    Returns:
        a dictionary holding the data point's 'target' (class id) and 'text'.
        text is NOT vectorized yet.
    """
    row = self._target_df.iloc[index]
    return {'target': row[args.class_name],
            'text' : row['header']}

  def get_num_batches(self, batch_size):
    """Given a batch size, return the number of full batches in the active split
    Args:
      batch_size (int)
    Returns:
      number of full batches in the dataset (a trailing partial batch is not counted)
    """
    return len(self) // batch_size

### Dataloader

In [None]:
# Smart batching could save some training time. Not done yet; it may affect accuracy.
# http://mccormickml.com/2020/07/29/smart-batching-tutorial
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
  """Yield tokenized batches of `dataset`, with every tensor moved to `device`.

  Args:
      dataset: dataset whose items are dicts with a 'text' entry and which
          exposes `get_vectorizer()`
      batch_size (int): number of samples per batch
      shuffle (bool): forwarded to the DataLoader
      drop_last (bool): forwarded to the DataLoader
      device: where the yielded tensors are placed
  Yields:
      dict with the batch's remaining entries plus 'vector' and 'mask'
      (the raw 'text' entry is removed)
  """
  loader = DataLoader(dataset=dataset, batch_size=batch_size,
                      shuffle=shuffle, drop_last=drop_last)

  for batch in loader:
    # tokenize the raw texts of this batch and swap them for the encoded tensors
    encoded = dataset.get_vectorizer().vectorize(batch.pop('text'))
    batch['vector'] = encoded['vector']
    batch['mask'] = encoded['mask']

    yield {key: value.to(device) for key, value in batch.items()}

### The Model

In [None]:
class LegalClassifier(nn.Module):
  """ Greek-BERT followed by dropout and one linear layer that maps the
      model's pooled output to class scores """
  def __init__(self, no_classes):
    """
    Args:
        no_classes (int): number of output units of the linear layer
    """
    super().__init__()
    print("Loading greek-Bert...")
    self.bert = AutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
    self.dropout = nn.Dropout(0.1)
    self.fc = nn.Linear(768,no_classes)
    # BERT is fine-tuned together with the head; to freeze it, set
    # requires_grad = False on every parameter of self.bert.

  def forward(self, input, mask, apply_softmax=False):
    """The forward pass of the classifier
    Args:
        input (torch.Tensor): token ids passed to BERT as `input_ids`
        mask (torch.Tensor): the matching attention masks
        apply_softmax (bool): when True, softmax is applied over dim 1 of the output
    Returns:
        the resulting tensor. tensor.shape should be (batch, output_dim)
    """
    bert_out = self.bert(input_ids=input, attention_mask=mask, 
                         output_attentions=False, output_hidden_states=False)
    # pooler_output: the pooled representation corresponding to the CLS token
    scores = self.fc(self.dropout(bert_out.pooler_output))
    return F.softmax(scores, dim=1) if apply_softmax else scores

### Helper functions

In [None]:
def make_train_state(args):
  """Build the initial training-state dictionary.

  :param args: main arguments; `learning_rate` and `model_state_file` are read
  :returns: a fresh dict with early-stopping bookkeeping, per-epoch loss/accuracy
      histories (empty lists), test placeholders (-1) and the checkpoint filename
  """
  state = dict(stop_early=False,
               early_stopping_step=0,
               early_stopping_best_val=1e8,
               learning_rate=args.learning_rate,
               epoch_index=0)
  # one independent history list per tracked metric
  for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
    state[history_key] = []
  state.update(test_loss=-1, test_acc=-1, model_filename=args.model_state_file)
  return state

def update_train_state(args, model, train_state):
  """Handle the training state updates.

  Components:
    - Early Stopping: Prevent overfitting.
    - Model Checkpoint: Model is saved if the validation loss is the best so far

  The best validation loss is recorded in train_state['early_stopping_best_val'];
  without that bookkeeping the early-stopping counter can never increase and the
  checkpoint is overwritten on every epoch.

  :param args: main arguments (`early_stopping_criteria` is read)
  :param model: model to train
  :param train_state: a dictionary representing the training state values
  :returns:
      the updated train_state (the same dictionary, modified in place)
  """
  # Save one model at least
  if train_state['epoch_index'] == 0:
    torch.save(model.state_dict(), train_state['model_filename'])
    # the checkpoint on disk now corresponds to this epoch's validation loss
    if train_state['val_loss']:
      train_state['early_stopping_best_val'] = train_state['val_loss'][-1]
    train_state['stop_early'] = False

  # Save model if performance improved
  elif train_state['epoch_index'] >= 1:
    loss_t = train_state['val_loss'][-1]

    # No improvement over the best validation loss seen so far
    if loss_t >= train_state['early_stopping_best_val']:
      train_state['early_stopping_step'] += 1
    # New best validation loss
    else:
      torch.save(model.state_dict(), train_state['model_filename'])
      train_state['early_stopping_best_val'] = loss_t
      # Reset early stopping step
      train_state['early_stopping_step'] = 0

    # Stop early ?
    train_state['stop_early'] = \
        train_state['early_stopping_step'] >= args.early_stopping_criteria
  return train_state

def compute_accuracy(y_pred, y_target):
  """Return the percentage (0-100) of rows of `y_pred` whose highest-scoring
  index along dim 1 equals the matching entry of `y_target`."""
  predicted = y_pred.max(dim=1)[1]
  hits = (predicted == y_target).sum().item()
  total = len(predicted)
  return hits / total * 100

### Initializations

In [None]:
# Prefix the checkpoint file names with the save directory
if args.expand_filepaths_to_save_dir:
  args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
  args.reload_name = os.path.join(args.save_dir, args.reload_name)
  print("Expanded filepaths: ")
  print("\t{}".format(args.model_state_file))
  print("\t{}".format(args.reload_name))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
# handle_dirs(args.save_dir)

dataset = LegalDataset()
vectorizer = dataset.get_vectorizer()
classifier = LegalClassifier(no_classes=args.no_classes)

if args.reload_from_files:
    # training from a checkpoint
    print("Reloading previous model!")
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
    classifier.load_state_dict(torch.load(args.reload_name, map_location=args.device))
else:
    print("Creating fresh!")

Expanded filepaths: 
	/content/drive/MyDrive/thesis/models/model_subject_random_numbers.pth
	/content/drive/MyDrive/thesis/models/model_chapter_8.pth
loading validation set...
loading training set...
loading test set...
Processing dataset...
Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=529930.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…


Calculating frequences...
Loading greek-Bert...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=454248854.0, style=ProgressStyle(descri…


Creating fresh!


### Training Loop

In [None]:
# Fine-tune the classifier; the whole cell is skipped unless args.run_training is set.
if args.run_training:
  classifier = classifier.to(args.device)
  # dataset.class_weights = dataset.class_weights.to(args.device)

  loss_func = nn.CrossEntropyLoss()#weight=dataset.class_weights)
  optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
  # multiplies the learning rate by 0.5 when the validation loss stops decreasing
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, 
                                                  mode='min', factor=0.5, patience=1)

  train_state = make_train_state(args)

  epoch_bar = tqdm(desc='training routine', 
                            total=args.num_epochs,
                            position=0)

  # the progress-bar totals depend on the active split, so select it first
  dataset.set_split('train')
  train_bar = tqdm(desc='split=train',
                            total=dataset.get_num_batches(args.batch_size), 
                            position=1, 
                            leave=True)
  dataset.set_split('val')
  val_bar = tqdm(desc='split=val',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)

  try:
    for epoch_index in range(args.num_epochs):
      train_state['epoch_index'] = epoch_index

      # Iterate over training dataset

      # setup: batch generator, set loss and acc to 0, set train mode on

      dataset.set_split('train')
      batch_generator = generate_batches(dataset, 
                                          batch_size=args.batch_size, 
                                          device=args.device)
      running_loss = 0.0
      running_acc = 0.0
      classifier.train()

      for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        # (the tokenizer output is already (batch, seq); squeezing it would drop
        #  an axis whenever the batch or the sequence has length 1)
        y_pred = classifier(batch_dict['vector'], batch_dict['mask'])
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['target'])
        loss_t = loss.item()
        # incremental mean of the batch losses seen so far in this epoch
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # update bar
        train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                        epoch=epoch_index)
        train_bar.update()

      train_state['train_loss'].append(running_loss)
      train_state['train_acc'].append(running_acc)

      # Iterate over val dataset

      # setup: batch generator, set loss and acc to 0; set eval mode on
      dataset.set_split('val')
      batch_generator = generate_batches(dataset, 
                                          batch_size=args.batch_size, 
                                          device=args.device)
      running_loss = 0.
      running_acc = 0.
      classifier.eval()
      with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
          # compute the output
          y_pred = classifier(batch_dict['vector'], batch_dict['mask'])

          # compute the loss
          loss = loss_func(y_pred, batch_dict['target'])
          loss_t = loss.item()
          running_loss += (loss_t - running_loss) / (batch_index + 1)

          # compute the accuracy
          acc_t = compute_accuracy(y_pred, batch_dict['target'])
          running_acc += (acc_t - running_acc) / (batch_index + 1)
          val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                          epoch=epoch_index)
          
          val_bar.update()

      train_state['val_loss'].append(running_loss)
      train_state['val_acc'].append(running_acc)

      # checkpointing and early-stopping bookkeeping
      train_state = update_train_state(args=args, model=classifier,
                                        train_state=train_state)

      scheduler.step(train_state['val_loss'][-1])

      if train_state['stop_early']:
        break

      # rewind the per-split bars for the next epoch
      train_bar.n = 0
      val_bar.n = 0
      epoch_bar.update()
  except KeyboardInterrupt:
      # allow stopping the run manually without a traceback
      print("Exiting loop")

### Testing

In [None]:
# run test set and save results

preds = []
correct = []

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
classifier.load_state_dict(torch.load(args.model_state_file, map_location=args.device))
classifier = classifier.to(args.device)
# dataset.class_weights = dataset.class_weights.to(args.device)

dataset.set_split('test')
# Evaluate every test sample exactly once: no shuffling, and the final partial
# batch is kept (the defaults would silently drop it).
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   shuffle=False,
                                   drop_last=False,
                                   device=args.device)

test_bar = tqdm(desc='split=test',
                          # ceil division: the trailing partial batch counts too
                          total=-(-len(dataset) // args.batch_size), 
                          position=1, 
                          leave=True)

classifier.eval()
with torch.no_grad():
  for batch_index, batch_dict in enumerate(batch_generator):
    # compute output (inputs are already (batch, seq); squeezing them would
    # break a trailing batch that holds a single sample)
    y_pred = classifier(batch_dict['vector'], batch_dict['mask'], apply_softmax=True)
    # save output: one probability tensor and one target per sample, kept on the
    # cpu so the accumulated results do not occupy GPU memory
    preds += y_pred.cpu()
    correct += batch_dict['target'].cpu()
    test_bar.update()

predictions = {'predictions' : preds,
              'correct' : correct}

HBox(children=(FloatProgress(value=0.0, description='split=test', max=1189.0, style=ProgressStyle(description_…

In [None]:
# Turn the per-sample probability tensors into predicted class ids, then report
# micro-averaged f1 / recall / precision for all, frequent and few-shot classes.

# A class is "few-shot" with 1..FEW_SHOT_MAX_TRAIN_SAMPLES training samples and
# "frequent" with more. The original thresholds (< 10 / > 10) left classes with
# exactly 10 samples in neither group.
# NOTE(review): confirm the boundary value belongs to the few-shot group.
FEW_SHOT_MAX_TRAIN_SAMPLES = 10

# The guard keeps the cell re-runnable after the tensors were already converted.
if predictions['predictions'] and torch.is_tensor(predictions['predictions'][0]):
  # one vectorized argmax instead of a python max() over every probability
  predictions['predictions'] = (torch.stack(list(predictions['predictions']))
                                .argmax(dim=1).cpu().tolist())
predictions['correct'] = torch.tensor(predictions['correct'], device = 'cpu').tolist()

frequent = {'predictions' : [],
            'correct' : []}
few_shot = {'predictions' : [],
            'correct' : []}

for pred, target in zip(predictions['predictions'], predictions['correct']):
  train_count = dataset.class_counts[target]
  if 1 <= train_count <= FEW_SHOT_MAX_TRAIN_SAMPLES:
    few_shot['predictions'].append(pred)
    few_shot['correct'].append(target)
  elif train_count > FEW_SHOT_MAX_TRAIN_SAMPLES:
    frequent['predictions'].append(pred)
    frequent['correct'].append(target)
  # targets whose class has no training sample are only part of the 'All' group

for group, dict_ in zip(['All     \t','Frequent\t','Fewshot \t'], [predictions, frequent, few_shot]):
  if not dict_['correct']:
    # the sklearn metrics cannot be computed without samples
    print(group+'\t no samples')
    continue
  if 'All' in group:
    labels = sorted(set(dict_['correct']) | set(dict_['predictions']))
  else:
    # only score the classes that belong to this group
    labels = sorted(set(dict_['correct']))
  f1 = round(f1_score(dict_['correct'], dict_['predictions'], average='micro', labels=labels)*100,2)
  rec = round(recall_score(dict_['correct'], dict_['predictions'], average='micro', labels=labels)*100,2)
  prec = round(precision_score(dict_['correct'], dict_['predictions'], average='micro', labels=labels)*100,2)
  print(group+'\t f1: '+str(f1)+'\t recall: '+str(rec)+'\t precision: '+str(prec))