In [1]:
from argparse import Namespace
# Notebook-wide settings; later cells read these values and attach more (e.g. the device).
args = Namespace(**{
    # --- data and path information ---
    "dataset_path": "/content/drive/MyDrive/thesis/dataset/",  # folder with the train/dev/test pickles
    "model_state_file": "model2.pth",                          # checkpoint file name
    "save_dir": "/content/drive/MyDrive/thesis/",              # prefix joined onto model_state_file
    # --- training hyper parameters ---
    "seed": 1337,
    "num_epochs": 1,
    "early_stopping_criteria": 100,  # epochs without improvement before stop_early is set
    "learning_rate": 1e-5,
    "batch_size": 8,
    # --- runtime options ---
    "cuda": True,
    "reload_from_files": False,
    "expand_filepaths_to_save_dir": True,
})

###tmp

In [2]:
# Mount Google Drive at /content/drive so the dataset and checkpoint paths
# configured in `args` resolve. Requires a Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# from collections import Counter

# import string

import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# https://torchmetrics.readthedocs.io/en/latest/index.html
!pip install torchmetrics
from torchmetrics import MetricCollection, Accuracy, Precision, Recall, F1

# reading json files
import json
from os import listdir
from os.path import isfile, join
import os

# huggingface lib bert
print('Loading transformers lib...')
!pip install transformers
from transformers import AutoTokenizer, AutoModel

Loading transformers lib...


In [4]:
# Resolve the compute device: the GPU is used only when it was requested
# in `args` and the runtime actually exposes one; otherwise fall back to CPU.
if args.cuda and torch.cuda.is_available():
  args.device = torch.device("cuda")
else:
  args.cuda = False
  args.device = torch.device("cpu")

print("Using CUDA: {}".format(args.cuda))
if args.cuda:
  print("GPU: {}".format(torch.cuda.get_device_name(0)))

Using CUDA: True
GPU: Tesla T4


### Transform dataset
Transform the JSON files into DataFrames and save them as pickles.

In [5]:
# original_dataset_path = "/content/drive/MyDrive/RAPTARCHIS47k/"

# def transform_save_data(path):
#   """
#     Transforms the original JSON dataset into a DataFrame and saves it as a pickle.
#   """
#   df = pd.DataFrame(columns = column_names)
  
#   path1 = join(original_dataset_path,path)
#   bar = tqdm(desc=path, total=len(listdir(path1)), 
#               position=1, leave=True)
  
#   for f in listdir(path1):
#     path2 = join(path1, f)
#     if isfile(path2):
#       with open(path2) as json_file:
#         data = json.load(json_file)
#         tmp = pd.Series([data['title'], data['type'], data['year'], 
#                     data['law_id'] if data['law_id'] is not None else "None",
#                     data['leg_uri'] if data['leg_uri'] is not None else "None",
#                   data['volume'], data['chapter'], data['subject'], data['header'],
#                   data['header']], index = column_names)
#         df = df.append(tmp, ignore_index=True)
#         bar.update()
#   df.to_pickle(join(args.save_dir,'dataset/',path+'.pkl'))


# column_names=['title', 'type', 'year', 'law_id', 'leg_uri', 
#               'volume', 'chapter', 'subject', 'header', 'articles']
# train_df = pd.DataFrame(columns = column_names)
# val_df = pd.DataFrame(columns = column_names)
# test_df = pd.DataFrame(columns = column_names)

# transform_save_data('test')
# transform_save_data('dev')
# transform_save_data('train')

### Utils

In [6]:
def set_seed_everywhere(seed, cuda):
  """Seed every random number generator this notebook relies on.

  Args:
      seed (int): value applied to Python's `random`, NumPy and PyTorch.
      cuda (bool): kept for backward compatibility. The CUDA generators are
          seeded regardless of this flag, which is what the function has
          always done in effect.
  """
  # Local import: the notebook's import cell does not bring `random` into scope.
  import random

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seeds all CUDA devices; PyTorch documents this call as safe (silently
  # ignored) when CUDA is not available, so it needs no guard.
  torch.cuda.manual_seed_all(seed)
  # NOTE(review): Hugging Face components may keep their own RNG state -
  # confirm whether transformers needs separate seeding for full determinism.

def handle_dirs(dirpath):
  """Create `dirpath`, including missing parent directories, if needed.

  Args:
      dirpath (str): directory that must exist after the call.
  Raises:
      FileExistsError: if `dirpath` exists but is not a directory.
  """
  # exist_ok=True avoids the check-then-create race of testing
  # os.path.exists() first and makes repeated cell runs harmless.
  os.makedirs(dirpath, exist_ok=True)

### The Vectorizer

In [7]:
class LegalVectorizer(object):
  """Wraps the Greek-BERT tokenizer to turn raw text into BERT-ready tensors."""
  def __init__(self, max_len=0):
    """
    Args:
        max_len (int): length every encoded sequence is padded/truncated to.
            0 (the default) defers to the tokenizer's own maximum, which is
            configured to 512 below. Values above 512 are not checked here.
    """
    print('Loading BERT tokenizer...')
    self.tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1", model_max_length=512)
    self.max_len = max_len

  def vectorize(self, text):
    """
    Args:
        text (str or list of str): text to encode
    Returns:
        dictionary: "vector" is a tensor of token ids padded to the target length, ready for import to BERT
                    "mask" is the matching attention-mask tensor, ready for import to BERT
    """
    encoded_dict = self.tokenizer(
                    text,                      # Sentence(s) to encode.
                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    truncation = True,         # Cut sequences longer than the target length
                    padding = 'max_length',    # Pad every sequence up to the target length
                    # A falsy max_len (0/None) becomes None, which makes the
                    # tokenizer fall back to its model_max_length (512).
                    max_length = self.max_len or None,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',     # Return pytorch tensors.
                )

    return {"vector" : encoded_dict['input_ids'],
            "mask" : encoded_dict['attention_mask']}

  def to_words(self, vector, remove_pads=True):
    """
    Args:
        vector (tensor): batch of encoded sequences to decode, one per row
        remove_pads (bool): drop padding tokens before decoding
    Returns:
        list of lists of tokens corresponding to the ids in each row
    """
    if remove_pads:
      # Filter on the tokenizer's real pad id rather than assuming it is 0.
      pad_id = self.tokenizer.pad_token_id
      return [self.tokenizer.convert_ids_to_tokens(v[v != pad_id].tolist()) for v in vector]
    return [self.tokenizer.convert_ids_to_tokens(v) for v in vector]
      

In [8]:
# vec=LegalVectorizer(20)
# batch_sentences = ["δεν ξερω αμα δουλευει",
#                    "και"]
# a=vec.vectorize(batch_sentences)
# print(a)
# print(vec.to_words(a['vector'],False))

###The dataset

In [9]:
class LegalDataset(Dataset):
  """PyTorch dataset over the pickled train/dev/test splits, labelled by 'volume'.

  One instance holds all three splits; `set_split` chooses which one
  `__len__` and `__getitem__` currently serve.
  """

  # Row counts the three pickles are expected to contain.
  EXPECTED_SIZES = {'train': 28536, 'val': 9511, 'test': 9516}

  def __init__(self, dataset_path=None):
    """
    Args:
        dataset_path (str, optional): folder prefix containing dev.pkl,
            train.pkl and test.pkl. Defaults to the notebook-level
            `args.dataset_path`.
    Raises:
        ValueError: if a split has an unexpected number of rows, or if the
            train/test splits contain a volume that never occurs in dev.
    """
    if dataset_path is None:
      dataset_path = args.dataset_path

    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    print("loading validation set...")
    self.val_df = pd.read_pickle(dataset_path + "dev.pkl")
    self.validation_size = len(self.val_df)

    # The label -> index mapping follows the order in which volumes first
    # appear in the dev split. Saved checkpoints depend on this order.
    class_names = self.val_df['volume'].unique()
    self.class_names = dict(zip(class_names, range(len(class_names))))

    print("loading training set...")
    self.train_df = pd.read_pickle(dataset_path + "train.pkl")
    self.train_size = len(self.train_df)
    print("loading test set...")
    self.test_df = pd.read_pickle(dataset_path + "test.pkl")
    self.test_size = len(self.test_df)

    # Raise so the cell stops, instead of printing and carrying on.
    actual_sizes = {'train': self.train_size,
                    'val': self.validation_size,
                    'test': self.test_size}
    if actual_sizes != self.EXPECTED_SIZES:
      raise ValueError("!! ERROR dataset size !! expected {} but loaded {}".format(
          self.EXPECTED_SIZES, actual_sizes))

    self.val_df['volume'] = self._encode_labels(self.val_df['volume'], self.class_names)
    self.train_df['volume'] = self._encode_labels(self.train_df['volume'], self.class_names)
    self.test_df['volume'] = self._encode_labels(self.test_df['volume'], self.class_names)

    self._vectorizer = LegalVectorizer()
    self._lookup_dict = {'train': (self.train_df, self.train_size),
                         'val': (self.val_df, self.validation_size),
                         'test': (self.test_df, self.test_size)}

    self.set_split('train')

  @staticmethod
  def _encode_labels(labels, mapping):
    """Map a Series of volume names to integer class ids.

    Raises:
        ValueError: if any label is missing from `mapping`; such a label
            would otherwise reach the loss function as a raw string.
    """
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
      unknown = sorted(set(labels[unmapped].astype(str)))
      raise ValueError("volumes without a class index: {}".format(unknown))
    return encoded.astype('int64')

  def get_vectorizer(self):
    """ returns the vectorizer """
    return self._vectorizer

  def set_split(self, split):
    """ selects the active split ('train', 'val' or 'test') using _lookup_dict """
    self._target_df, self._target_size = self._lookup_dict[split]

  def __len__(self):
    """ number of rows in the active split """
    return self._target_size

  def __getitem__(self, index):
    """the primary entry point method for PyTorch datasets
    Args:
        index (int): the index to the data point within the active split
    Returns:
        a dictionary holding the data point's:
            'volume': integer class id
            'vector': token ids produced by the vectorizer
            'mask': attention mask produced by the vectorizer
    """
    row = self._target_df.iloc[index]
    # Assumes row['articles'] is a single string, so each tensor comes back
    # with a leading dimension of 1 - TODO confirm against the pickles.
    encoded = self._vectorizer.vectorize(row['articles'])

    return {'volume': row['volume'],
            'vector': encoded['vector'],
            'mask': encoded['mask']}

  def get_num_batches(self, batch_size):
    """Given a batch size, return the number of full batches in the active split
    Args:
        batch_size (int)
    Returns:
        number of full batches (a trailing partial batch is not counted)
    """
    return len(self) // batch_size

In [10]:
# val_df = pd.read_pickle(args.dataset_path + "dev.pkl")
# vec=LegalVectorizer(20)
# a=vec.vectorize(val_df['articles'][0])
# print(a)
# print(vec.to_words(a['vector'],False))

###Dataloader

In [11]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator wrapping a PyTorch DataLoader: yields one dictionary per
      batch, with every tensor moved to the requested device.
    """
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=shuffle,
                        drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

###The Model

In [12]:
class LegalClassifier(nn.Module):
  """ greek-Bert model with dropout and an extra linear layer, which takes
      the CLS token representation as input, for classification """
  def __init__(self, no_classes):
    """
    Args:
        no_classes (int): number of output classes (width of the linear layer)
    """
    super(LegalClassifier, self).__init__()
    print("Loading greek-Bert...")
    self.bert = AutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
    self.dropout = nn.Dropout(0.1)
    # Size the head from the encoder's config rather than a hard-coded 768,
    # so the layer always matches the width of the loaded model.
    self.fc = nn.Linear(self.bert.config.hidden_size, no_classes)
    # To freeze the encoder and train only the head:
    # for param in self.bert.parameters():
    #   param.requires_grad = False

  def forward(self, input, mask, apply_softmax=False):
    """The forward pass of the classifier
    Args:
        input (torch.Tensor): token ids; shape should be (batch, seq_len)
        mask (torch.Tensor): the corresponding attention masks for BERT
        apply_softmax (bool): return probabilities instead of raw logits.
            Leave False when the result feeds nn.CrossEntropyLoss, which
            applies log-softmax itself.
    Returns:
        the resulting tensor. tensor.shape should be (batch, no_classes)
    """
    output = self.bert(input_ids=input, attention_mask=mask,
                       output_attentions=False, output_hidden_states=False)
    # output[0] holds the per-token hidden states; position 0 is the CLS token
    x = output[0][:,0,:]
    x = self.dropout(x)
    x = self.fc(x)
    if apply_softmax:
        x = F.softmax(x, dim=1)
    return x

###helper functions

In [13]:
def make_train_state(args):
  """Build the dictionary that tracks training progress.

  The learning rate and checkpoint file name are copied from `args`; the
  loss/accuracy histories start empty and the test metrics start at -1.
  """
  return dict(
      stop_early=False,
      early_stopping_step=0,
      early_stopping_best_val=1e8,
      learning_rate=args.learning_rate,
      epoch_index=0,
      train_loss=[],
      train_acc=[],
      val_loss=[],
      val_acc=[],
      test_loss=-1,
      test_acc=-1,
      model_filename=args.model_state_file,
  )

def update_train_state(args, model, train_state):
  """Handle the training state updates.

  Components:
    - Early Stopping: Prevent overfitting.
    - Model Checkpoint: Model is saved if the validation loss improved

  :param args: main arguments (reads `early_stopping_criteria`)
  :param model: model to train
  :param train_state: a dictionary representing the training state values
  :returns:
      the same train_state dictionary, updated in place
  """
  epoch_index = train_state['epoch_index']

  # Save one model at least, and remember its loss as the value to beat
  if epoch_index == 0:
    torch.save(model.state_dict(), train_state['model_filename'])
    if train_state['val_loss']:
      train_state['early_stopping_best_val'] = train_state['val_loss'][-1]
    train_state['stop_early'] = False

  elif epoch_index >= 1:
    loss_t = train_state['val_loss'][-1]

    if loss_t >= train_state['early_stopping_best_val']:
      # No improvement: count this epoch towards the patience budget
      train_state['early_stopping_step'] += 1
    else:
      # New best: checkpoint it and record the loss. Without recording it,
      # every later epoch would compare against the initial sentinel, always
      # overwrite the checkpoint and never trigger early stopping.
      torch.save(model.state_dict(), train_state['model_filename'])
      train_state['early_stopping_best_val'] = loss_t
      train_state['early_stopping_step'] = 0

    # Stop early ?
    train_state['stop_early'] = \
        train_state['early_stopping_step'] >= args.early_stopping_criteria

  return train_state

def compute_accuracy(y_pred, y_target):
  """Percentage of rows in `y_pred` whose highest-scoring class equals `y_target`."""
  predicted_classes = torch.max(y_pred, dim=1).indices
  hits = (predicted_classes == y_target).sum().item()
  return hits / len(predicted_classes) * 100

###Initializations

In [14]:
# Resolve the checkpoint path inside save_dir. Re-running the cell is harmless:
# joining onto an already-absolute path returns that path unchanged.
if args.expand_filepaths_to_save_dir:
  args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
  print("Expanded filepaths: ")
  print("\t{}".format(args.model_state_file))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# The dataset is rebuilt from the pickles in both modes: this notebook has no
# serialized dataset/vectorizer to reload, so the flag only changes the message.
if args.reload_from_files:
    print("Reloading!")
else:
    print("Creating fresh!")
dataset = LegalDataset()

vectorizer = dataset.get_vectorizer()
classifier = LegalClassifier(no_classes=47)

# Resume from the checkpoint when one exists; otherwise keep the pretrained
# encoder with a freshly initialised head so that a first run can start.
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
if os.path.isfile(args.model_state_file):
  classifier.load_state_dict(torch.load(args.model_state_file,
                                        map_location=args.device))
else:
  print("No checkpoint found at {}; starting from the pretrained weights".format(
      args.model_state_file))

Expanded filepaths: 
	/content/drive/MyDrive/thesis/model2.pth
Creating fresh!
loading validation set...
loading training set...
loading test set...
Loading BERT tokenizer...
Loading greek-Bert...


<All keys matched successfully>

###Training Loop

In [15]:
# --- model, loss, optimizer and learning-rate schedule ---
classifier = classifier.to(args.device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# ReduceLROnPlateau halves the learning rate once the monitored value
# (the validation loss passed to scheduler.step) stops decreasing.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

train_state = make_train_state(args)

# --- progress bars: one for epochs, one per split ---
epoch_bar = tqdm(desc='training routine', total=args.num_epochs, position=0)

# get_num_batches reports the active split, so select it before sizing each bar.
dataset.set_split('train')
train_bar = tqdm(
    desc='split=train',
    total=dataset.get_num_batches(args.batch_size),
    position=1,
    leave=True,
)

dataset.set_split('val')
val_bar = tqdm(
    desc='split=val',
    total=dataset.get_num_batches(args.batch_size),
    position=1,
    leave=True,
)

# metric_collection = MetricCollection([
#     Accuracy(),
#     Precision(num_classes=47, average='micro'),
#     Recall(num_classes=47, average='micro'),
#     F1(num_classes=47, average='micro')
# ]).to(args.device)

# One epoch = a full training pass followed by a full validation pass.
# Interrupting the kernel leaves the loop cleanly via KeyboardInterrupt.
try:
  for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # ---------------- training pass ----------------
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
      # step 1. zero the gradients
      optimizer.zero_grad()
      # step 2. compute the output
      # squeeze(1) removes only the per-sample singleton dimension (assumed
      # batch shape (batch, 1, seq_len)); a bare squeeze() would also drop
      # the batch dimension whenever a batch holds a single sample.
      y_pred = classifier(batch_dict['vector'].squeeze(1), batch_dict['mask'].squeeze(1))
      # step 3. compute the loss and fold it into the running mean
      loss = loss_func(y_pred, batch_dict['volume'])
      loss_t = loss.item()
      running_loss += (loss_t - running_loss) / (batch_index + 1)

      # step 4. use loss to produce gradients
      loss.backward()

      # step 5. use optimizer to take gradient step
      optimizer.step()

      # compute the accuracy
      acc_t = compute_accuracy(y_pred, batch_dict['volume'])
      running_acc += (acc_t - running_acc) / (batch_index + 1)

      # update bar
      train_bar.set_postfix(loss=running_loss, acc=running_acc,
                      epoch=epoch_index)
      train_bar.update()

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # ---------------- validation pass ----------------
    # setup: batch generator, set loss and acc to 0; set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    # No gradients are needed for validation; skipping the autograd graph
    # saves memory and time without changing the computed values.
    with torch.no_grad():
      for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = classifier(batch_dict['vector'].squeeze(1), batch_dict['mask'].squeeze(1))

        # compute the loss
        loss = loss_func(y_pred, batch_dict['volume'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['volume'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        val_bar.set_postfix(loss=running_loss, acc=running_acc,
                        epoch=epoch_index)
        val_bar.update()

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

    # checkpointing / early-stopping bookkeeping, then the LR schedule
    train_state = update_train_state(args=args, model=classifier,
                                      train_state=train_state)

    scheduler.step(train_state['val_loss'][-1])

    # Early stopping is deliberately left disabled here:
    # if train_state['stop_early']:
    #   break

    # rewind the per-split bars for the next epoch
    train_bar.n = 0
    val_bar.n = 0
    epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(FloatProgress(value=0.0, description='training routine', max=1.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=3567.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=1188.0, style=ProgressStyle(description_w…

In [17]:
# compute the metrics on the test set using the checkpoint saved during training

metric_collection = MetricCollection([
    Accuracy(),
    Precision(num_classes=47, average='micro'),
    Recall(num_classes=47, average='micro'),
    F1(num_classes=47, average='micro')
]).to(args.device)

# map_location lets a checkpoint saved on GPU load on a CPU-only runtime
classifier.load_state_dict(torch.load(args.model_state_file,
                                      map_location=args.device))

classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()

# Evaluate every test sample exactly once: no shuffling, and keep the final
# partial batch (the generator's defaults would drop it from the metrics).
dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   drop_last=False,
                                   device=args.device)

# ceil division: the trailing partial batch counts as a step too
test_bar = tqdm(desc='split=test',
                          total=-(-len(dataset) // args.batch_size),
                          position=1,
                          leave=True)

running_acc = 0.
n_seen = 0
classifier.eval()

# Inference only: disable autograd to save memory and time.
with torch.no_grad():
  for batch_index, batch_dict in enumerate(batch_generator):
      # compute the output; squeeze(1) keeps the batch dimension even when
      # the last batch holds a single sample
      y_pred = classifier(batch_dict['vector'].squeeze(1), batch_dict['mask'].squeeze(1), apply_softmax=True)

      # No loss is computed here: y_pred already holds probabilities, and
      # CrossEntropyLoss expects raw logits.

      # compute the accuracy as a sample-weighted running mean, so a smaller
      # final batch does not count as much as a full one
      n_batch = batch_dict['volume'].size(0)
      n_seen += n_batch
      acc_t = compute_accuracy(y_pred, batch_dict['volume'])
      running_acc += (acc_t - running_acc) * n_batch / n_seen

      metric_collection.update(y_pred, batch_dict['volume'])
      test_bar.update()

metrics = metric_collection.compute()
print(metrics)
# train_state['test_acc'] = running_acc
print(running_acc)

RuntimeError: ignored