# Environment Preparation

## Mount Drive and Install Dependencies

In [None]:
#Mount Drive and install dependencies
def install_dependecies():
  """Install the pinned versions of transformers, pytorch-lightning and optuna.

  The `!pip` lines are IPython shell escapes, so this function is only
  usable inside an IPython/Jupyter session, not under a plain interpreter.
  """
  !pip install transformers==4.6.1
  !pip install pytorch-lightning==1.3.5
  !pip install optuna==2.8.0

from sys import path
import os
import sys

# Treat the session as Google Colab when the string form of the IPython shell
# mentions 'google.colab'; otherwise assume a local checkout.
if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  # Repository location inside the mounted Google Drive.
  root_PATH = '/content/drive/My Drive/topic-mining-paper/repository'
  drive_mount_location = '/content/drive'
  module_path = root_PATH + '/src'
  
  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)

  # Dependencies are only installed in the Colab branch.
  install_dependecies()
else:
  # Local run: paths are resolved relative to the current working directory.
  root_PATH = os.path.abspath("../..")
  module_path = os.path.abspath(os.path.join('../../src'))

# IPython magics: reload edited modules automatically before executing code.
%load_ext autoreload
%autoreload 2

# Make the repository's src directory importable (added once).
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import pandas as pd
from torch import cuda
from transformers import BertTokenizer, BertModel, BertConfig
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
import torch.nn as nn
import transformers
import numpy as np
from sklearn import metrics
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score
from pytorch_lightning.loggers import TensorBoardLogger
import gc 
import optuna

## Helper Classes

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item.

    Reads the ``text`` and ``target`` columns of ``dataframe`` and returns,
    for each position, fixed-length tensors built from
    ``tokenizer.encode_plus``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Keep references to the data, the tokenizer and the target length.

        Args:
            dataframe: pandas DataFrame exposing ``text`` and ``target`` columns.
            tokenizer: object with an ``encode_plus`` method returning
                ``input_ids``, ``attention_mask`` and ``token_type_ids``.
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.text
        self.targets = dataframe.target
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded tensors and target for the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``,
            all ``torch.long`` tensors.
        """
        # Samplers hand out positions 0..len-1. Use positional access so the
        # dataset also works when the frame's index labels are not 0..len-1
        # (e.g. a filtered frame that was not reset).
        comment_text = str(self.comment_text.iloc[index])
        target = self.targets.iloc[index]

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long),
        }

In [None]:
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import numpy as np
import abc
 
from re import T
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_fscore_support
import pandas as pd
 
class CustomModel(pl.LightningModule):
    """Shared training/validation logic for the classifiers in this notebook.

    Stores the datasets, hyperparameters and optional training sampler,
    implements the Lightning step/epoch hooks and metric reporting, and
    leaves the network definition (``define_model``) and ``forward`` to
    subclasses.

    ``hyperparams`` is read with the keys ``learning_rate``, ``shuffle``,
    ``train_batch_size`` and ``validation_batch_size``.
    """

    def __init__(self, hyperparams, training_dataset, validation_dataset, labels, model_to_use, tokenizer, training_sampler):
        """Store the configuration and build the network via ``define_model``.

        Args:
            hyperparams: dict of hyperparameters (see class docstring for keys).
            training_dataset: dataset served by ``train_dataloader``.
            validation_dataset: dataset served by ``val_dataloader``.
            labels: sequence of class labels; subclasses size their head from it.
            model_to_use: model identifier forwarded to ``define_model``.
            tokenizer: tokenizer forwarded to ``define_model``.
            training_sampler: optional sampler for the training DataLoader;
                when given, shuffling is disabled.
        """
        super().__init__()

        self.hyperparams = hyperparams
        self.training_dataset = training_dataset
        self.validation_dataset = validation_dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.define_model(model_to_use, tokenizer)
        self.training_sampler = training_sampler

        # Best validation F1 seen so far and the epoch it was reached in.
        self.best_f1 = 0
        self.epoch_best_f1 = 0

    @abc.abstractmethod
    def define_model(self, model_to_use, tokenizer):
        """Create the network layers; implemented by subclasses."""
        pass

    @abc.abstractmethod
    def forward(self, ids, mask, token_type_ids):
        """Return the class logits for a batch; implemented by subclasses."""
        pass

    def loss_fn(self, outputs, targets):
        """Cross-entropy loss between logits ``outputs`` and class ``targets``."""
        return torch.nn.CrossEntropyLoss()(outputs, targets)

    def general_step(self, batch, batch_idx, mode):
        """Run a forward pass on ``batch`` and return logits with their targets.

        ``batch_idx`` and ``mode`` are accepted for symmetry with the
        Lightning hooks but are not used.
        """
        outputs = self.forward(batch['ids'], batch['mask'], batch['token_type_ids'])
        return {'outputs': outputs, 'targets': batch['targets']}

    # ****** Training ******
    # This method runs on each GPU
    def training_step(self, batch, batch_idx):
        return self.general_step(batch, batch_idx, "train")

    # This method aggregates the results of training_step in the different GPUs
    def training_step_end(self, aggregated_outputs):
        loss = self.loss_fn(aggregated_outputs["outputs"], aggregated_outputs["targets"])
        self.log('training_loss', loss)
        return {'loss': loss}

    # This method runs at the end of each epoch
    def training_epoch_end(self, results_of_each_batch):
        pass

    # ****** Validation ******
    # This method runs in each GPU
    def validation_step(self, batch, batch_idx):
        return self.general_step(batch, batch_idx, "val")

    # This method aggregates the results of validation_step in the different GPUs
    def validation_step_end(self, aggregated_outputs):
        """Turn the batch logits into predicted class indexes (as numpy arrays)."""
        # Cast to float32 on the CPU so the arg-max does not depend on the
        # training precision.
        outputs = aggregated_outputs['outputs'].detach().cpu().float()
        _, prediction_indexes = torch.max(outputs, dim=1)
        targets = aggregated_outputs['targets'].detach().cpu().numpy()

        return {'predictions': prediction_indexes.numpy(), 'targets': targets}

    # This method runs at the end of each epoch
    def validation_epoch_end(self, results_of_each_batch):
        """Compute and print epoch-level metrics and track the best F1.

        Metrics are computed for the positive class (label 1) over all
        validation batches of the epoch. ``f1`` is logged so callbacks can
        monitor it.
        """
        # The leading empty array keeps the result well-defined (and float
        # typed) even when no batch results were collected.
        targets = np.concatenate(
            [np.empty([0])] + [result['targets'] for result in results_of_each_batch])
        predictions = np.concatenate(
            [np.empty([0])] + [result['predictions'] for result in results_of_each_batch])

        accuracy = accuracy_score(targets, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(targets,
                                                                   predictions,
                                                                   average="binary",
                                                                   pos_label=1,
                                                                   zero_division=0)
        try:
            auc_score = roc_auc_score(targets, predictions)
        except ValueError:
            # roc_auc_score is undefined when the targets contain a single
            # class; report 0 in that case instead of hiding every error.
            auc_score = 0

        results = {'accuracy': accuracy,
                   'precision': precision,
                   'recall': recall,
                   'f1': f1,
                   'auc': auc_score
                   }

        self.log('f1', f1)

        # The pre-training sanity check validates only a few batches with an
        # untrained head; that partial score must not count as the best F1.
        sanity_checking = getattr(self.trainer, 'sanity_checking', False)
        if not sanity_checking and f1 > self.best_f1:
            self.best_f1 = f1
            self.epoch_best_f1 = self.current_epoch

        print(f'epoch {self.current_epoch}: {results}')

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(params=self.parameters(), lr=self.hyperparams["learning_rate"])

    # ****** Dataloaders ******
    def train_dataloader(self):
        """Training DataLoader; a sampler, when set, replaces shuffling."""
        # DataLoader rejects shuffle=True together with an explicit sampler.
        shuffle = self.hyperparams["shuffle"] if self.training_sampler is None else False

        return DataLoader(self.training_dataset,
                          batch_size=self.hyperparams["train_batch_size"],
                          shuffle=shuffle,
                          num_workers=2,
                          sampler=self.training_sampler)

    def val_dataloader(self):
        """Validation DataLoader in fixed (unshuffled) order."""
        return DataLoader(self.validation_dataset,
                          batch_size=self.hyperparams["validation_batch_size"],
                          shuffle=False,
                          num_workers=2)

In [None]:
from transformers import BertTokenizer, BertModel, BertConfig
import transformers
import torch
 
class BERTCustomModel(CustomModel):
    """BERT encoder followed by dropout and a linear classification head."""

    def define_model(self, model_to_use, tokenizer):
        """Load the pretrained encoder and build the classification head.

        Args:
            model_to_use: name or path passed to ``BertModel.from_pretrained``.
            tokenizer: tokenizer whose length sets the embedding matrix size.
        """
        self.l1 = transformers.BertModel.from_pretrained(model_to_use)
        # Match the embedding matrix to the tokenizer length, which may differ
        # from the checkpoint's vocabulary if tokens were added.
        self.l1.resize_token_embeddings(len(tokenizer))

        self.l2 = torch.nn.Dropout(0.3)
        # Take the head's input width from the encoder configuration instead
        # of hard-coding 768, so checkpoints with another hidden size work.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, len(self.labels))

    def forward(self, ids, mask, token_type_ids):
        """Return one logit per label for each sequence in the batch."""
        transformer_output = self.l1(input_ids=ids, token_type_ids=token_type_ids, attention_mask=mask)
        hidden_state = transformer_output.last_hidden_state

        cls_state = hidden_state[:, 0]  # CLS token
        return self.l3(self.l2(cls_state))

In [None]:
from enum import Enum
 
# Base directory under which train_models_all_categories builds its per-experiment log paths.
logs_PATH = '/tmp'
 
def category_to_number(category):
    """Return the numeric code of a category abbreviation.

    'irr' maps to 0, 'pbr' to 1 and 'inq' to 2; any other value also
    yields 0.
    """
    # Position in this tuple is the numeric code.
    known_categories = ('irr', 'pbr', 'inq')
    for number, abbreviation in enumerate(known_categories):
        if category == abbreviation:
            return number
    return 0
 
def define_target(row, category):
    """Return 1 when ``row.category`` equals ``category``, otherwise 0."""
    return 1 if row.category == category else 0
 
class Category(Enum):
    """Category codes compared against the dataset's ``category`` column.

    train_models_all_categories iterates this enum in declaration order and
    trains one binary model per member.
    """
    PROBLEM = 'pbr'
    INQUIRY = 'inq'
    IRRELEVANT = 'irr'
 
class Language(Enum):
    """Language codes compared against the dataset's ``lang`` column."""
    ENGLISH = 'en'
    ITALIAN = 'it'
 
class Source(Enum):
    """Data-source codes compared against the dataset's ``source`` column."""
    APP = 'app_review'
    TWITTER = 'tweet'
 
def train_model(train_df, validation_df, model_to_use, model_params, logs_path, undersampling, max_epochs,save_checkpoints, early_stop, gpus_to_use,max_len):
    """Fine-tune a binary BERT classifier and return its best validation F1.

    Args:
        train_df: training DataFrame with ``text`` and ``target`` columns
            (targets are the class labels 0 and 1).
        validation_df: validation DataFrame with the same columns.
        model_to_use: pretrained model name/path for tokenizer and encoder.
        model_params: hyperparameter dict handed to the model
            (learning rate, batch sizes, shuffle flag).
        logs_path: directory used for checkpoints when ``save_checkpoints``.
        undersampling: when true, draw training samples with a
            class-balancing weighted sampler (with replacement).
        max_epochs: maximum number of training epochs.
        save_checkpoints: keep the checkpoint with the highest ``f1``.
        early_stop: stop when ``f1`` stops improving (patience 1).
        gpus_to_use: value forwarded to ``pl.Trainer(gpus=...)``.
        max_len: token length every sample is padded/truncated to.

    Returns:
        The best validation F1 recorded by the model (``model.best_f1``).
    """
    labels = [0, 1]

    if undersampling:
        # Sampler configuration. Counts must be indexed by class label:
        # value_counts() orders by frequency, which swaps the weights whenever
        # class 1 is the majority. bincount is label-ordered and also covers a
        # class that is absent from this split.
        targets = torch.tensor(train_df['target'].tolist(), dtype=torch.long)
        class_count = torch.bincount(targets, minlength=len(labels))

        class_weights = 1. / class_count.to(torch.float)
        class_weights_all = class_weights[targets]

        weighted_sampler = WeightedRandomSampler(
            weights=class_weights_all,
            num_samples=len(class_weights_all),
            replacement=True
        )
    else:
        weighted_sampler = None

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_to_use)
    tokenizer.add_tokens("@mention", special_tokens=False)

    # Datasets
    training_set = CustomDataset(train_df, tokenizer, max_len)
    validation_set = CustomDataset(validation_df, tokenizer, max_len)

    # Training
    model = BERTCustomModel(model_params,
                            training_set,
                            validation_set,
                            labels,
                            model_to_use,
                            tokenizer,
                            weighted_sampler)

    callbacks = []

    if save_checkpoints:
        checkpoint_callback = ModelCheckpoint(
            monitor='f1',
            dirpath=logs_path,
            save_top_k=1,
            mode='max',
        )
        callbacks.append(checkpoint_callback)

    if early_stop:
        # F1 is a higher-is-better metric; the callback's default mode is
        # 'min', which would stop as soon as F1 stopped *decreasing*.
        early_stop_callback = pl.callbacks.EarlyStopping(monitor='f1', patience=1, mode='max')
        callbacks.append(early_stop_callback)

    trainer = pl.Trainer(gpus=gpus_to_use,
                         checkpoint_callback=save_checkpoints,
                         max_epochs=max_epochs,
                         callbacks=callbacks)

    try:
        trainer.fit(model)
    finally:
        # Release Python and CUDA memory even if training fails, since this
        # function is called many times in a row.
        gc.collect()
        torch.cuda.empty_cache()

    return model.best_f1

from sklearn.model_selection import KFold

def objective(trial, dataset, validation_batch_size, shuffle, gpus_to_use,max_len, train_batch_size, model_to_use=None, undersampling=None):
    """Optuna objective: mean best F1 of a 3-fold cross-validation.

    Samples a learning rate from ``trial`` and trains one model per fold for
    two epochs.

    Args:
        trial: Optuna trial used to sample ``LEARNING_RATE``.
        dataset: DataFrame to split into folds (needs ``text``/``target``).
        validation_batch_size: batch size of the validation DataLoader.
        shuffle: whether the training DataLoader shuffles.
        gpus_to_use: value forwarded to the trainer.
        max_len: token length every sample is padded/truncated to.
        train_batch_size: batch size of the training DataLoader.
        model_to_use: pretrained model name/path. When omitted, the
            module-level ``model_to_use`` is used (legacy behaviour).
        undersampling: whether to use the class-balancing sampler. When
            omitted, the module-level ``undersampling`` is used (legacy).

    Returns:
        Mean over the folds of the best validation F1.
    """
    # Earlier callers relied on notebook-level globals for these two values;
    # keep that fallback so existing calls still work.
    if model_to_use is None:
        model_to_use = globals()['model_to_use']
    if undersampling is None:
        undersampling = globals()['undersampling']

    max_epochs = 2
    save_checkpoints = False
    early_stop = False
    logs_path = "/tmp/hyperparam-optim/logs"

    model_params = {
        'learning_rate': trial.suggest_uniform("LEARNING_RATE", 1e-05, 1e-04),
        'train_batch_size': train_batch_size,
        'validation_batch_size': validation_batch_size,
        'shuffle': shuffle
    }

    # Perform cross-validation
    fold = KFold(n_splits=3, shuffle=True, random_state=0)
    scores = []

    for train_positions, validation_positions in fold.split(dataset):
        # Fresh 0..n-1 index per fold so positional and label lookups agree.
        train_df = dataset.iloc[train_positions].reset_index(drop=True)
        validation_df = dataset.iloc[validation_positions].reset_index(drop=True)

        best_f1 = train_model(train_df, validation_df, model_to_use, model_params, logs_path,
                              undersampling, max_epochs, save_checkpoints, early_stop,
                              gpus_to_use, max_len)
        scores.append(best_f1)

    return np.mean(scores)


def train_models_all_categories(language,source,model_to_use,data_df,undersampling, multilingual_model, experiment_name):
    """Tune and train one binary classifier per ``Category``.

    For every category the rows are labelled one-vs-rest, the learning rate
    is tuned with Optuna (10 trials of 3-fold cross-validation on the
    training split) and a final model is trained for two epochs and
    validated on the held-out rows.

    Args:
        language: ``Language`` member selecting the validation language.
        source: ``Source`` member selecting the validation source.
        model_to_use: pretrained model name/path.
        data_df: DataFrame with ``text``, ``category``, ``lang``, ``source``
            and ``is_test`` columns.
        undersampling: whether to train with the class-balancing sampler.
        multilingual_model: when true, train on every row whose language
            differs from ``language``; otherwise train on the non-test rows
            of the same language and source.
        experiment_name: label used in the log path and the study name.
    """
    # Configuration variables (identical for every category)
    MAX_LEN = 200
    VALID_BATCH_SIZE = 32
    TRAIN_BATCH_SIZE = 32
    gpus_to_use = [0]
    max_epochs = 2
    save_checkpoints = False
    early_stop = False

    for category in Category:
        print(f'Model: {model_to_use}, Language: {language.value}, Source: {source.value}, Category: {category.value}({category_to_number(category.value)}), Undersampling: {undersampling}, Multilingual: {multilingual_model}')

        logs_path = logs_PATH + f'/logs/{experiment_name}/{model_to_use}-{source.value}-{category.value}-{language.value}'
        print(f'logs_path: {logs_path}')

        # One-vs-rest target: 1 for rows of the current category, else 0.
        preprocessed_data_df = data_df.copy()
        preprocessed_data_df['target'] = (preprocessed_data_df['category'] == category.value).astype(int)

        same_language = preprocessed_data_df['lang'] == language.value
        same_source = preprocessed_data_df['source'] == source.value
        is_test = preprocessed_data_df['is_test'] == 1

        # drop=True: the old row labels are not needed and keeping them would
        # only add (possibly clashing) 'index'/'level_0' columns.
        validation_df = preprocessed_data_df[same_language & same_source & is_test].reset_index(drop=True)

        if multilingual_model:
            train_df = preprocessed_data_df[preprocessed_data_df['lang'] != language.value].reset_index(drop=True)
        else:
            train_df = preprocessed_data_df[same_language &
                                            same_source &
                                            (preprocessed_data_df['is_test'] == 0)].reset_index(drop=True)

        print(f'training dataset size: {len(train_df)}')
        print(f'validation dataset size: {len(validation_df)}')

        print('******************************************************************')

        # Hyperparameter optimization. The model name and sampling flag are
        # passed explicitly so the objective does not depend on whatever the
        # notebook globals happen to hold.
        study_name = f'{experiment_name}/{model_to_use}-{source.value}-{category.value}-{language.value}'
        study = optuna.create_study(study_name=study_name, direction="maximize")
        study.optimize(lambda trial: objective(trial, train_df, VALID_BATCH_SIZE, True, gpus_to_use,
                                               MAX_LEN, TRAIN_BATCH_SIZE,
                                               model_to_use=model_to_use,
                                               undersampling=undersampling),
                       n_trials=10)

        # Training best model for 2 epochs
        model_params = {
            'learning_rate': study.best_params['LEARNING_RATE'],
            'train_batch_size': TRAIN_BATCH_SIZE,
            'validation_batch_size': VALID_BATCH_SIZE,
            'shuffle': True
        }

        print(f'------------ Training Best Model for {experiment_name}/{model_to_use}-{source.value}-{category.value}-{language.value}------------------------')
        train_model(train_df, validation_df, model_to_use, model_params, logs_path, undersampling,
                    max_epochs, save_checkpoints, early_stop, gpus_to_use, MAX_LEN)
        print('******************************************************************')
    print('******************************************************************')

## Read Data

In [None]:
# Path of the preprocessed dataset CSV below the repository root (root_PATH).
data_location = root_PATH + '/data/preprocessed-AIRE-dataset/dataset-preprocessed.csv'
# reset_index() without drop=True keeps the previous row labels as an extra 'index' column.
data_df = pd.read_csv(data_location).reset_index()

# Experiments

## BERT Monolingual

In [None]:
# Monolingual setup: a separate pretrained checkpoint per language.
multilingual_model = False
model_to_use_english = 'bert-base-uncased'
model_to_use_italian = 'dbmdz/bert-base-italian-cased'

# Download the models and tokenizers if they are not in the cache (so the download bars don't clutter the training logs)
BertTokenizer.from_pretrained(model_to_use_english)
BertModel.from_pretrained(model_to_use_english)

BertTokenizer.from_pretrained(model_to_use_italian)
BertModel.from_pretrained(model_to_use_italian)

### BERT Monolingual With Undersampling

In [None]:
# Settings read by the experiment cells that follow: weighted sampling on.
undersampling = True
experiment_name = 'BERT-monolingual-with-undersampling'

In [None]:
# English: train the per-category models for every member of Source.
language = Language.ENGLISH
model_to_use = model_to_use_english

for source in Source:
  train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)

# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Italian: only the Twitter source is run.
language = Language.ITALIAN
source = Source.TWITTER
model_to_use = model_to_use_italian

train_models_all_categories(language,source,model_to_use,data_df,undersampling, multilingual_model,experiment_name)

# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

### BERT Monolingual **NO** Undersampling

In [None]:
# Settings read by the experiment cells that follow: weighted sampling off.
undersampling = False
experiment_name = 'BERT-monolingual-no-undersampling'

In [None]:
# English: train the per-category models for every member of Source.
language = Language.ENGLISH
model_to_use = model_to_use_english

for source in Source:
  train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)
  
# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Italian: only the Twitter source is run.
language = Language.ITALIAN
source = Source.TWITTER
model_to_use = model_to_use_italian

train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)

# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

## BERT Multilingual

In [None]:
# Multilingual setup: both languages use the same multilingual checkpoint.
multilingual_model = True
model_to_use_english = 'bert-base-multilingual-uncased'
model_to_use_italian = 'bert-base-multilingual-uncased'

# Download the models and tokenizers if they are not in the cache (so the download bars don't clutter the training logs)
BertTokenizer.from_pretrained(model_to_use_english)
BertModel.from_pretrained(model_to_use_english)

# Both names point at the same checkpoint here, so these two calls repeat the ones above.
BertTokenizer.from_pretrained(model_to_use_italian)
BertModel.from_pretrained(model_to_use_italian)

### BERT Multilingual With Undersampling

In [None]:
# Settings read by the experiment cells that follow: weighted sampling on.
undersampling = True
experiment_name = 'BERT-multilingual-with-undersampling'

In [None]:
# English: train the per-category models for every member of Source.
language = Language.ENGLISH
model_to_use = model_to_use_english

for source in Source:
  train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)
  
# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Italian: only the Twitter source is run.
language = Language.ITALIAN
source = Source.TWITTER
model_to_use = model_to_use_italian

train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)

# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

### BERT Multilingual **NO** Undersampling

In [None]:
# Settings read by the experiment cells that follow: weighted sampling off.
undersampling = False
experiment_name = 'BERT-multilingual-no-undersampling'

In [None]:
# English: train the per-category models for every member of Source.
language = Language.ENGLISH
model_to_use = model_to_use_english

for source in Source:
  train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)
  
# Free Python and CUDA memory before the next experiment cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Italian: only the Twitter source is run.
language = Language.ITALIAN
source = Source.TWITTER
model_to_use = model_to_use_italian

train_models_all_categories(language,source,model_to_use,data_df,undersampling,multilingual_model,experiment_name)

# Free Python and CUDA memory at the end of the run.
gc.collect()
torch.cuda.empty_cache()