# Install and Load specific libraries

In [None]:
!pip install --upgrade pip
!git clone https://github.com/google-research/bleurt.git
!pip install bert_score
!pip install datasets
!pip install sacrebleu
!pip install -q transformers
!pip install --upgrade nltk
!pip install rouge_score

import os
os.chdir('bleurt')

!pip install .

os.chdir('..')

In [None]:
"""The training, testing and validation data can be downloaded from here: 
    https://drive.google.com/drive/folders/18VKDa4cB8gW8pMypARc6pyBWGFFUIPKw?usp=sharing
"""

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import re
import os
import unicodedata
import codecs
import math
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
import datetime
import time
from sklearn import preprocessing
import copy
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel, BertConfig
from datasets import load_metric
import tensorflow as tf
import utils
import Models
import shutil
import nltk
# Fetch the WordNet corpus through NLTK's downloader.
# NOTE(review): presumably needed by the METEOR metric loaded in the next cell — confirm.
nltk.download('wordnet')

# Register a string flag named 'f' with an empty default value and empty help text.
# NOTE(review): this looks like the usual workaround for the `-f <connection file>`
# argument Jupyter passes to the kernel, which flag parsing would otherwise
# reject — confirm it is still required.
tf.compat.v1.flags.DEFINE_string('f','','')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Instantiate the automatic evaluation metrics once; the training and inference
# cells below reuse these objects (BERTScore, sacreBLEU, METEOR, ROUGE).
bertscore, bleu, meteor, rouge = (
    load_metric(metric_id)
    for metric_id in ("bertscore", "sacrebleu", "meteor", "rouge")
)

# Run Experiments: Model Training

In [None]:
# Experiment grid: the training loop below runs one experiment per row.
experiments_df = pd.read_csv(filepath_or_buffer="./experiment_parameters.csv")

# Root directory that receives the `results/` and `models/` outputs.
# Replace the "<path>" placeholder before running the cells below.
PATH = "<path>"

In [None]:
""" This loop will run all the 60 experiments seqentially, and save the best model
    and training + validation metrics. It is recommended to break down the 
    experiments_df into sets of experiments, and run the sets independently on multiple GPUs
    to reduce overall runtime.
"""

def _load_pickle(path):
  """Unpickle the file at `path`, closing the handle once loading is done.

  NOTE: `pickle.load` can execute arbitrary code, so only point this at
  trusted files.
  """
  with open(path, 'rb') as fh:
    return pickle.load(fh)


def _drop_indices(items, bad_indices):
  """Return `items` as a new list without the elements at `bad_indices`."""
  bad = set(bad_indices)
  return [item for pos, item in enumerate(items) if pos not in bad]


for ix, row in experiments_df.iterrows():
  name = row['name']
  loss = row['loss']
  wordnet = row['wordnet']
  similarity = row['similarity']
  dataset = row['dataset']
  smoothen = row['smoothen']

  # Output file names depend only on the experiment name, so derive them once
  # here; they are then defined even if the epoch loop below runs zero times.
  model_name = name+".pt"
  best_model_name = "Best-"+model_name
  results_name = model_name+'_results.pkl'

  # Remove a handful of examples which have errors in them: the example at
  # index 4690 of the ED validation set, nine examples from the DD train set,
  # and one example from the DD test set.
  if dataset == 'empatheticdialogues':
    experiment_dict = _load_pickle('./experiment_dict_v3.pkl')
    word_similarity_labels = _load_pickle('./word_similarity_labels_v2.pkl')
    for key in ('X_chat', 'Y_chat'):
      experiment_dict['valid_dict'][key] = _drop_indices(experiment_dict['valid_dict'][key], [4690])

  else:
    experiment_dict = _load_pickle('./experiment_dict_dailydialogue.pkl')
    word_similarity_labels = _load_pickle('./word_similarity_labels_dailydialogue.pkl')
    train_ = [4433, 9123, 9124, 10075, 13853, 18344, 22919, 27316, 28067]
    test_ = [123]
    for key in ('X_chat', 'Y_chat'):
      experiment_dict['train_dict'][key] = _drop_indices(experiment_dict['train_dict'][key], train_)
      experiment_dict['test_dict'][key] = _drop_indices(experiment_dict['test_dict'][key], test_)

  print(experiment_dict.keys())
  print(len(experiment_dict['vocab_word2vec']))

  # Create the config for the current experiment.
  config = {'name':name,
            'wordnet':wordnet,
            'dataset':dataset,
            'tgt_sos':0,
            'src_pad_idx':1,
            'tgt_pad_idx':1,
            'device':torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            'loss':loss,
            'smoothing': smoothen,
            'hid_dim':300,
            'out_dim':len(experiment_dict['word2index']),
            'pretrained_encoder':'none',
            'encoder_trainable':True,
            'max_len':50,
            'max_tgt_len':150,
            'n_layers':3,
            'n_heads':10,
            'pf_dim':512,
            'dropout':0.1,
            'batch_size':64,
            'clip':1.0,
            'optimizer':'AdamW',
            'lr':2e-4,
            'embedding_pretrained':True,
            'epochs':15,
            'sample_index':0,
            'cosine_threshold':similarity,
            'cosine_power':7,
            'weights_matrix':utils.get_word_vectors(experiment_dict['vocab_word2vec'])
            }
  # Loss-specific settings are merged last, so they win on any duplicate key.
  loss_settings = utils.get_loss_settings(loss)
  config = {**config, **loss_settings}
  print(config)

  # Format the similarity matrix, with or without the WordNet similarity labels.
  if config['wordnet'] == 0:
    experiment_dict['similarity'] = utils.format_similarity_matrix_smoothen(experiment_dict['similarity'],
                                                                            config['cosine_threshold'],
                                                                            config['smoothing'])
  else:
    experiment_dict['similarity'] = utils.format_similarity_matrix_wordnet_smoothen(experiment_dict['similarity'],
                                                                                    word_similarity_labels,
                                                                                    config['cosine_threshold'],
                                                                                    config['smoothing'])
  model, criterion, optimizer = utils.init_all(config)
  tot_t_loss, tot_v_loss = [], []
  # `bleurt_score_list` is never appended to in this loop; it is passed on empty.
  bert_score_list, bleu_score_list, bleurt_score_list, bert_score_hash_list = [], [], [], []

  best_valid_loss = float('inf')

  # Train and validate the model for the current experiment.
  # All the losses and metrics are logged.
  for epoch in range(1, config['epochs']+1):
    # The iterators are rebuilt at the start of every epoch.
    train_iterator, valid_iterator, test_iterator = utils.get_iterators(experiment_dict, config)
    start_time = time.time()
    tr_l = utils.train(model, train_iterator, optimizer, criterion, config['clip'])
    tot_t_loss.append(tr_l)
    tr_v, hypothesis, corpus = utils.evaluate(model, valid_iterator, criterion,
                                              experiment_dict['index2word'],
                                              config, test=False)
    tot_v_loss.append(tr_v)
    end_time = time.time()
    epoch_mins, epoch_secs = utils.epoch_time(start_time, end_time)

    bleu_score = bleu.compute(predictions=hypothesis, references=[[i] for i in corpus])
    bert_score = bertscore.compute(predictions=hypothesis, references=corpus, lang='en')

    # 'f1' may be a tensor or already a plain list, depending on the installed
    # `datasets` version; only call `.tolist()` when it is available.
    f1_scores = bert_score['f1']
    if hasattr(f1_scores, 'tolist'):
      f1_scores = f1_scores.tolist()
    bert_score_list.append(np.mean(f1_scores))
    bleu_score_list.append(bleu_score['score'])
    bert_score_hash_list.append(bert_score['hashcode'])

    print("\nMETRIC scores : ")
    print("BLEU: \n", bleu_score_list[-1])
    print("BERTScore: \n", bert_score_list[-1])
    print("Few hypothesis:\n",hypothesis[:4])
    print("Few corpus:\n",corpus[:4])
    print("\n")
    # Checkpoint whenever the validation loss improves.
    if tr_v < best_valid_loss:
      best_valid_loss = tr_v
      torch.save(model.state_dict(), best_model_name)
      print("\nBest Model Saved !!")
    print("\n")
    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Total Loss: {tr_l:.3f}')
    print(f'\tVal. Total Loss: {tr_v:.3f}')
    print("_________________________________________________________________")

  results_dict = utils.make_results_dict(tot_t_loss, tot_v_loss, bert_score_list, bleu_score_list, bleurt_score_list, bert_score_hash_list, best_valid_loss, model_name, config)
  # Close (and therefore flush) the results file before it is copied.
  with open('./'+results_name, 'wb') as fh:
    pickle.dump(results_dict, fh)
  shutil.copy2('./'+results_name, PATH+'/results/')
  shutil.copy2('./'+best_model_name, PATH+'/models/')
  print("EXPERIMENT DONE !")

# Model Inference

In [None]:
# Locations under PATH of the saved results, the saved models, and the
# experiment parameter file.
results_loc = f"{PATH}/results/"
models_loc = f"{PATH}/models/"
experiments_loc = f"{PATH}/experiment_parameters.csv"

In [None]:
# Names of the experiments to analyse, taken from the `name` column of the
# experiment parameter file. A set gives constant-time membership tests below.
# NOTE(review): assumes `experiments_loc` holds the same parameter file that was
# used for training — confirm.
all_valid_names = set(pd.read_csv(experiments_loc)['name'])

# Keep only the result files whose name, minus the '.pt_results.pkl' suffix,
# is a known experiment name.
result_filenames = [fn for fn in os.listdir(results_loc)
                    if fn.replace('.pt_results.pkl', '') in all_valid_names]

# Same filter for the saved models, after stripping '.pt' and 'Best-'.
model_filenames = [fn for fn in os.listdir(models_loc)
                   if fn.replace('.pt', '').replace('Best-', '') in all_valid_names]

len(result_filenames), len(model_filenames)

In [None]:
# Read every saved results file and group its contents by dataset.
# All accumulators are (re)initialised here so the cell runs on a fresh kernel
# and gives the same result when re-run.
name_to_model_map = {}             # experiment name -> {'model_name', 'config'}
dailydialog_results = {}           # experiment name -> train / validation loss lists
empatheticdialogues_results = {}   # experiment name -> train / validation loss lists
dailydialog_best_val = []          # [shortened name, max of the saved 'bleu_score_list']
empatheticdialogues_best_val = []  # [shortened name, max of the saved 'bleu_score_list']

for filename in result_filenames:
    # NOTE: `pickle.load` can execute arbitrary code; only load trusted files.
    with open(results_loc+filename, 'rb') as fh:
        saved = pickle.load(fh)
    exp_name = saved['config']['name']
    loss_curves = {'tot_t_loss': saved['tot_t_loss'],
                   'tot_v_loss': saved['tot_v_loss']}
    name_to_model_map[exp_name] = {'model_name': saved['model_name'],
                                   'config': saved['config']}
    best_bleu = max(saved['bleu_score_list'])
    # Strip the date suffix and the dataset prefix to get a short display name.
    short_name = exp_name.replace('-dt-01_03_2021', '')
    if saved['config']['dataset'] == 'dailydialog':
        dailydialog_results[exp_name] = loss_curves
        dailydialog_best_val.append([short_name.replace('ds-dailydialog-', ''), best_bleu])
    else:
        empatheticdialogues_results[exp_name] = loss_curves
        empatheticdialogues_best_val.append([short_name.replace('ds-empatheticdialogues-', ''), best_bleu])

In [None]:
""" For each experiment, this loop will load the best Pytorch model, run the model on the 
    test set, and compute the automatic evaluation metrics """
    
# Maps experiment name -> test-set metrics plus the generated / reference texts.
test_dict_results = {}
for k,v in name_to_model_map.items():
    print("Processing for ",k)
    config = v['config']
    model_name = v['model_name']

    # Load the dataset of this experiment and drop the known-bad examples.
    # Files are opened with `with` so the handles are closed right after loading.
    # NOTE: `pickle.load` can execute arbitrary code; only load trusted files.
    if config['dataset'] == 'empatheticdialogues':
        with open('./experiment_dict_v3.pkl', 'rb') as fh:
            experiment_dict = pickle.load(fh)
        with open('./word_similarity_labels_v2.pkl', 'rb') as fh:
            word_similarity_labels = pickle.load(fh)
        for key in ('X_chat', 'Y_chat'):
            experiment_dict['valid_dict'][key] = [ex for pos, ex in enumerate(experiment_dict['valid_dict'][key]) if pos != 4690]
    else:
        with open('./experiment_dict_dailydialogue.pkl', 'rb') as fh:
            experiment_dict = pickle.load(fh)
        with open('./word_similarity_labels_dailydialogue.pkl', 'rb') as fh:
            word_similarity_labels = pickle.load(fh)
        train_ = [4433, 9123, 9124, 10075, 13853, 18344, 22919, 27316, 28067]
        test_ = [123]
        for split, bad in (('train_dict', set(train_)), ('test_dict', set(test_))):
            for key in ('X_chat', 'Y_chat'):
                experiment_dict[split][key] = [ex for pos, ex in enumerate(experiment_dict[split][key]) if pos not in bad]

    config['weights_matrix'] = utils.get_word_vectors(experiment_dict['vocab_word2vec'])

    test_iterator = utils.get_batch_data(experiment_dict, 'test_dict', config, plus=0)

    model, criterion, optimizer = utils.init_all(config)

    # The training loop saves its checkpoint as "Best-" + model_name. Use the
    # stored `model_name` as-is when that file exists, otherwise fall back to
    # the "Best-"-prefixed file name.
    model_path = models_loc+model_name
    prefixed_path = models_loc+"Best-"+model_name
    if not os.path.exists(model_path) and os.path.exists(prefixed_path):
        model_path = prefixed_path
    model.load_state_dict(torch.load(model_path))
    model.eval()

    tr_v, hypothesis, corpus = utils.evaluate(model, test_iterator, criterion,
                                              experiment_dict['index2word'],
                                              config, test=True)
    # sacreBLEU expects a list of references per prediction, hence `[[i] ...]`.
    wrapped_refs = [[i] for i in corpus]
    bleu_score = bleu.compute(predictions=hypothesis, references=wrapped_refs)
    bert_score = bertscore.compute(predictions=hypothesis, references=corpus, lang='en')
    mime_bleu_score = utils.get_BLEU_score(hypothesis, [[i] for i in corpus])
    test_dict_results[k] = {'bleu_score':bleu_score,
                            'bert_score':bert_score,
                            'mime_bleu_score':mime_bleu_score,
                            'hypothesis':hypothesis,
                            'corpus':corpus}

In [None]:
""" This loop will compute the Rouge 1, 2, L and the Meteor score for each experiment"""

# Maps experiment name -> the existing test results plus ROUGE / METEOR scores.
test_dict_results_enriched = {}
for k,v in test_dict_results.items():
  rouge_score = rouge.compute(predictions=v['hypothesis'], references=v['corpus'],
                              rouge_types=["rouge1", "rouge2", "rougeL"])
  meteor_score = meteor.compute(predictions=v['hypothesis'], references=v['corpus'])

  # Build a new dict instead of aliasing `v`, so the entries of
  # `test_dict_results` are not mutated by this cell.
  # For each ROUGE type the mid-point F-measure of the aggregate is stored.
  test_dict_results_enriched[k] = {
      **v,
      'rouge1': rouge_score['rouge1'].mid.fmeasure,
      'rouge2': rouge_score['rouge2'].mid.fmeasure,
      'rougeL': rouge_score['rougeL'].mid.fmeasure,
      'meteor': meteor_score['meteor'],
  }

In [None]:
""" This loop consolidates all the calculated metrics for each experiment in a 
    Pandas Data Frame, and saves it as a csv file"""
# One row of metrics per experiment.
test_results_lst = []
for k,v in test_dict_results_enriched.items():
  sacrebleu_score = v['bleu_score']['score']
  # Named `mime_bleu` rather than `bleu` so the `bleu` metric object used by the
  # training and inference cells is not overwritten (which would break re-runs).
  mime_bleu = v['mime_bleu_score']
  # No visible cell stores a 'bleurt_score' entry, so fall back to NaN instead
  # of raising KeyError; a stored value is used when present.
  bleurt_score = v.get('bleurt_score', float('nan'))
  # 'f1' may be a tensor or a plain list depending on the installed `datasets`
  # version: keep the tensor path and average lists with numpy.
  f1 = v['bert_score']['f1']
  bert_f1 = f1.mean().item() if hasattr(f1, 'mean') else float(np.mean(f1))
  # NOTE(review): assumes every experiment name splits into exactly 12
  # '-'-separated fields (tag/value pairs); any other shape raises ValueError.
  _,ds_name,_,loss,_,smoothing,_,similarity,_,word_net,_,_ = k.split('-')
  test_results_lst.append([ds_name, loss, smoothing, similarity, word_net,
                          sacrebleu_score, mime_bleu, bleurt_score, bert_f1, v['rouge1'], v['rouge2'],
                          v['rougeL'], v['meteor']])

enriched_test_results_df = pd.DataFrame(test_results_lst, columns = ['dataset', 'loss', 'smoothing', 'similarity', 'word_net',
                                                               'sacrebleu', 'bleu', 'bleurt', 'bert_score', 'rouge1',
                                                               'rouge2', 'rougeL', 'meteor'])

enriched_test_results_df.to_csv('./enriched_test_results_df.csv')