In [1]:
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch
from bert_score import score
from bert_score import plot_example
import sys

# Locate the data folder relative to the directory the notebook is started from.
current_directory = os.getcwd()
data_folder_path = os.path.join(current_directory, '..', 'data')

# Pass every path component separately: a backslash inside a normal string
# literal is an (invalid) escape sequence and only works as a separator on Windows.
data_file_path = os.path.join(data_folder_path, 'candidates', 'candidates_annotated_sentences.csv')
df = pd.read_csv(data_file_path)

# Everything except these three columns is treated as a candidate column.
exclude_columns = ['original', 'source', 'target']

df_candidates = df.loc[:, ~df.columns.isin(exclude_columns)]


df.head(3)

Unnamed: 0,original,source,target,candidate_mbart_v1,candidate_ul2_v1,candidate_mbart_v2,candidate_ul2_v2,candidate_mbart_v3,candidate_ul2_v3,candidate_mbart_v4,candidate_ul2_v4,candidate_ul2_v3_late,candidate_mbart_v3_late,candidate_mbart_v4_late,candidate_ul2_mc4
0,mw vanmorgen gedoucht.,mevrouw vanmorgen gedoucht.,Mevrouw is vanmorgen gedoucht.,Mevrouw is vanmorgen gedoucht.,Mevrouw vanmorgen gedoucht.,Mevrouw vanmorgen gedoucht.,Mevrouw vanmorgen gedoucht.,Mevrouw heeft vanmorgen gedoucht.,Mevrouw heeft vanmorgen gedoucht.,Mevrouw was vanmorgen gedoucht.,Mevrouw heeft vanmorgen gedoucht.,Mevrouw heeft vanmorgen gedoucht.,Vanmorgen mevrouw gedoucht.,Mevrouw heeft vanmorgen gedoucht.,Mevrouw vanmorgen gedoucht.
1,Alles schoon aan gedaan kon geen schonen BH vi...,Alles schoon aan gedaan kon geen schonen BH vi...,Ik heb haar schone kleren aangedaan alleen kon...,Alles schoon aan gedaan en kon geen schone BH ...,Alles schoon aan gedaan maar kon geen schone B...,"Alles schoon aan gedaan, maar kon geen schone ...","Alles schoon aan gedaan, kon geen schone BH vi...",Alles schoon aan gedaan en ik kon geen schone ...,Alles schoon aan gedaan en kon geen schone BH ...,Alles schoon aan gedaan en kon geen schone BH ...,"Alles schoon aan gedaan, maar kon geen schone ...","Alles schoon aan gedaan, maar kon geen schone ...",Alles schoon aan gedaan en kon geen schone BH ...,"Alles schoon aan gedaan, maar ik kon geen scho...","Alles schoon aan gedaan, kon geen schone BH vi..."
2,Kleding die erhing heb ik in de wasmand gedaan.,Kleding die erhing heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,Kleding die ergering heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,Kleding die erhing heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,"Kleding die erhing, heb ik in de wasmand gedaan.",Kleding die er hing heb ik in de wasmand gedaan.,Kleding die er hing heb ik in de wasmand gedaan.,Kleding die erhing heb ik in de wasmand gedaan.,Kleding die erger was heb ik in de wasmand ged...,Kleding die er hing heb ik in de wasmand gedaan.


In [2]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sentence, ignoring masked-out positions.

    Args:
        model_output: indexable model result whose element 0 holds the token
            embeddings (assumed to be shaped (batch, tokens, dim) -- the sum
            below runs over axis 1).
        attention_mask: mask with one entry per token; it is unsqueezed on the
            last axis and expanded to the size of the token embeddings.

    Returns:
        The masked sum of the token embeddings divided by the number of
        unmasked tokens (clamped to at least 1e-9 to avoid dividing by zero).
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_count

# HuggingFace checkpoint used for the sentence-similarity metric.
_ROBBERT_MQA_CHECKPOINT = 'jegorkitskerkin/robbert-v2-dutch-base-mqa-finetuned'
# Filled on first use so that repeated calls do not reload the weights.
_ROBBERT_MQA_CACHE = {}


def _load_robbert_mqa():
    """Return the (tokenizer, model) pair, loading it only on the first call."""
    if 'pair' not in _ROBBERT_MQA_CACHE:
        _ROBBERT_MQA_CACHE['pair'] = (
            AutoTokenizer.from_pretrained(_ROBBERT_MQA_CHECKPOINT),
            AutoModel.from_pretrained(_ROBBERT_MQA_CHECKPOINT),
        )
    return _ROBBERT_MQA_CACHE['pair']


def dutch_robbert_evaluation(candidates, references):
    """Cosine similarity between each candidate and its reference sentence.

    Every (reference, candidate) pair is encoded together, mean-pooled with
    ``mean_pooling`` and compared through the dot product of the L2-normalised
    sentence embeddings.

    Args:
        candidates: sequence of candidate sentences.
        references: sequence of reference sentences, aligned with ``candidates``.

    Returns:
        A list with one float similarity per pair, in input order.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(candidates) != len(references):
        raise ValueError("Length of candidate and reference texts must be equal.")
    if len(candidates) == 0:
        # Nothing to score, so do not pay for loading the model.
        return []

    # The checkpoint is loaded once and reused; this function is called once
    # per candidate column, so reloading on every call is pure overhead.
    tokenizer, model = _load_robbert_mqa()

    results = []
    for reference, candidate in zip(references, candidates):
        # Pairs are encoded one at a time so padding only depends on the pair itself.
        encoded_input = tokenizer([reference, candidate], padding=True, truncation=True, return_tensors='pt')

        # Inference only: no gradients are needed.
        with torch.no_grad():
            model_output = model(**encoded_input)

        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Unit-length vectors turn the dot product into the cosine similarity.
        reference_embedding = F.normalize(sentence_embeddings[0], p=2, dim=0)
        candidate_embedding = F.normalize(sentence_embeddings[1], p=2, dim=0)
        results.append(float(torch.dot(reference_embedding, candidate_embedding)))

    return results

In [3]:
def bertscorer(candidates, references, bert_model, num_layers):
    """Compute BERTScore precision, recall and F1 for aligned sentence lists.

    Args:
        candidates: candidate sentences.
        references: reference sentences, aligned with ``candidates``.
        bert_model: model identifier forwarded to ``score`` as ``model_type``,
            e.g. 'textgain/allnli-GroNLP-bert-base-dutch-cased' or
            'GroNLP/bert-base-dutch-cased'.
        num_layers: which layer of the model to take the embeddings from. The
            best layer differs per model; see
            https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit#gid=0
            Both arguments are required here; ``evaluation`` in this notebook
            supplies 'bert-base-multilingual-cased' with layer 9 by default.

    Returns:
        Three lists (precision, recall, F1), one value per sentence pair,
        each rounded to 3 decimals.
    """
    precision, recall, f1 = score(candidates, references, model_type=bert_model, num_layers=num_layers)

    def _rounded(values):
        # ``values`` exposes ``tolist()``; round every entry to 3 decimals.
        return [round(value, 3) for value in values.tolist()]

    return _rounded(precision), _rounded(recall), _rounded(f1)

In [4]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu_scores(candidates, references):
    """Sentence-level BLEU-2 and BLEU-4 for aligned candidate/reference strings.

    Sentences are tokenised on whitespace and each candidate is scored
    against its single reference with uniform n-gram weights.

    Args:
        candidates: iterable of candidate sentences (strings).
        references: iterable of reference sentences, aligned with ``candidates``.

    Returns:
        A tuple ``(bleu_2_scores, bleu_4_scores)`` of two lists with one score
        per pair.
    """
    # Flat weight tuples (one weight per n-gram order). Wrapping them in a
    # list selects the multi-weight form, which only newer NLTK releases
    # understand; a flat tuple gives the same single score and also works on
    # older releases.
    weights_2 = (1. / 2., 1. / 2.)
    weights_4 = (1. / 4., 1. / 4., 1. / 4., 1. / 4.)

    bleu_scores_2 = []
    bleu_scores_4 = []
    for candidate, reference in zip(candidates, references):
        candidate_tokens = candidate.split()  # whitespace tokenisation
        reference_tokens = reference.split()  # whitespace tokenisation

        # sentence_bleu expects a list of references; there is exactly one here.
        bleu_scores_2.append(sentence_bleu([reference_tokens], candidate_tokens, weights_2))
        bleu_scores_4.append(sentence_bleu([reference_tokens], candidate_tokens, weights_4))

    return bleu_scores_2, bleu_scores_4


In [5]:
from rouge import Rouge

def calculate_rouge_scores(candidates, references):
    """Collect the ROUGE-1, ROUGE-2 and ROUGE-L F-scores per sentence pair.

    Args:
        candidates: candidate sentences.
        references: reference sentences, aligned with ``candidates``.

    Returns:
        Three lists (rouge-1, rouge-2, rouge-l) holding the 'f' value of every
        entry returned by ``Rouge().get_scores``.
    """
    pair_scores = Rouge().get_scores(candidates, references)

    def _f_values(metric):
        # Pull the 'f' entry of one metric out of every per-pair result.
        return [pair_scores[position][metric]['f'] for position in range(len(pair_scores))]

    return _f_values('rouge-1'), _f_values('rouge-2'), _f_values('rouge-l')

In [14]:
import nltk
from nltk.translate import meteor_score

# Fetch the 'punkt' and 'wordnet' NLTK packages (a no-op when already up to date,
# as the captured output shows). Presumably needed by nltk.word_tokenize and
# meteor_score used in this cell -- confirm for the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')

def calculate_meteor(candidate, reference):
    """METEOR score of one candidate sentence against one reference sentence.

    Both strings are tokenised with ``nltk.word_tokenize`` (called without a
    language argument) and the reference is passed as the only entry of the
    reference list.
    """
    hypothesis = nltk.word_tokenize(candidate)
    reference_list = [nltk.word_tokenize(reference)]
    return meteor_score.meteor_score(reference_list, hypothesis)

def compare_texts(candidates, references):
    """Return the METEOR score of every aligned candidate/reference pair.

    Args:
        candidates: candidate sentences.
        references: reference sentences; must have the same length.

    Returns:
        A list with one ``calculate_meteor`` result per pair, in input order.
    """
    assert len(candidates) == len(references), "Length of candidate and reference texts must be equal."

    return [
        calculate_meteor(candidate, reference)
        for candidate, reference in zip(candidates, references)
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\f.tomassen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\f.tomassen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# NOTE(review): hard-coded absolute path to a local checkout -- this cell only
# works on a machine with the same drive layout. Consider deriving it from a
# configurable project root instead.
sys.path.append('E:/Data_exploration/GitHub/paraphraser_code/fine-tuning')
# Project-local module; presumably resolved through the path appended above.
from gleu_mine import gleu_calculator_mine
def calculate_gleu_scores(baselines, candidates, references, n=4,num_iterations=500):
    """Score every candidate with ``gleu_calculator_mine``.

    For position ``i`` the calculator receives the baseline (source) sentence,
    the candidate and a one-element list holding the reference; the first
    element of its result is converted to ``float``.

    Args:
        baselines: source sentences, indexable by position.
        candidates: candidate sentences; their count sets the number of scores.
        references: reference sentences, indexable by position.
        n: forwarded to the calculator as ``n``.
        num_iterations: forwarded to the calculator as ``num_iterations``.

    Returns:
        A list of floats, one per candidate.
    """
    return [
        float(
            gleu_calculator_mine(
                baselines[position],
                candidates[position],
                [references[position]],
                n=n,
                num_iterations=num_iterations,
            )[0]
        )
        for position in range(len(candidates))
    ]


In [15]:
import pandas as pd

def evaluation(baselines, candidates, targets, index, bert_model='bert-base-multilingual-cased', num_layers=9):
    """Score one set of candidates against the targets with every metric.

    Args:
        baselines: source sentences (used by GLEU and echoed in the output).
        candidates: candidate sentences to evaluate.
        targets: reference sentences, aligned with ``candidates``.
        index: label used as the single row index of the overview frame.
        bert_model: model identifier handed to ``bertscorer``.
        num_layers: layer number handed to ``bertscorer``.

    Returns:
        ``(extensive_df, overview_df)``, both rounded to 3 decimals:
        a per-sentence frame with the inputs and all scores, and a one-row
        frame with the average of each metric.
    """
    # Only the BERTScore F1 is reported; precision and recall are discarded.
    _, _, bert_f1 = bertscorer(candidates, targets, bert_model, num_layers)
    gleu = calculate_gleu_scores(baselines, candidates, targets, 4, 500)
    similarity = dutch_robbert_evaluation(candidates, targets)

    bleu_2, bleu_4 = calculate_bleu_scores(candidates, targets)
    rouge_1, rouge_2, rouge_l = calculate_rouge_scores(candidates, targets)
    meteor = compare_texts(candidates, targets)

    # One row per sentence pair.
    per_sentence = pd.DataFrame({
        "Source": baselines,
        "Candidates": candidates,
        "GLEU+": gleu,
        "Targets": targets,
        "Bertscore score": bert_f1,
        "Sentence similarity score": similarity,
        "Bleu 2": bleu_2,
        "Bleu 4": bleu_4,
        "Rouge 1": rouge_1,
        "Rouge 2": rouge_2,
        "Rouge l": rouge_l,
        "METEOR": meteor,
    })

    # One row with the mean of every metric, labelled with ``index``.
    pair_count = len(candidates)
    averages = pd.DataFrame(
        {
            "Average Bertscore score": sum(bert_f1) / pair_count,
            "Average sentence similarity score": sum(similarity) / pair_count,
            'GLEU': sum(gleu) / len(gleu),
            "Bleu 2": sum(bleu_2) / pair_count,
            "Bleu 4": sum(bleu_4) / pair_count,
            "Rouge 1": sum(rouge_1) / pair_count,
            "Rouge 2": sum(rouge_2) / pair_count,
            "Rouge l": sum(rouge_l) / pair_count,
            "METEOR": sum(meteor) / pair_count,
        },
        index=[index],
    )

    return per_sentence.round(3), averages.round(3)
  

In [16]:
# Show the candidate columns, i.e. every column of df that is not in exclude_columns.
df_candidates.columns

Index(['candidate_mbart_v1', 'candidate_ul2_v1', 'candidate_mbart_v2',
       'candidate_ul2_v2', 'candidate_mbart_v3', 'candidate_ul2_v3',
       'candidate_mbart_v4', 'candidate_ul2_v4', 'candidate_ul2_v3_late',
       'candidate_mbart_v3_late', 'candidate_mbart_v4_late',
       'candidate_ul2_mc4'],
      dtype='object')

In [17]:
sources = list(df['source'])
targets = list(df['target'])

# All per-model result files go into <data>/sentence_similarity; create the
# folder up front so a long evaluation run cannot fail at the final write.
similarity_folder_path = os.path.join(data_folder_path, 'sentence_similarity')
os.makedirs(similarity_folder_path, exist_ok=True)

for candidates in df_candidates.columns:
    # `candidates` is the column name; the column values are the sentences to score.
    extensive_df, overview_df = evaluation(sources, list(df[candidates]), targets, index=candidates)
    df_extension = candidates.replace('candidate_', '')
    df_extensive_name = f"df_extensive_{df_extension}"
    df_overview_name = f"df_overview_{df_extension}"

    # Expose the frames as df_extensive_<model> / df_overview_<model> variables
    # (a later cell reads them by name). globals() is the supported way to do
    # this; the mapping returned by locals() is not meant to be modified.
    globals()[df_extensive_name] = extensive_df
    globals()[df_overview_name] = overview_df

    # Build the file names from separate components instead of embedding a
    # backslash in the string (invalid escape sequence, Windows-only separator).
    extensive_name = os.path.join(similarity_folder_path, 'extensive_' + df_extension + '.csv')
    overview_name = os.path.join(similarity_folder_path, 'overview_' + df_extension + '.csv')

    # NOTE(review): index=False drops the overview frame's index (the candidate
    # column name), so only the file name identifies the model.
    extensive_df.to_csv(extensive_name, index=False)
    overview_df.to_csv(overview_name, index=False)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc
Asking to truncate to max_length but no maximum length is provided and the mod

In [18]:
# Stack the one-row overview frames into a single comparison table.
# The df_overview_* variables are created dynamically by the evaluation loop,
# so that cell has to run first. The *_late variants are not part of this list.
dfs = [
    df_overview_ul2_v1,
    df_overview_ul2_v2,
    df_overview_ul2_v3,
    df_overview_ul2_v4,
    df_overview_mbart_v1,
    df_overview_mbart_v2,
    df_overview_mbart_v3,
    df_overview_mbart_v4,
    df_overview_ul2_mc4,
]
df_together = pd.concat(dfs)
df_together

Unnamed: 0,Average Bertscore score,Average sentence similarity score,GLEU,Bleu 2,Bleu 4,Rouge 1,Rouge 2,Rouge l,METEOR
candidate_ul2_v1,0.933,0.933,0.522,0.685,0.531,0.818,0.685,0.805,0.835
candidate_ul2_v2,0.942,0.945,0.549,0.715,0.551,0.826,0.699,0.815,0.865
candidate_ul2_v3,0.938,0.944,0.537,0.695,0.539,0.82,0.685,0.809,0.854
candidate_ul2_v4,0.94,0.945,0.541,0.704,0.543,0.822,0.688,0.809,0.861
candidate_mbart_v1,0.933,0.93,0.508,0.681,0.522,0.819,0.68,0.805,0.841
candidate_mbart_v2,0.934,0.935,0.525,0.683,0.526,0.814,0.673,0.802,0.858
candidate_mbart_v3,0.936,0.934,0.512,0.68,0.514,0.809,0.668,0.795,0.854
candidate_mbart_v4,0.937,0.942,0.522,0.694,0.519,0.82,0.681,0.807,0.855
candidate_ul2_mc4,0.942,0.93,0.539,0.716,0.559,0.827,0.702,0.815,0.87


In [None]:
# Display the full annotated-sentences frame (this cell has no execution count).
df

In [13]:
# Save the combined overview; index=True keeps the candidate names as the first column.
# NOTE(review): the bare file name is resolved against the current working
# directory, whereas the per-model CSVs are written under data_folder_path -- confirm
# this is the intended location.
df_together.to_csv('overview_all.csv', index=True)