In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[sentencepiece]
!pip install tqdm
!pip install rouge

In [None]:
import nltk
from datasets import load_dataset
import pandas as pd
import tqdm

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look
# it up under the name 'punkt_tab', older ones under 'punkt', so fetch both.
for nltk_resource in ('punkt', 'punkt_tab'):
  nltk.download(nltk_resource)

# Data Loading

In [None]:
# Load the first 10% of the 'GEM/xmediasum' validation split and collect the
# Chinese and German reference summaries from each example.
val_data = load_dataset('GEM/xmediasum', split='validation[:10%]')
actual_zh = [example['summary_zh'] for example in val_data]
actual_de = [example['summary_de'] for example in val_data]

# Sanity check: both reference lists should have one entry per example.
print(len(actual_zh))
print(len(actual_de))

# Read Output of the Fine-Tuned Model ('Generated Text' column)

In [None]:
finetune_data = pd.read_csv('/content/Abstractive_0_predictions_t5_small_ex_final.csv')
# Flatten the 'Generated Text' column into a plain list of strings. An empty
# prediction is read back from CSV as NaN, so map those to '' to keep every
# entry a string while preserving row order and count. (No loop variable named
# `sum` is used, so the builtin stays usable in later cells.)
finetune_summaries = finetune_data['Generated Text'].fillna('').astype(str).tolist()

# Read Output of the Few-Shot Model

In [None]:
# Read one few-shot summary per line. Only the line terminator is removed, so
# the number and order of entries is unchanged.
with open('/content/few_shot_extractive_r_0.5_train_10_val_10.txt', encoding='utf-8') as f:
  few_shot_summaries = [line.rstrip('\n') for line in f]

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from tqdm.notebook import tqdm_notebook
from rouge import Rouge

In [None]:
from torch import cuda

# Use the GPU when PyTorch reports one is available; otherwise run on the CPU.
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

In [None]:
# Load the pretrained "facebook/m2m100_418M" tokenizer and model, and move the
# model onto the selected device.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(device)

In [None]:
def translate(summary, target_language):
  """Translate one English summary using the notebook-level model and tokenizer.

  Args:
    summary: Text to translate; it is tokenized with the source language set to "en".
    target_language: Language code handed to `tokenizer.get_lang_id`; its id is
      forced as the first generated token.

  Returns:
    The first decoded string of the generated batch, with special tokens removed.
  """
  tokenizer.src_lang = "en"
  model_inputs = tokenizer(summary, return_tensors="pt").to(device)
  target_bos_id = tokenizer.get_lang_id(target_language)
  output_ids = model.generate(**model_inputs, forced_bos_token_id=target_bos_id).to(device)
  decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  return decoded[0]

In [None]:
def get_translations(arr, target_language):
  """Translate every entry of `arr` into `target_language`, showing a progress bar.

  Args:
    arr: Sequence of texts; each one is passed to `translate`.
    target_language: Language code forwarded unchanged to `translate`.

  Returns:
    A list of translated strings in the same order as `arr`.
  """
  return [
    translate(text, target_language)
    for text in tqdm_notebook(arr, desc='Completed')
  ]

In [None]:
finetune_zh = get_translations(finetune_summaries, 'zh')
finetune_de = get_translations(finetune_summaries, 'de')
# The few-shot translations must be produced from the few-shot model's own
# summaries, not from the fine-tuned ones. Strip any surrounding whitespace or
# line terminators so only the summary text itself is translated.
fewshot_inputs = [line.strip() for line in few_shot_summaries]
fewshot_zh = get_translations(fewshot_inputs, 'zh')
fewshot_de = get_translations(fewshot_inputs, 'de')

In [None]:
def create_translation_file(arr, fname):
  """Write each string in `arr` to `fname`, one entry per line, as UTF-8.

  Args:
    arr: Iterable of strings (e.g. translated summaries).
    fname: Path of the file to create or overwrite.

  The text is written unmodified. Filtering through ASCII would delete every
  non-ASCII character, i.e. essentially all Chinese text and German umlauts.
  """
  # The context manager closes the file even if a write fails part-way.
  with open(fname, 'w', encoding='utf-8') as out_file:
    for text in arr:
      out_file.write(text)
      out_file.write('\n')

In [None]:
# Persist each set of translations under its own file name.
translation_outputs = (
  (finetune_zh, 'Finetune_Zh'),
  (finetune_de, 'Finetune_De'),
  (fewshot_zh, 'Fewshot_Zh'),
  (fewshot_de, 'Fewshot_De'),
)
for translations, out_name in translation_outputs:
  create_translation_file(translations, out_name)

In [None]:
def get_bleu_response(actual, translated):
  """Return the mean sentence-level BLEU score of `translated` against `actual`.

  Args:
    actual: Sequence of reference strings, one per example.
    translated: Sequence of candidates aligned by index with `actual`. Each
      entry is either the candidate string itself or a list/tuple whose first
      element is the candidate string.

  Returns:
    The average `sentence_bleu` score over all examples, or 0.0 when `actual`
    is empty.
  """
  # Avoid dividing by zero when there is nothing to score.
  if len(actual) == 0:
    return 0.0

  total_score = 0
  for i in range(len(actual)):
    candidate = translated[i]
    # Only unwrap a nested [text] entry. Indexing a plain string with [0]
    # would score just its first character instead of the whole translation.
    if not isinstance(candidate, str):
      candidate = candidate[0]
    # sentence_bleu takes a list of tokenized references and one tokenized hypothesis.
    reference_tokens = [nltk.word_tokenize(actual[i])]
    candidate_tokens = nltk.word_tokenize(candidate)
    total_score += nltk.translate.bleu_score.sentence_bleu(reference_tokens, candidate_tokens)

  return total_score / len(actual)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Print the mean BLEU score for each model / target-language combination.
bleu_runs = (
  ('Finetune Zh', actual_zh, finetune_zh),
  ('Finetune De', actual_de, finetune_de),
  ('Few Shot Zh', actual_zh, fewshot_zh),
  ('Few Shot De', actual_de, fewshot_de),
)
for label, references, candidates in bleu_runs:
  print(f'{label} : {get_bleu_response(references, candidates)}')