In [1]:
import pandas as pd
import os
import re
import torch

from io import StringIO
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from tqdm import tqdm
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer

os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
file_path = 'data/eng-spa.txt'
try:
    with open(file_path, 'r') as file:
        data = file.read()
except FileNotFoundError:
    print("The specified file was not found.")
except Exception as e:
    print("An error occurred:", e)

In [3]:
columns = ['English', 'Spanish']
df = []
for line in data.split('\n'):
    if line:
        english, spanish, *_ = line.split('\t')
        df.append([english, spanish])

df = pd.DataFrame(df, columns=columns)

In [4]:
# Removing full stops and punctuation
df['English'] = df['English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Spanish'] = df['Spanish'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['English'] = df['English'].str.lower()
df['Spanish'] = df['Spanish'].str.lower()
shuffled_df = df.sample(frac=1).reset_index(drop=True)
df = shuffled_df[:24000]


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_name = 'Helsinki-NLP/opus-mt-es-en'  # Spanish to English model
model = MarianMTModel.from_pretrained(model_name).to(device)
tokenizer = MarianTokenizer.from_pretrained(model_name)
translated_sentences = []
for sentence in tqdm(df['Spanish']):
    inputs = tokenizer.encode(sentence, return_tensors="pt").to(device)
    translated = model.generate(inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    translated_sentences.append(translated_text)

df['Translated_English'] = translated_sentences


Using device: cuda


  1%|          | 126/24000 [00:13<49:14,  8.08it/s]

In [None]:
df['Translated_English'] = df['Translated_English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Translated_English'] = df['Translated_English'].str.lower()

In [None]:
eng_ref = df['English'].tolist()
eng_hyp = df['Translated_English'].tolist()
bleu = BLEU()
result = bleu.corpus_score(eng_ref, [eng_hyp])

In [None]:
result

In [None]:
total_r1 = 0
total_r2 = 0
total_rl = 0
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for pred, target in zip(eng_hyp, eng_ref):
    scores = scorer.score(pred, target)
    total_r1 += scores['rouge1'].fmeasure
    total_r2 += scores['rouge2'].fmeasure
    total_rl += scores['rougeL'].fmeasure
    
print("ROGUE-1", 100*total_r1/len(eng_hyp))
print("ROGUE-2", 100*total_r2/len(eng_hyp))
print("ROGUE-L", 100*total_rl/len(eng_hyp))

In [None]:
input_sents = df["Spanish"].to_list()
target_sents = df["English"].to_list()
pred_sents = df["Translated_English"].to_list()


In [None]:
print("Input Sentence:", input_sents[0])
print("Target Sentence:", target_sents[0])
print("Predicted Sentence:", pred_sents[0])


In [None]:
print("Input Sentence:", input_sents[1])
print("Target Sentence:", target_sents[1])
print("Predicted Sentence:", pred_sents[1])


In [None]:
print("Input Sentence:", input_sents[5])
print("Target Sentence:", target_sents[5])
print("Predicted Sentence:", pred_sents[5])


In [None]:
print("Input Sentence:", input_sents[-6])
print("Target Sentence:", target_sents[-6])
print("Predicted Sentence:", pred_sents[-6])
