# Requirements

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Loading the data

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, get_linear_schedule_with_warmup
import torch
import re

# Step 1: Prepare the Dataset
# Load the cleaned dataset
dataset_path = 'https://raw.githubusercontent.com/rajashekarcs2023/un-translation-system/refs/heads/main/truncated_dataset_40k.csv?token=GHSAT0AAAAAACU7KYP3LN4PWRPMP2NTEQAGZZQUV2A'
df = pd.read_csv(dataset_path)

# Drop rows with missing values if any
df.dropna(inplace=True)


# Preprocessing

In [8]:
# Data Cleaning Steps
# Remove any unwanted characters, extra spaces, and lowercasing text
def clean_text(text):
    text = text.lower()  # Lowercase all text
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip leading/trailing spaces
    return text

# Apply cleaning to both English and Spanish columns
df['English_clean'] = df['English'].apply(clean_text)
df['Spanish_clean'] = df['Spanish'].apply(clean_text)

# Remove rows where the cleaned texts are too short to be meaningful (e.g., less than 3 characters)
df = df[df['English_clean'].str.len() > 2]
df = df[df['Spanish_clean'].str.len() > 2]

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Format the dataset into separate lists for English and Spanish
def format_for_translation(df):
    return {
        "en": df["English_clean"].tolist(),
        "es": df["Spanish_clean"].tolist()
    }

train_data = format_for_translation(train_df)
val_data = format_for_translation(val_df)

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# Create a DatasetDict to hold training and validation splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

# Save the dataset for future use
dataset_dict.save_to_disk('/content/drive/My Drive/prepared_translation_dataset')


print("Dataset preparation complete. Training and validation datasets are ready.")

Saving the dataset (0/1 shards):   0%|          | 0/31976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7995 [00:00<?, ? examples/s]

Dataset preparation complete. Training and validation datasets are ready.


# Model Prep

In [9]:
# Step 2: Load and Fine-Tune MarianMT Model
# Load the MarianMT model for translation (e.g., English to Spanish)
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenize the dataset for fine-tuning
def tokenize_function(examples):
    inputs = tokenizer(examples['en'], max_length=128, truncation=True, padding='max_length')
    targets = tokenizer(examples['es'], max_length=128, truncation=True, padding='max_length')
    inputs['labels'] = targets['input_ids']
    return inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set up training arguments with learning rate scheduling

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,  # Adjusted learning rate to potentially improve convergence
    per_device_train_batch_size=8,  # Reduced batch size to prevent out of memory issues
    per_device_eval_batch_size=8,   # Reduced batch size for evaluation
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,  # Increased number of epochs to allow better learning
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if using a GPU
    logging_steps=50,  # Increased logging frequency for better tracking
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate a larger batch size
    lr_scheduler_type='linear',  # Linear learning rate scheduler
    warmup_steps=100
)

# Create a Seq2SeqTrainer for fine-tuning
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer
)



Map:   0%|          | 0/31976 [00:00<?, ? examples/s]

Map:   0%|          | 0/7995 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


# Training

In [10]:
# Start fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.272,0.243887
2,0.1639,0.210611
4,0.1029,0.208201


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=9990, training_loss=0.2087958314874628, metrics={'train_runtime': 2501.052, 'train_samples_per_second': 63.925, 'train_steps_per_second': 3.994, 'total_flos': 5418314650091520.0, 'train_loss': 0.2087958314874628, 'epoch': 4.998749061796348})

# Evaluation

In [11]:
# Step 3: Evaluate Model on Sample Data

for i in range(5):
    input_text = val_data['en'][i]
    actual_output = val_data['es'][i]
    # Perform translation using the fine-tuned model
    tokenized_text = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to('cuda' if torch.cuda.is_available() else 'cpu')
    predicted_tokens = model.generate(**tokenized_text)
    predicted_output = tokenizer.decode(predicted_tokens[0], skip_special_tokens=True)
    print(f"Input: {input_text}\nPredicted Output: {predicted_output}\nActual Output: {actual_output}\n")

Input: nicaragua 15 february 1978 a 17 march 1978
Predicted Output: nicaragua 15 de febrero de 1978a 17 de marzo de 1978
Actual Output: nicaragua 15 de febrero de 1978a 17 de marzo de 1978

Input: amendments to rules of procedure
Predicted Output: enmiendas del reglamento
Actual Output: enmiendas al reglamento interno

Input: the study which was conducted during june and july 1992 was prepared by mr colin wooles of the government of canada and mr edward b bergh from the ministry for foreign affairs of sweden
Predicted Output: el estudio que fue llevado a cabo durante junio y julio de 1992 fue preparado por el sr colin wooles del gobierno del canadá y el sr edward b bergh del ministerio de relaciones exteriores de suecia
Actual Output: el estudio que se realizó en junio y julio de 1992 fue preparado por el sr colin wooles del gobierno del canadá y el sr edward b bergh del ministerio de relaciones exteriores de suecia

Input: during its consideration of the abovementioned report the advi

# Saving the Model

In [13]:
from google.colab import drive
drive.mount('/content/drive')

# Save the fine-tuned model to Google Drive
model_save_path = '/content/drive/My Drive/fine_tuned_un_translation_model_BEST'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/My Drive/fine_tuned_un_translation_model_BEST/tokenizer_config.json',
 '/content/drive/My Drive/fine_tuned_un_translation_model_BEST/special_tokens_map.json',
 '/content/drive/My Drive/fine_tuned_un_translation_model_BEST/vocab.json',
 '/content/drive/My Drive/fine_tuned_un_translation_model_BEST/source.spm',
 '/content/drive/My Drive/fine_tuned_un_translation_model_BEST/target.spm',
 '/content/drive/My Drive/fine_tuned_un_translation_model_BEST/added_tokens.json')

In [14]:
model_save_path = './fine_tuned_un_translation_model_BEST'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


('./fine_tuned_un_translation_model_BEST/tokenizer_config.json',
 './fine_tuned_un_translation_model_BEST/special_tokens_map.json',
 './fine_tuned_un_translation_model_BEST/vocab.json',
 './fine_tuned_un_translation_model_BEST/source.spm',
 './fine_tuned_un_translation_model_BEST/target.spm',
 './fine_tuned_un_translation_model_BEST/added_tokens.json')

In [15]:
import shutil

shutil.make_archive('fine_tuned_un_translation_model_BEST', 'zip', model_save_path)

'/content/fine_tuned_un_translation_model_BEST.zip'

In [16]:
from google.colab import files

files.download('fine_tuned_un_translation_model_BEST.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bulk Evaluation

In [19]:
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def evaluate_translation(val_data, model, tokenizer):
    # Initialize a model for calculating similarity
    # similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Create a dataset with English, Spanish (original), Spanish (predicted), similarity score, and BLEU score
    results = []
    for i in range(len(val_data['en'])):
        input_text = val_data['en'][i]
        actual_output = val_data['es'][i]

        # Perform translation using the fine-tuned model
        tokenized_text = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to('cuda' if torch.cuda.is_available() else 'cpu')
        predicted_tokens = model.generate(**tokenized_text)
        predicted_output = tokenizer.decode(predicted_tokens[0], skip_special_tokens=True)

        # Calculate similarity score between actual and predicted


        # Calculate BLEU score between english and predicted


        # Append the results
        results.append({
            'English': input_text,
            'Spanish (Original)': actual_output,
            'Spanish (Predicted)': predicted_output,
            #'Similarity Score': similarity_score,
            #'BLEU Score': bleu_score
        })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Save the results to a CSV file
    results_df.to_csv('./translation_evaluation_results.csv', index=False, encoding='utf-8')

    print("Evaluation complete. Results saved to 'translation_evaluation_results.csv'")


In [22]:
#evaluate_translation(val_data, model, tokenizer)

# IRRELEVANT

In [None]:
# from transformers import MarianMTModel, MarianTokenizer

# # Load the MarianMT model for translation (e.g., English to Spanish)
# model_name = 'Helsinki-NLP/opus-mt-en-es'
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# model = MarianMTModel.from_pretrained(model_name)

# # Example sentences for translation
# sentences = [
#     "The Third Meeting of the Parties to the Montreal Protocol was held in Nairobi.",
#     "Opening remarks by the outgoing President of the Bureau were made.",
#     "The Parties discussed measures to protect the ozone layer."
# ]

# # Tokenize the input sentences
# tokenized_text = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

# # Perform the translation
# translated_tokens = model.generate(**tokenized_text)

# # Decode the translated tokens to get the translated sentences
# translated_sentences = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]

# # Print the translated sentences
# for i, translated_sentence in enumerate(translated_sentences):
#     print(f"Original: {sentences[i]}\nTranslated: {translated_sentence}\n")

In [None]:
from datasets import load_dataset
import numpy as np

# Replace with the path to your local CSV file
file_path = "truncated_dataset.csv"

# Load the dataset
dataset = load_dataset('csv', data_files=file_path)

In [None]:
# Display the dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['English', 'Spanish'],
        num_rows: 5000
    })
})


In [None]:
import re
import pandas as pd

def clean_text(text):
    # Lowercasing the text
    text = text.lower()
    # Removing special characters and unnecessary punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Removing leading/trailing whitespace
    text = text.strip()
    return text

# Assuming you have loaded your dataset with the datasets library, e.g.:
# dataset = Dataset.from_csv("/path/to/your/dataset.csv")

# Applying the cleaning function to the dataset columns
cleaned_dataset = dataset.map(lambda x: {
    'English_clean': clean_text(x['English']),
    'Spanish_clean': clean_text(x['Spanish'])
})

# Adding sentence length for both columns to analyze anomalies
cleaned_dataset = cleaned_dataset.map(lambda x: {
    'English_length': len(x['English_clean']),
    'Spanish_length': len(x['Spanish_clean'])
})

# Displaying some statistics for sentence lengths to help detect anomalies
import pandas as pd
length_stats = pd.DataFrame(cleaned_dataset['train'][['English_length', 'Spanish_length']]).describe()
print(length_stats)

# Optional: Filter out sentences that are too short or too long, if needed
def filter_length(examples, min_length=10, max_length=500):
    return (examples['English_length'] >= min_length) & (examples['English_length'] <= max_length) & \
           (examples['Spanish_length'] >= min_length) & (examples['Spanish_length'] <= max_length)

filtered_dataset = cleaned_dataset['train'].filter(filter_length)

# Now your dataset is cleaned, analyzed, and filtered for better training


ValueError: invalid literal for int() with base 10: 'Spanish_length'

In [None]:
from transformers import MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = examples['English']  # Your English text column
    targets = examples['Spanish']  # Your Spanish text column
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs



In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
train_test_split = tokenized_dataset['train'].train_test_split(test_size=0.2)
train_val_split = train_test_split['train'].train_test_split(test_size=0.1)

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']
test_dataset = train_test_split['test']

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_name)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.2,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True
)




In [None]:
from evaluate import load

# Load the desired metric
bleu_metric = load('bleu')  # For accuracy; use 'bleu' for BLEU score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}

In [None]:
from transformers import MarianMTModel, Seq2SeqTrainer

model = MarianMTModel.from_pretrained(model_name)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malma-gashi[0m ([33malma-gashi-aptitude[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu
1,No log,0.730281,0.0
2,0.757800,0.730812,0.0


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


KeyboardInterrupt: 

In [None]:
results = trainer.evaluate(test_dataset)
print(results)

{'eval_loss': 0.6965523958206177, 'eval_runtime': 3.6975, 'eval_samples_per_second': 270.455, 'eval_steps_per_second': 33.807, 'epoch': 3.0}


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
#model.save_pretrained('./fine-tuned-marianmt-en-es')
#tokenizer.save_pretrained('./fine-tuned-marianmt-en-es')