In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import torch


In [None]:
from transformers import MarianMTModel, MarianTokenizer

model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-tw")
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-tw")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# prompt: from twi_dict.csv, there are english words and one or many twi translations, seperated by a semi colon. read in the dataset, and ensure every word only has one translation, if there are more than one add a new entry with the other translation and remove the other translation from the original entry

import pandas as pd

# Read the CSV file
df = pd.read_csv('twi_dict.csv')

# Create a new DataFrame to store the updated data
new_df = pd.DataFrame(columns=['English', 'Twi'])

# Iterate over the rows in the original DataFrame
for index, row in df.iterrows():
  english_word = row['english']
  twi_translations = row['twi'].split(';')

  # If there's only one translation, add it to the new DataFrame
  if len(twi_translations) == 1:
    new_df = pd.concat([new_df, pd.DataFrame({'English': [english_word], 'Twi': [twi_translations[0]]})], ignore_index=True)

  # If there are multiple translations, create new entries for each
  else:
    for translation in twi_translations:
      new_df = pd.concat([new_df, pd.DataFrame({'English': [english_word], 'Twi': [translation]})], ignore_index=True)

# Print the new DataFrame
print(new_df)

#remove any entries containing ints or floats
new_df = new_df[~new_df['English'].str.contains(r'\d')]
new_df = new_df[~new_df['Twi'].str.contains(r'\d')]


          English                     Twi
0        hopeless         anidasoɔ nni mu
1        hopeless    deɛ anidasoɔ nni mu 
2         horizon  ewiem ne asase ahyiaeɛ
3            horn                    abɛn
4            horn                 abebɛn 
...           ...                     ...
8314  interchange                di nsesa
8315     interior                     emu
8316     keenness                   aniku
8317    interlude                ntwaremu
8318    interfere          twitwa anan mu

[8319 rows x 2 columns]


In [None]:
# prompt: add an id which is a string id

import uuid

# Generate unique IDs for each row
new_df['id'] = [str(uuid.uuid4()) for _ in range(len(new_df))]

# Print the updated DataFrame
print(new_df)


                                        id      English  \
0     ff754d39-e039-4bf1-bcf3-2bec77e2d55d     hopeless   
1     5fc00867-ca84-4c5d-9aa6-0b814f478297     hopeless   
2     c132e785-e0dd-40c9-b2c9-ea31c55020d0      horizon   
3     1244c350-5482-4c78-b5a9-05aaecac1272         horn   
4     21a73391-9365-44df-b4c4-ec5013fd650b         horn   
...                                    ...          ...   
8314  2fc951dd-2ef9-43f9-a145-0c5637cd5941  interchange   
8315  6262ece2-2540-492e-9a29-d0a133ba8f0e     interior   
8316  cc527a98-b569-483b-a573-4a116670e631     keenness   
8317  658b2de2-53c0-4b87-aac5-daa507f91124    interlude   
8318  11fd016b-57b2-46b2-90b7-bbaaffc4edc3    interfere   

                         Twi  
0            anidasoɔ nni mu  
1       deɛ anidasoɔ nni mu   
2     ewiem ne asase ahyiaeɛ  
3                       abɛn  
4                    abebɛn   
...                      ...  
8314                di nsesa  
8315                     emu  
8316        

In [None]:
# Load the dataset
train_df = pd.read_csv('MetaStuff/Train.csv')

import re

#add new df to train_df
train_df = pd.concat([train_df, new_df], ignore_index=True)



# Function to normalize text
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, keeping only alphanumeric and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)

    # Remove multiple spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply normalization to both English and Twi columns
train_df['English'] = train_df['English'].apply(normalize_text)
train_df['Twi'] = train_df['Twi'].apply(normalize_text)

# Show a few examples of normalized data
print(train_df[['English', 'Twi']].head())

# Save the normalized dataset if needed
# train_df.to_csv('train_normalized.csv', index=False)


# Convert it to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)

# Display a few records to ensure it's loaded correctly
train_dataset = train_dataset.rename_column('English', 'source')
train_dataset = train_dataset.rename_column('Twi', 'target')
print(train_dataset)

# Split the dataset into train and validation sets
train_test_split = train_dataset.train_test_split(test_size=0.05)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


                                             English  \
0           how best can we promote sports in ghana?   
1  challenging programs often call for a large am...   
2  they usually go into agreement that the first ...   
3                it enables children to grow up well   
4  education in africa is lagging behind because ...   

                                                 Twi  
0  kwan bn so na yebetumi ahy agumadi ho nkuran y...  
1  mpn pii no, nhyehye ahorow a emu y den hwehw s...  
2  mpn pii no, y a na wy adwene kor s de bdi kan ...  
3                            ma mmfra tumi nyini yie  
4  nhomasua a w abibiman mu aka nakyi esiane s su...  
Dataset({
    features: ['id', 'source', 'target'],
    num_rows: 13091
})


In [None]:
#model_name = "facebook/m2m100_418M"
#tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Set the language codes
tokenizer.src_lang = "en"  # English
tokenizer.tgt_lang = "tn"  # Twi

def preprocess_function(examples):
    inputs = examples['source']
    targets = examples['target']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing to dataset
train_dataset = train_dataset.map(preprocess_function, batched=True, load_from_cache_file=False)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, load_from_cache_file=False)



Map:   0%|          | 0/12436 [00:00<?, ? examples/s]



Map:   0%|          | 0/655 [00:00<?, ? examples/s]

In [None]:
# Data collator for padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load the ROUGE metric for evaluation
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects newline after each sentence
    decoded_preds = ["\n".join(pred.strip() for pred in decoded_pred.split()) for decoded_pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip() for label in decoded_label.split()) for decoded_label in decoded_labels]

    # Compute ROUGE
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Since the result is already in float format, we just multiply by 100
    result = {key: value * 100 for key, value in result.items()}

    return result


In [None]:
# prompt: I want to fine tune the model on the training dataset

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_new",
    evaluation_strategy="steps",
    per_device_train_batch_size=4,  # Adjust based on your memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    num_train_epochs=10,  # Adjust the number of epochs
    predict_with_generate=True,
    save_steps=500,
    eval_steps=500,
    logging_dir='./logs_new',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    save_total_limit=3,
    lr_scheduler_type="cosine",  # or "linear"
)

# Create the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)






In [21]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./trained_model_new")

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,0.3286,0.320402,23.644055,7.941135,22.228434,23.678144
1000,0.3087,0.28258,26.309214,9.822873,24.840863,26.282028
1500,0.2995,0.264141,28.067404,11.435534,26.668755,28.117659
2000,0.2439,0.253542,29.762264,12.43121,28.201075,29.779861
2500,0.2464,0.243546,31.262493,13.44296,29.70442,31.207889
3000,0.2327,0.235968,31.449403,13.919467,30.018372,31.436735
3500,0.2089,0.232874,32.102685,14.564479,30.672319,32.084213
4000,0.2018,0.228625,32.98479,15.233581,31.699176,32.987604
4500,0.1935,0.225862,32.143572,14.940347,30.879241,32.08686
5000,0.1721,0.224042,33.699899,15.645748,32.53175,33.669195


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[56999]], 'forced_eos_token_id': 0}


Buffered data was truncated after reaching the output size limit.

In [None]:
# prompt: Predict the Test dataset and store the predicitions in a csv called KnownModelPreds
import os
# Load the test set
test_df = pd.read_csv("MetaStuff/Test.csv")

# Ensure the test set has the correct columns
assert 'id' in test_df.columns and 'English' in test_df.columns, "Test set must contain 'id' and 'English' columns."

# Prepare the tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode

# Initialize the output CSV file
output_file = "KnownModelPreds.csv"

# If the output file doesn't exist, create it and write the header
if not os.path.exists(output_file):
    with open(output_file, "w") as f:
        f.write("id,Twi\n")  # Writing the header

# Tokenize the English sentences
def tokenize_function(texts):
    return tokenizer(texts, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

# Function to clean and format the translation
def clean_translation(translation):
    # Remove any commas in the translation
    translation = translation.replace(",", "")
    # Return the cleaned translation with quotes
    return translation.strip()

# Process the data in batches and save the output to CSV after every batch
batch_size = 16  # Adjust batch size based on memory capacity
for i in range(0, len(test_df), batch_size):
    batch = test_df.iloc[i:i + batch_size]

    # Tokenize the English sentences
    inputs = tokenize_function(batch["English"].tolist())

    # Move inputs to the correct device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate translations using the model
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

    # Decode the predictions into Twi translations
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Write the results to CSV after every batch
    with open(output_file, "a") as f:
        for j in range(len(decoded_preds)):
            # Clean the translation and write to the file
            #cleaned_translation = clean_translation(decoded_preds[j])
            f.write(f"{batch['id'].iloc[j]},{decoded_preds[j]}\n")

    print(f"Processed batch {i // batch_size + 1}/{len(test_df) // batch_size + 1}")

print(f"Translations saved to {output_file}")


Processed batch 1/201
Processed batch 2/201
Processed batch 3/201
Processed batch 4/201
Processed batch 5/201
Processed batch 6/201
Processed batch 7/201
Processed batch 8/201
Processed batch 9/201
Processed batch 10/201
Processed batch 11/201
Processed batch 12/201
Processed batch 13/201
Processed batch 14/201
Processed batch 15/201
Processed batch 16/201
Processed batch 17/201
Processed batch 18/201
Processed batch 19/201
Processed batch 20/201
Processed batch 21/201
Processed batch 22/201
Processed batch 23/201
Processed batch 24/201
Processed batch 25/201
Processed batch 26/201
Processed batch 27/201
Processed batch 28/201
Processed batch 29/201
Processed batch 30/201
Processed batch 31/201
Processed batch 32/201
Processed batch 33/201
Processed batch 34/201
Processed batch 35/201
Processed batch 36/201
Processed batch 37/201
Processed batch 38/201
Processed batch 39/201
Processed batch 40/201
Processed batch 41/201
Processed batch 42/201
Processed batch 43/201
Processed batch 44/2

In [None]:
# prompt: load KnownModelPreds and replace all " with nothing

import pandas as pd

# Load the CSV file
df = pd.read_csv("KnownModelPreds.csv")

# Replace all occurrences of " with nothing
df['Twi'] = df['Twi'].str.replace('"', '')

# Save the modified DataFrame back to the CSV file
df.to_csv('KnownModelPreds.csv', index=False)
