<a href="https://colab.research.google.com/github/prupat/LLMs/blob/main/Fine_Tuning_Class_Project_Part_1_PB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this project is to create an **English-to-French translation bot**. To do so, I fine-tune the **"Helsinki-NLP/opus-mt-en-fr"** model on the **"PaulineSanchez/Translation_words_and_sentences_english_french"** dataset from the Hugging Face Hub.

In [None]:
# Attach Google Drive to the Colab VM; the paths used later in this notebook
# live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**1. FINE-TUNING SECTION**

In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [None]:
# Use the %pip magic (rather than relying on automagic for a bare `pip`) so the
# packages are installed into the environment of the running kernel.
%pip install -q sentencepiece sacremoses



In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pre-trained English->French Marian checkpoint and its matching tokenizer.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)  # move the weights to the selected device

# Parallel corpus stored on Drive as a CSV file (read as ISO-8859-1).
csv_path = "/content/drive/MyDrive/Colab Notebooks/clean_eng_french.csv"
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# One Python list per language, aligned row by row.
source_texts, target_texts = (
    df[column].tolist()
    for column in ("English words/sentences", "French words/sentences")
)

# Custom dataset class
class CustomDataset(Dataset):
    """Pairs of source/target sentences, tokenized lazily one item at a time.

    Args:
        source_texts: Sequence of source-language strings.
        target_texts: Sequence of target-language strings, aligned with
            ``source_texts`` index by index.
        tokenizer: Callable tokenizer. It is called with the source string as
            the positional ``text`` argument and with the target string as the
            ``text_target`` keyword argument.

    Raises:
        ValueError: If the two text sequences have different lengths.
    """

    def __init__(self, source_texts, target_texts, tokenizer):
        # Misaligned lists would otherwise fail late (IndexError) or silently
        # drop the extra targets, because __len__ only looks at the sources.
        if len(source_texts) != len(target_texts):
            raise ValueError(
                "source_texts and target_texts must have the same length, "
                f"got {len(source_texts)} and {len(target_texts)}"
            )
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_texts)

    def __getitem__(self, idx):
        """Return the raw texts and their tokenized forms for pair ``idx``."""
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]

        source_inputs = self.tokenizer(
            source_text, return_tensors="pt", padding=True, truncation=True
        )
        # Targets must go through `text_target` so the tokenizer encodes them
        # as target-language text instead of treating them as source text.
        target_inputs = self.tokenizer(
            text_target=target_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        return {
            "source_text": source_text,
            "target_text": target_text,
            "source_inputs": source_inputs,
            "target_inputs": target_inputs,
        }

# Custom collate function
def collate_fn(batch, pad_token_id=None):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: List of items, each a dict with "source_text", "target_text",
            and "source_inputs"/"target_inputs" mappings whose "input_ids"
            tensor has a leading dimension of size 1.
        pad_token_id: Id used to pad shorter sequences. When omitted, the
            notebook-level ``tokenizer.pad_token_id`` is used.

    Returns:
        Dict with the raw texts as lists, the padded source ids together with
        an attention mask (1 = real token, 0 = padding), and the padded
        target ids.
    """
    if pad_token_id is None:
        # Fall back to the tokenizer defined at notebook level.
        pad_token_id = tokenizer.pad_token_id

    source_ids = [item["source_inputs"]["input_ids"].squeeze(0) for item in batch]
    target_ids = [item["target_inputs"]["input_ids"].squeeze(0) for item in batch]

    # pad_sequence pads with 0 unless told otherwise; pad with the real pad id
    # so padding cannot be confused with whatever token owns id 0.
    source_inputs = pad_sequence(source_ids, batch_first=True, padding_value=pad_token_id)
    target_inputs = pad_sequence(target_ids, batch_first=True, padding_value=pad_token_id)

    # Build the mask from the unpadded lengths so it stays correct even if a
    # real token happens to share the pad id.
    attention_mask = pad_sequence(
        [torch.ones_like(ids) for ids in source_ids],
        batch_first=True,
        padding_value=0,
    )

    return {
        "source_text": [item["source_text"] for item in batch],
        "target_text": [item["target_text"] for item in batch],
        "source_inputs": {"input_ids": source_inputs, "attention_mask": attention_mask},
        "target_inputs": {"input_ids": target_inputs},
    }

# Hyper-parameters for the fine-tuning run.
batch_size = 4
num_epochs, learning_rate = 3, 5e-5

# Wrap the sentence pairs in a dataset and batch them with the custom collate
# function, reshuffling the pairs on every pass.
dataset = CustomDataset(
    source_texts=source_texts,
    target_texts=target_texts,
    tokenizer=tokenizer,
)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=batch_size,
)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
# Cross-entropy that skips positions holding the tokenizer's pad id.
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Fine-tuning loop: one full pass over the dataloader per epoch, with the mean
# batch loss reported once at the end of each epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # The batch loop is nested inside the epoch loop so the data is actually
    # iterated num_epochs times.
    for batch in dataloader:
        source_inputs = batch["source_inputs"]["input_ids"].to(device)
        target_inputs = batch["target_inputs"]["input_ids"].to(device)

        # Use the attention mask when the batch carries one, so the encoder
        # does not attend to padding.
        attention_mask = batch["source_inputs"].get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # Positions labelled -100 are ignored by the model's built-in loss;
        # mask pad positions so they do not contribute to it.
        labels = target_inputs.clone()
        labels[labels == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()
        outputs = model(
            input_ids=source_inputs,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report once per epoch; max() guards against an empty dataloader.
    average_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

# Write the fine-tuned weights and the tokenizer files into the same Drive
# directory.
output_model_dir = "/content/drive/MyDrive/Colab Notebooks/fine_tuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_model_dir)

print("Fine-tuning complete. Model saved to:", output_model_dir)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 3/3, Loss: 0.44799310667981196
Epoch 3/3, Loss: 0.44799875606447714
Epoch 3/3, Loss: 0.44800827062253884
Epoch 3/3, Loss: 0.44802077738644125
Epoch 3/3, Loss: 0.44803079363978193
Epoch 3/3, Loss: 0.4480354667116062
Epoch 3/3, Loss: 0.44804211704832797
Epoch 3/3, Loss: 0.44805349520114596
Epoch 3/3, Loss: 0.44805726455687894
Epoch 3/3, Loss: 0.4480622845101963
Epoch 3/3, Loss: 0.4480845116152076
Epoch 3/3, Loss: 0.4480978775244185
Epoch 3/3, Loss: 0.448104895241657
Epoch 3/3, Loss: 0.4481216351802129
Epoch 3/3, Loss: 0.4481270614523499
Epoch 3/3, Loss: 0.4481462772643501
Epoch 3/3, Loss: 0.4481538807975302
Epoch 3/3, Loss: 0.4481742168431661
Epoch 3/3, Loss: 0.4481846150557663
Epoch 3/3, Loss: 0.4481956436642545
Epoch 3/3, Loss: 0.4481994016796832
Epoch 3/3, Loss: 0.4482070699734877
Epoch 3/3, Loss: 0.44820970570317226
Epoch 3/3, Loss: 0.4482159375034526
Epoch 3/3, Loss: 0.4482300506604533
Epoch 3/3, Loss: 0.44823320