# CODE FOR FINAL PROJECT

**Make sure to have the T4 selected in the `Runtime` -> `Change runtime type` section for GPU measurement, or else, it will use the CPU**

In [1]:
!pip install -q transformers datasets evaluate sentencepiece plotly sacrebleu  # Install all these libs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency reso

---

**Objective: Fine tune a smaller student model using sequence-level distillation from a larger MarianMT teacher model using either the T4 GPU from Google Cloud or locally using the RTX 4070 in Anaconda Prompt**

---

**Focus: Training-speed, BLEU score, memory footprint, and emissions awareness.**

---

In [2]:
import os # Mainly for the RTX 4070 and env config
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import torch
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import time
import pandas as pd # for saving results

# Set computation device (USE A GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load dataset
dataset = load_dataset("opus_books", "de-en") # German-English OPUS Books dataset
split = dataset["train"].train_test_split(test_size=0.1, seed=42) # Split dataset into train/test (90/10)
train_set = split["train"].select(range(1000)) # Use first 1000 examples for training
test_set = split["test"].select(range(200)) # Use first 200 examples for eval

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

In [5]:
# Load teacher model (pretrained MariantMT translation model)
teacher_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-de-en").to(device)
teacher_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



In [6]:
# Clone the teacher model as the init student model
student_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-de-en").to(device)

#### Preprocessing for Training:

In [7]:
# Tokenize the data for the student model
def translate_with_teacher(batch):
    srcs = [item["de"] for item in batch["translation"]] # Extract German source sentences
    inputs = teacher_tokenizer(srcs, return_tensors='pt', padding=True, truncation=True).to(device) # Tokenize and move to device (GPU)

    # Generate translations using the teacher model
    with torch.no_grad():
        output_ids = teacher_model.generate(**inputs, max_length=128)

    # Decode the output token IDs into text
    batch["kd_translation"] = teacher_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return batch

# Apply translation gen across the training set
kd_dataset = train_set.map(translate_with_teacher, batched=True, batch_size=8)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Tokenization for student training
def prepare_for_training(batch):
    srcs = [item["de"] for item in batch["translation"]] # Extract og German inputs
    tgts = batch["kd_translation"] # Extract the KD English Translations
    inputs = teacher_tokenizer(srcs, padding="max_length", truncation=True, max_length=128) # Tokenize the sources
    labels = teacher_tokenizer(tgts, padding="max_length", truncation=True, max_length=128) # Tokenize the teacher-gen translations as training labels
    inputs["labels"] = labels["input_ids"] # Add labels to input batch
    return inputs

# Tokenize and prepare the full dataset for student model fine-tuning
train_data_tokenized = kd_dataset.map(prepare_for_training, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

#### Training Config and Trainer Setup:

In [9]:
training_args = TrainingArguments(
    output_dir="./student_kd", # Where to save model checkpoints
    per_device_train_batch_size=8, # Batch size per GPU device
    num_train_epochs=3, # Train for 3 epochs
    logging_steps=10, # Log training process every 10 steps
    fp16=torch.cuda.is_available() # Use automatic mixed precision
)

# Init HuggingFace trainer for managing training loop
trainer = Trainer(
    model=student_model, # Student model to be trained
    args=training_args, # Training config
    train_dataset=train_data_tokenized, # Preprocessed training dataset
    tokenizer=teacher_tokenizer # Tokenizer for handling inputs/outputs
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


#### Monitor Training Time and Estimate GPU Cost:

In [10]:
start_time = time.time() # Start timer before training
trainer.train() # Fine-tune the student model
end_time = time.time() # End timer after training

# Calc training duration
training_duration = end_time - start_time
training_hours = training_duration / 3600

# Set GPU hourly cost (based on T4 pricing ref)
gpu_cost_per_hour = 0.35

# Estimate the total training cost
gpu_cost = training_hours * gpu_cost_per_hour

print(f"Training time: {training_hours:.2f} hours")
print(f"Estimated GPU cost (T4 @ $0.35/hr): ${gpu_cost:.2f}")

Step,Training Loss
10,8.4274
20,1.5873
30,1.167
40,1.002
50,0.8475
60,0.9183
70,0.8812
80,0.7795
90,0.7387
100,0.6528




Training time: 0.01 hours
Estimated GPU cost (T4 @ $0.35/hr): $0.00


#### Eval Student Model using BLEU Score:

In [11]:
metric = evaluate.load("sacrebleu") # Load SacreBLEU eval metric

# Function to gen student model predictions
def generate_predictions(batch):
    srcs = [item["de"] for item in batch["translation"]] # Extract German source sens
    inputs = teacher_tokenizer(srcs, return_tensors='pt', padding=True, truncation=True).to(device) # Tokenize and move to device

    # Generate translated outputs without gradient calculation
    with torch.no_grad():
        output_ids = student_model.generate(**inputs, max_length=128)

    # Decode the output IDs into text
    batch["predictions"] = teacher_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return batch

# Apply predictions function to test set
predictions = test_set.map(generate_predictions, batched=True, batch_size=8)

# Format reference translations properly
references = [[item["en"]] for item in predictions["translation"]]

# Compute BLEU score between predictions and references
results = metric.compute(predictions=predictions["predictions"], references=references)
print(f"BLEU score of the student model: {results['score']:.2f}")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

BLEU score of the student model: 7.44


#### Generate Reports:

In [12]:
def generate_teacher_predictions(batch):
    srcs = [item["de"] for item in batch["translation"]] # Extract the German source sentences
    inputs = teacher_tokenizer(srcs, return_tensors='pt', padding=True, truncation=True).to(device) # Tkn and move to device

    # Generate translated outputs without gradient tracking
    with torch.no_grad():
        outputs = teacher_model.generate(**inputs, max_length=128)

    # Decode and generate tkn IDs into readable text
    batch["teacher_predictions"] = teacher_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

# Apply teacher prediction function across the test set
teacher_eval_set = test_set.map(generate_teacher_predictions, batched=True, batch_size=8)


#----------------------------------------------------------------#
# COMPUTE BLEU SCORE FOR TEACHER MODEL
#----------------------------------------------------------------#

# Load in the sacreBLEU metric and compute BLEU for teacher model outputs
teacher_bleu = evaluate.load("sacrebleu").compute(
    predictions=teacher_eval_set["teacher_predictions"],
    references=[[item["en"]] for item in teacher_eval_set["translation"]]
)
print(f"Teacher BLEU score: {teacher_bleu['score']:.2f}")



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Teacher BLEU score: 18.10


#### Save Training and Eval Summary Log:

In [15]:
# using google colab lib to download the results/csv:
from google.colab import files
import datetime

In [25]:
with open("results_log_T4.txt", "w") as f:
    f.write("MarianMT Distillation Summary (RTX 4070)\n")
    f.write(f"Date/Time: {datetime.datetime.now()}\n\n")
    f.write(f"GPU Used: {torch.cuda.get_device_name(0)}\n")
    f.write(f"Training Time (hrs): {training_hours:.2f}\n")
    f.write(f"Estimated GPU Cost ($): {gpu_cost:.2f}\n")
    f.write(f"BLEU Score (Teacher): {teacher_bleu['score']:.2f}\n")
    f.write(f"BLEU Score (Student): {results['score']:.2f}\n")

files.download('results_log_T4.txt') # Rename to `results_log_RTX4070` for the RTX test.
print("File downloaded successfully  :D! ")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File downloaded successfully  :D! 


#### Save Detailed Translation Results to CSV:

In [26]:
# Create a Dataframe with source, reference, and prediction texts
df = pd.DataFrame({
    "Source (DE)": [item["de"] for item in predictions["translation"]],
    "Reference (EN)": [item["en"] for item in predictions["translation"]],
    "Prediction (EN)": predictions["predictions"]
})

# save to csv
df.to_csv("translation_results.csv", index=False)

# Use google.colab lib to download:
files.download('translation_results.csv')

print("File downloaded successfully :D ")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File downloaded successfully :D 


#### Save the Final Student Model:

In [24]:
# Save the trained student model weights and tokenizer for later reuse:
student_model.save_pretrained("student_model_t4") # change ".../student_model_x" to RTX4070 if training on the local machine
teacher_tokenizer.save_pretrained("student_model_t4") # rename to RTX4070 if training on Anaconda Prompt

# Download the zip file:
!zip -r student_model_t4.zip student_model_t4/

files.download('student_model_t4.zip')

print("Student model saved to folder: student_model_t4/") # Same comments as above about the naming...

updating: student_model_t4/ (stored 0%)
updating: student_model_t4/model.safetensors (deflated 7%)
updating: student_model_t4/source.spm (deflated 49%)
updating: student_model_t4/tokenizer_config.json (deflated 68%)
updating: student_model_t4/target.spm (deflated 49%)
updating: student_model_t4/special_tokens_map.json (deflated 35%)
updating: student_model_t4/vocab.json (deflated 69%)
updating: student_model_t4/config.json (deflated 62%)
updating: student_model_t4/generation_config.json (deflated 43%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Student model saved to folder: student_model_t4/
