In [49]:
# Cell 1: Import necessary libraries and set device
import re
import torch
import torchaudio
from datasets import load_dataset, Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
from torch.utils.data import DataLoader
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [50]:
import tempfile
import zipfile
import os
from datasets import load_from_disk

zip_path = "small_validation_set.zip"
with tempfile.TemporaryDirectory() as tmpdirname:
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(tmpdirname)
    
    # List the contents to determine the correct path
    extracted_items = os.listdir(tmpdirname)
    print("Extracted items:", extracted_items)
    
    # If you see a folder like "small_validation_set", use that folder
    dataset_path = os.path.join(tmpdirname, "small_validation_set")
    small_validation_set = load_from_disk(dataset_path)
    # Now use small_validation_set as needed


Extracted items: ['small_validation_set']


In [51]:
small_validation_set

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 100
})

In [52]:
# Cell 7: Define data collator without moving tensors to device
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Process audio
        input_features = [self.processor(feature["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features[0] for feature in features]
        # Process labels
        labels = [self.processor.tokenizer(feature["text"]).input_ids for feature in features]

        # Pad inputs and labels
        input_features = torch.stack(input_features)
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l) for l in labels],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )

        # Replace padding token id's of the labels by -100 so they are ignored in the loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        batch = {
            "input_features": input_features,  # Do not move to device here
            "labels": labels,                  # Do not move to device here
        }
        return batch


In [53]:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

In [54]:
# Cell 8: Create DataLoaders with pin_memory=True
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=data_collator, pin_memory=True)


In [55]:
import evaluate
import numpy as np

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Convert to numpy arrays and move to CPU
    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.cpu().numpy()
    else:
        pred_ids = np.array(pred_ids)

    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.cpu().numpy()
    else:
        label_ids = np.array(label_ids)

    # Replace -100 with the pad token ID
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER and CER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}



In [56]:
# Import necessary libraries
import torch
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.auto import tqdm
small_val_dataset = small_validation_set
# Step 1: Set device and clear GPU cache
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the fine-tuned Whisper model on GPU
model1 = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-best-model/checkpoint-22832").to(device) #change

# Step 3: Define evaluation training arguments
eval_training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-best-model",
    per_device_eval_batch_size=1,      # Keep batch size low to manage GPU memory
    dataloader_num_workers=0,          # Set to 0 to simplify I/O operations
    remove_unused_columns=False,
    predict_with_generate=True,
    fp16=False,                        # Disable fp16 for stability during evaluation
    evaluation_strategy="no",
    disable_tqdm=False,
    logging_dir="./logs-eval",
    logging_steps=10,
    report_to="none"
)

# Step 4: Initialize Trainer
trainer1 = Seq2SeqTrainer(
    model=model1,
    args=eval_training_args,
    eval_dataset=small_val_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Perform Evaluation with Debug Print Statements
torch.cuda.empty_cache()  # Clear GPU cache
print("Starting evaluation...")

try:
    eval_results = trainer1.evaluate()
    print(f"Final WER: {eval_results['eval_wer']}")
    print(f"Final CER: {eval_results['eval_cer']}")
except RuntimeError as e:
    print(f"Runtime error during evaluation: {e}")
    torch.cuda.empty_cache()


Using device: cuda


  trainer1 = Seq2SeqTrainer(


Starting evaluation...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/100 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Final WER: 0.20552519732847602
Final CER: 0.057348686822848716


In [57]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the base Whisper model and processor
processor_base = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model_base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")


In [58]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_base.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [59]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Process audio
        input_features = [self.processor(
            feature["audio"]["array"], sampling_rate=16000, return_tensors="pt"
        ).input_features[0] for feature in features]

        # Process labels
        labels = [self.processor.tokenizer(feature["text"]).input_ids for feature in features]

        # Pad inputs and labels
        input_features = torch.stack(input_features)
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l) for l in labels],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )

        # Replace padding token id's of the labels by -100 so they are ignored in the loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        batch = {
            "input_features": input_features,
            "labels": labels,
        }
        return batch


In [60]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor_base)


In [61]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

eval_training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-base-eval",
    per_device_eval_batch_size=1,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    predict_with_generate=True,
    fp16=False,  # Set to True if using GPU with enough memory
    evaluation_strategy="no",
)


In [62]:
trainer_base = Seq2SeqTrainer(
    model=model_base,
    args=eval_training_args,
    eval_dataset=small_val_dataset,
    tokenizer=processor_base.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer_base = Seq2SeqTrainer(


In [63]:
eval_results_base = trainer_base.evaluate()
print(f"Base Model WER: {eval_results_base['eval_wer']}")
print(f"Base Model CER: {eval_results_base['eval_cer']}")


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/100 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Base Model WER: 0.25561627200971465
Base Model CER: 0.06937432639400987


In [64]:
# =========================
# ADDITIONAL CODE CELL
# =========================

import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import evaluate

# Reuse the same WER/CER metrics you already loaded:
wer_metric_single = evaluate.load("wer")
cer_metric_single = evaluate.load("cer")

# Helper function to get a single transcription from a model
def transcribe_sample(model, processor, sample, device):
    """
    Runs a forward pass on 'sample' (which should contain 'audio' and 'text'),
    and returns the decoded prediction string.
    """
    # 1) Prepare the input features
    inputs = processor(
        sample["audio"]["array"], 
        sampling_rate=16000, 
        return_tensors="pt"
    ).to(device)

    # 2) Generate
    with torch.no_grad():
        predicted_ids = model.generate(inputs.input_features)
    # 3) Decode
    transcription = processor.tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription

# We'll store results here
records = []

print("Performing sample-by-sample evaluation for base & fine-tuned models...")

# Loop through each sample in the small_validation_set
for i in tqdm(range(len(small_validation_set))):
    sample = small_validation_set[i]  # single sample dict

    reference_text = sample["text"]

    # Get base model prediction
    base_pred = transcribe_sample(model_base, processor_base, sample, device)

    # Get fine-tuned model prediction
    ft_pred = transcribe_sample(model1, processor, sample, device)

    # Compute WER & CER for each model on this single sample
    wer_base = wer_metric_single.compute(predictions=[base_pred], references=[reference_text])
    cer_base = cer_metric_single.compute(predictions=[base_pred], references=[reference_text])

    wer_ft = wer_metric_single.compute(predictions=[ft_pred], references=[reference_text])
    cer_ft = cer_metric_single.compute(predictions=[ft_pred], references=[reference_text])

    # Store in a list of dicts
    records.append({
        "sample_idx": i,
        "reference": reference_text,
        "base_pred": base_pred,
        "ft_pred": ft_pred,
        "wer_base": wer_base,
        "cer_base": cer_base,
        "wer_ft": wer_ft,
        "cer_ft": cer_ft
    })

# Convert to DataFrame
df = pd.DataFrame(records)

print("\nOverall, we have {} samples.".format(len(df)))

# Filter for samples where base model had an error (WER>0) 



Performing sample-by-sample evaluation for base & fine-tuned models...


  0%|          | 0/100 [00:00<?, ?it/s]


Overall, we have 100 samples.


In [65]:
# but fine-tuned model got it perfect (WER=0)
df_improved = df[(df["wer_base"] > df["wer_ft"])]
print("Number of samples improved:", len(df_improved))

# Display the top rows
display_columns = ["sample_idx", "reference", "base_pred", "ft_pred", "wer_base", "wer_ft"]
df_improved[display_columns].head(10)

Number of samples improved: 60


Unnamed: 0,sample_idx,reference,base_pred,ft_pred,wer_base,wer_ft
1,1,and fancied his countenance was not altogether...,And fancy it his countenance was not all toge...,and fancy it his countenance was now altogeth...,0.351351,0.297297
3,3,you've been jawin like a lot a old hens,You've been joined like a lot of old hands.,you've been drawing like a lot of old hands,0.444444,0.333333
4,4,let us finally confess it that what is most di...,Let us finally confess it that what is most d...,Let us finally confess it that what is most d...,0.129032,0.096774
6,6,the tireless machines marched back and forth a...,The tireless machines march back and forth ac...,The tireless machines march back and forth ac...,0.142857,0.071429
7,7,before the middle of the day they were visited...,Before the middle of the day there were visit...,Before the middle of the day there were visit...,0.34,0.3
9,9,alarmed but not discouraged she tried it anoth...,alarmed but not discouraged. She tried it ano...,"Alarmed but not discouraged, she tried it ano...",0.212121,0.151515
11,11,but at least it was obvious that some one must...,But at least it was obvious that someone must...,but at least it was obvious that someone must...,0.222222,0.155556
13,13,they began to fell trees for the timbers of th...,They began to felt trees for the timbers of t...,They began to fell trees for the timbers of t...,0.2,0.15
14,14,it's a technical problem of the exigencies of ...,It's a technical problem of the exigenesis of...,It's a technical problem of the exigenacies o...,0.425,0.375
15,15,and in their fury the women fell upon him deal...,"and in their fury the women fell upon him, de...",and in their fury the women fell upon him dea...,0.225,0.05


In [66]:
df_improved2 = df[(df["cer_base"] > df["cer_ft"])]
print("Number of samples improved:", len(df_improved))

# Display the top rows
display_columns = ["sample_idx", "reference", "base_pred", "ft_pred", "cer_base", "cer_ft"]
df_improved2[display_columns].head(10)


Number of samples improved: 60


Unnamed: 0,sample_idx,reference,base_pred,ft_pred,cer_base,cer_ft
1,1,and fancied his countenance was not altogether...,And fancy it his countenance was not all toge...,and fancy it his countenance was now altogeth...,0.086294,0.076142
2,2,the weirdness in which milly's absence had lef...,the weirdness in which Millie's absence had l...,the weirdness of which millies absence had le...,0.0625,0.057292
3,3,you've been jawin like a lot a old hens,You've been joined like a lot of old hands.,you've been drawing like a lot of old hands,0.25641,0.179487
4,4,let us finally confess it that what is most di...,Let us finally confess it that what is most d...,Let us finally confess it that what is most d...,0.022857,0.017143
6,6,the tireless machines marched back and forth a...,The tireless machines march back and forth ac...,The tireless machines march back and forth ac...,0.028926,0.016529
7,7,before the middle of the day they were visited...,Before the middle of the day there were visit...,Before the middle of the day there were visit...,0.110672,0.083004
9,9,alarmed but not discouraged she tried it anoth...,alarmed but not discouraged. She tried it ano...,"Alarmed but not discouraged, she tried it ano...",0.035176,0.025126
11,11,but at least it was obvious that some one must...,But at least it was obvious that someone must...,but at least it was obvious that someone must...,0.046083,0.036866
14,14,it's a technical problem of the exigencies of ...,It's a technical problem of the exigenesis of...,It's a technical problem of the exigenacies o...,0.102222,0.088889
15,15,and in their fury the women fell upon him deal...,"and in their fury the women fell upon him, de...",and in their fury the women fell upon him dea...,0.05,0.015


In [67]:
df_improved.values[0]

array([1,
       'and fancied his countenance was not altogether unknown to me i asked him some questions concerning his family and his country but all the answers i could get were sighs and tears i took pity on him',
       ' And fancy it his countenance was not all together unknown to me. I asked him some questions concerning his family and his country, but all the answers I could get were size and tears. I took pity on him.',
       ' and fancy it his countenance was now altogether unknown to me. I asked him some questions concerning his family and his country, but all the answers I could get were size and tears. I took pity on him,',
       0.35135135135135137, 0.08629441624365482, 0.2972972972972973,
       0.07614213197969544], dtype=object)

In [68]:
df.to_csv('whisper_ft_results.csv')

In [69]:
from transformers import MarianMTModel

# Load the fine-tuned MarianMT model from the local directory
model = MarianMTModel.from_pretrained('./MarianMT Finetuned')

# Verify the model has been loaded correctly
print(model.config)


MarianConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "./MarianMT Finetuned",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  }

In [70]:
# Cell 1: Load the OpenSubtitles English-French Dataset
from datasets import load_dataset
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
# Load the dataset
dataset = load_dataset("opus100", "en-fr")

# Inspect the dataset
print(dataset)


DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [71]:
# Cell 2: Preprocess the Data
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

# Define source and target languages
SRC_LANG = "en"
TGT_LANG = "fr"

# Preprocessing function
def preprocess_function(examples):
    inputs = [ex[SRC_LANG] for ex in examples['translation']]
    targets = [ex[TGT_LANG] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)




In [72]:
# Cell 3: Set Up Evaluation Metrics
import evaluate

# Load BLEU metric
bleu = evaluate.load('sacrebleu')

# Metric computation function
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]
    result = bleu.compute(predictions=decoded_preds, references=labels)
    return {"bleu": result["score"]}


In [73]:
# Cell 5: Evaluate the Base Model with GPU


# Check if GPU is available


# Split the data
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    dataloader_pin_memory=True,  # Ensures better performance with GPUs
)
small_train_dataset = train_dataset.shuffle(seed=42).select(range(30000))
small_eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))
# Use a data collator to handle padding during evaluation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model.to(device),  # Send the model to GPU
    args=training_args,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the base model
base_metrics = trainer.evaluate()
print(f"Base Model BLEU Score: {base_metrics['eval_bleu']:.2f}")


  trainer = Seq2SeqTrainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/500 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Base Model BLEU Score: 40.38


In [75]:
model1 = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr')


In [76]:
# Initialize the trainer
trainer1 = Seq2SeqTrainer(
    model=model1.to(device),  # Send the model to GPU
    args=training_args,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the base model
base_metrics = trainer1.evaluate()
print(f"Base Model BLEU Score: {base_metrics['eval_bleu']:.2f}")

  trainer1 = Seq2SeqTrainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/500 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Base Model BLEU Score: 38.54


In [None]:
base_metrics = trainer.evaluate()
print(f"Base Model BLEU Score: {base_metrics['eval_bleu']:.2f}")


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/500 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Base Model BLEU Score: 40.38


2025/02/17 01:45:35 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 09566bb44fa04b1bb75ffd256645bfbc: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='./whisper-base-eval\runs\Feb17_01-30-23_LAPTOP-OIPDEFHU' for run ID='09566bb44fa04b1bb75ffd256645bfbc'. Attempted logging new value './results\runs\Feb17_01-32-55_LAPTOP-OIPDEFHU'.


In [79]:
import torch
from transformers import MarianMTModel, MarianTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Base model (not fine-tuned)
model_base = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr").to(device)
tokenizer_base = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

# Fine-tuned model
model_ft = MarianMTModel.from_pretrained("./MarianMT Finetuned").to(device)
# Reuse the same tokenizer or load separately if you used a different tokenizer
tokenizer_ft = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")


Using device: cuda


In [80]:
def translate_text(model, tokenizer, src_text: str, device) -> str:
    """
    Translate a single English string to French using the given model & tokenizer.
    Returns the decoded French string.
    """
    # Tokenize the source text
    inputs = tokenizer(src_text, return_tensors="pt", max_length=128, truncation=True)
    inputs = inputs.to(device)

    # Generate translation
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
    
    # Decode to a string
    translated_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return translated_str


In [81]:
import pandas as pd
from tqdm.auto import tqdm
import evaluate

# We'll use sacrebleu to compute sentence-level BLEU
sacrebleu_single = evaluate.load("sacrebleu")

records = []

for i in tqdm(range(len(small_eval_dataset))):
    sample = small_eval_dataset[i]

    # 1) Extract source (English) and reference (French)
    src_text = sample["translation"]["en"]
    ref_text = sample["translation"]["fr"]

    # 2) Get base model prediction
    base_pred = translate_text(model_base, tokenizer_base, src_text, device)

    # 3) Get fine-tuned model prediction
    ft_pred = translate_text(model_ft, tokenizer_ft, src_text, device)

    # 4) Compute BLEU for each prediction on this single sentence
    # sacrebleu expects: predictions=[str], references=[[str]]
    base_bleu_dict = sacrebleu_single.compute(predictions=[base_pred], references=[[ref_text]])
    ft_bleu_dict   = sacrebleu_single.compute(predictions=[ft_pred],   references=[[ref_text]])

    base_bleu = base_bleu_dict["score"]
    ft_bleu   = ft_bleu_dict["score"]

    # 5) Store results
    records.append({
        "sample_idx": i,
        "src_text": src_text,
        "reference_fr": ref_text,
        "base_pred": base_pred,
        "ft_pred": ft_pred,
        "bleu_base": base_bleu,
        "bleu_ft": ft_bleu
    })

# Convert to a DataFrame
df = pd.DataFrame(records)
print(f"Total samples analyzed: {len(df)}")


  0%|          | 0/1000 [00:00<?, ?it/s]

Total samples analyzed: 1000


In [82]:
df_improved = df[df["bleu_ft"] > df["bleu_base"]]
print("Number of samples improved:", len(df_improved))

# Show the top 10 improved examples
print(df_improved[["sample_idx", "src_text", "reference_fr", "base_pred", "ft_pred", "bleu_base", "bleu_ft"]].head(10))


Number of samples improved: 166
    sample_idx                                           src_text  \
6            6  PCT/GL/ISPE/1 Page 174 Chapter 19 Examination ...   
11          11                   - Makes your eyes wanna tear up.   
19          19  If we are to escape from France, we must have ...   
25          25  “In its decision 2004/128, the Commission on H...   
27          27                                          38 500 ð.   
31          31                                          Hey, you.   
49          49  Country office reports indicate a remarkably r...   
57          57  @EhCherif: Live shots fired by snipers who kil...   
59          59  Our nostalgic postcard range now numbers over ...   
64          64  Acknowledgement The Chair of the Strategic Opt...   

                                         reference_fr  \
6   PCT/GL/ISPE/1 Page 199 Chapitre 19 Procédure d...   
11                                 - Ça fait pleurer.   
19  Si nous voulons fuir la France, n

In [83]:
df_improved

Unnamed: 0,sample_idx,src_text,reference_fr,base_pred,ft_pred,bleu_base,bleu_ft
6,6,PCT/GL/ISPE/1 Page 174 Chapter 19 Examination ...,PCT/GL/ISPE/1 Page 199 Chapitre 19 Procédure d...,PCT/GL/ISPE/1 Page 174 Chapitre 19 Procédure d...,PCT/GL/ISPE/1 Page 174 Chapitre 19 Procédure d...,46.334186,52.512434
11,11,- Makes your eyes wanna tear up.,- Ça fait pleurer.,- Tes yeux veulent se déchirer.,- Ça te donne envie de te déchirer les yeux.,7.809850,8.295194
19,19,"If we are to escape from France, we must have ...","Si nous voulons fuir la France, nous devons av...","Si nous voulons échapper à la France, nous dev...","Si nous voulons fuir la France, nous devons av...",69.975223,100.000000
25,25,"“In its decision 2004/128, the Commission on H...","“In its decision 2004/128, the Commission on H...","Dans sa décision 2004/128, la Commission des d...","«Dans sa décision 2004/128, la Commission des ...",8.103715,9.242725
27,27,38 500 ð.,35 000 ð.,38 500 ð. . . . . . . . . . . . . . . . . . . ...,38 500 ð.,0.207668,31.947155
...,...,...,...,...,...,...,...
973,973,I object.,Objection.,Je m'oppose.,Je m'y oppose.,0.000000,15.973578
978,978,"In 1866, a particularly harsh famine killed mo...",En 1866 une famine particulière­ment sévère em...,"En 1866, une famine particulièrement dure a tu...","En 1866, une famine particulièrement dure tua ...",40.016016,43.242271
984,984,European Day of languages 26 September 2008 Ex...,Journée européenne des langues vendredi 26 sep...,Journée européenne des langues vendredi 26 sep...,Journée européenne des langues vendredi 26 sep...,46.880117,78.035623
994,994,Pepper spray?,- Une bombe lacrymo ?,Un vaporisateur de poivre ?,Spray de poivre ?,10.682175,12.440235


In [84]:
df.to_csv('marian_ft_results.csv')