In [None]:
!pip install transformers datasets torch sacrebleu evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import wandb
import torch
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import warnings
from evaluate import load
# Silence every warning category for the rest of the session. The second filter
# is already implied by the first (FutureWarning is a Warning subclass) but is
# kept so the installed filter list stays the same.
warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
# Remove the local Hugging Face transformers cache directory (presumably to force fresh downloads).
# NOTE(review): recent library versions may cache under ~/.cache/huggingface/hub instead —
# confirm this path is still the one in use, otherwise this command has no effect.
!rm -rf ~/.cache/huggingface/transformers

In [None]:
from transformers import MBart50Tokenizer

# Pretrained checkpoint shared by the model and its tokenizer.
model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Tokenizer configured for English source text and Tamil target text.
tokenizer = MBart50Tokenizer.from_pretrained(
    model_name,
    src_lang="en_XX",
    tgt_lang="ta_IN",
)

# OPUS-100 English–Tamil: the first 10% of the training split plus the full validation split.
train_dataset, val_dataset = (
    load_dataset("opus100", "en-ta", split=split_spec)
    for split_spec in ("train[:10%]", "validation")
)

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/164k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/159k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/227014 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of English→Tamil sentence pairs for seq2seq training.

    Args:
        examples: Batch passed by ``Dataset.map(batched=True)``. ``examples["translation"]``
            is a list of dicts with an ``"en"`` (source) and a ``"ta"`` (target) string.
        max_length: Maximum number of tokens per sequence; longer sequences are
            truncated and shorter ones padded to this length. Defaults to 128.

    Returns:
        The tokenizer's encoding of the source sentences, with an additional
        ``"labels"`` entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [pair["en"] for pair in pairs]
    targets = [pair["ta"] for pair in pairs]

    # `text_target=` makes the tokenizer encode the targets in target-language
    # mode, so the labels carry the `tgt_lang` code (ta_IN). Calling
    # `tokenizer(targets)` directly would encode them as *source* text and
    # prefix them with the `src_lang` code (en_XX) instead.
    #
    # NOTE(review): labels are still padded with the tokenizer's pad id, exactly
    # as before, so padded positions presumably still contribute to the loss.
    # Masking them with -100 would also require compute_metrics to map -100 back
    # to the pad id before decoding.
    return tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

# Tokenize both splits in batches with the same preprocessing function.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Collator that assembles tokenized examples into seq2seq batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/22701 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, with an
# effective batch size of 8 * 2 = 16 sequences per optimizer step on one device.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=1000,
    dataloader_num_workers=0,
    # In the recorded run the validation loss was lowest after epoch 2 and rose
    # afterwards while the training loss kept falling. Reload the checkpoint
    # with the lowest validation loss when training ends, so the model saved
    # afterwards is not the most over-fitted one.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Trainer wiring the model, tokenized splits, tokenizer and batch collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Authenticate with Weights & Biases using an API key stored as a Kaggle secret,
# so the key never appears in the notebook source.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Look up the secret stored under the label "wandb_api_key".
my_secret = user_secrets.get_secret("wandb_api_key")
# Last expression of the cell: its return value is shown as the cell output.
wandb.login(key=my_secret)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mctarunvignesh[0m ([33mctarunvignesh-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Fine-tune the model with the Seq2SeqTrainer configured earlier in this notebook.
# The returned TrainOutput is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.0699,0.287528
2,0.4147,0.255152
3,0.2549,0.256275
4,0.1987,0.265105
5,0.1346,0.275482
6,0.1175,0.284511
7,0.0894,0.293655
8,0.0592,0.299262
9,0.0513,0.304934
10,0.0392,0.30753


TrainOutput(global_step=14190, training_loss=0.21499497556115138, metrics={'train_runtime': 20789.6211, 'train_samples_per_second': 10.919, 'train_steps_per_second': 0.683, 'total_flos': 6.149501607739392e+16, 'train_loss': 0.21499497556115138, 'epoch': 10.0})

In [None]:
# Persist the fine-tuned model weights/config and the tokenizer files into two
# separate directories relative to the current working directory.
model.save_pretrained('./en-ta-fine-tuned-v1')
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained('./en-ta-fine-tuned-token-v1')

('./en-ta-fine-tuned-token-v1/tokenizer_config.json',
 './en-ta-fine-tuned-token-v1/special_tokens_map.json',
 './en-ta-fine-tuned-token-v1/sentencepiece.bpe.model',
 './en-ta-fine-tuned-token-v1/added_tokens.json')

In [None]:
!zip -r file.zip /kaggle/working/en-ta-fine-tuned-token-v1 /kaggle/working/en-ta-fine-tuned-v1
!ls
from IPython.display import FileLink
FileLink(r'file.zip')

  adding: kaggle/working/en-ta-fine-tuned-token-v1/ (stored 0%)
  adding: kaggle/working/en-ta-fine-tuned-token-v1/sentencepiece.bpe.model (deflated 49%)
  adding: kaggle/working/en-ta-fine-tuned-token-v1/special_tokens_map.json (deflated 61%)
  adding: kaggle/working/en-ta-fine-tuned-token-v1/tokenizer_config.json (deflated 92%)
  adding: kaggle/working/en-ta-fine-tuned-v1/ (stored 0%)
  adding: kaggle/working/en-ta-fine-tuned-v1/config.json (deflated 60%)
  adding: kaggle/working/en-ta-fine-tuned-v1/model.safetensors (deflated 7%)
  adding: kaggle/working/en-ta-fine-tuned-v1/generation_config.json (deflated 42%)
en-ta-fine-tuned-token-v1  en-ta-fine-tuned-v1	file.zip  logs	results  wandb


In [None]:
def translate_text(text, max_length=128, num_beams=4):
    """Translate a piece of text with the notebook's global model and tokenizer.

    Args:
        text: Source text to translate.
        max_length: Token limit applied both when truncating the input and when
            generating the output. Defaults to 128.
        num_beams: Beam width used for generation. Defaults to 4.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    Side effects:
        Resets the global torch RNG seed to 42, moves the global ``model`` to the
        GPU when one is available (CPU otherwise) and puts it in eval mode.
    """
    torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Make sure dropout is disabled: nothing guarantees the model is still in
    # eval mode when this helper is called.
    model.eval()

    # Keep the whole encoding (not just input_ids) so the attention mask is
    # passed to generate() as well.
    encoded = tokenizer(
        text, return_tensors="pt", max_length=max_length, truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **encoded, max_length=max_length, num_beams=num_beams, early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: translate one English sentence and print the model's output.
text = "Hello, I am under the water. Please help me, here too much paining."
tamil = translate_text(text)
print(tamil)

வணக்கம், நான் தண்ணீர் கீழ் உள்ளேன். தயவு செய்து எனக்கு உதவ, இங்கு மிகவும் பஞ்சம் உள்ளது.


In [None]:
# Held-out evaluation: tokenize the test split with the same preprocessing as
# the other splits, then report the trainer's evaluation metrics on it.
test_dataset = load_dataset(path="opus100", name="en-ta", split="test")
tokenized_test_dataset = test_dataset.map(function=preprocess_function, batched=True)

test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(test_results)

# sacreBLEU scorer loaded through `evaluate`; compute_metrics reads this global.
metric = load(path="sacrebleu")

def compute_metrics(pred):
    """Compute sacreBLEU for a prediction output against its reference labels.

    Args:
        pred: Object exposing ``predictions`` (generated token ids, or a tuple
            whose first element is those ids) and ``label_ids`` (reference
            token ids), such as the value returned by ``trainer.predict``.

    Returns:
        The dictionary returned by the global ``metric.compute``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some models return extra outputs alongside the generated ids.
        pred_ids = pred_ids[0]
    labels_ids = pred.label_ids

    # -100 is the ignore index used for masked label positions and for padding
    # when batches are gathered. It is not a valid token id, so map it to the
    # pad token before decoding (pad tokens are dropped by skip_special_tokens).
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(np.asarray(pred_ids) != -100, pred_ids, pad_id)
    labels_ids = np.where(np.asarray(labels_ids) != -100, labels_ids, pad_id)

    decode_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # sacreBLEU expects a list of references per prediction.
    bleu_score = metric.compute(
        predictions=decode_preds,
        references=[[label] for label in decode_labels],
    )
    return bleu_score

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.3282564580440521, 'eval_runtime': 56.768, 'eval_samples_per_second': 35.231, 'eval_steps_per_second': 4.404, 'epoch': 10.0}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
# Run the trainer's predict() on the tokenized test split; metric keys in the
# result are prefixed with "test".
predictions = trainer.predict(test_dataset=tokenized_test_dataset, metric_key_prefix="test")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [None]:
# Score the test-set predictions with the sacreBLEU-based compute_metrics helper.
metrics = compute_metrics(predictions)

In [None]:
# Display the metric dictionary as the cell output.
metrics

{'score': 15.063667706835597,
 'counts': [6154, 2343, 1140, 631],
 'totals': [13838, 11838, 10184, 8924],
 'precisions': [44.47174447174447,
  19.792194627470856,
  11.194029850746269,
  7.0708202599731065],
 'bp': 0.9271972453065181,
 'sys_len': 13838,
 'ref_len': 14884}

In [None]:
# Record the GPU model, driver version and current memory usage for this run.
!nvidia-smi

Sun Apr  6 10:43:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P0             36W /  250W |   15081MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Same GPU information as selected fields in CSV form, easier to copy into a report.
!nvidia-smi --query-gpu=name,driver_version,memory.total,memory.used,compute_mode,temperature.gpu --format=csv

name, driver_version, memory.total [MiB], memory.used [MiB], compute_mode, temperature.gpu
Tesla P100-PCIE-16GB, 560.35.03, 16384 MiB, 15081 MiB, Default, 52


In [None]:
# Record the CPU architecture and core count of the host.
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   4
  On-line CPU(s) list:    0-3
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   2
    Socket(s):            1
    Stepping:             3
    BogoMIPS:             4000.32
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 

In [None]:
# Record host RAM and swap usage in human-readable units.
!free -h

               total        used        free      shared  buff/cache   available
Mem:            31Gi       2.6Gi       322Mi        16Mi        28Gi        28Gi
Swap:             0B          0B          0B


In [None]:
# Record disk usage of the mounted filesystems in human-readable units.
!df -h

Filesystem        Size  Used Avail Use% Mounted on
overlay           7.9T  6.1T  1.9T  77% /
tmpfs              64M     0   64M   0% /dev
shm                14G  4.0K   14G   1% /dev/shm
/dev/sdb1         122G   28G   94G  23% /opt/bin
/dev/loop1         20G  6.7G   13G  35% /kaggle/lib
/dev/mapper/snap  7.9T  6.1T  1.9T  77% /etc/hosts
tmpfs              16G     0   16G   0% /proc/acpi
tmpfs              16G     0   16G   0% /proc/scsi
tmpfs              16G     0   16G   0% /sys/firmware


In [None]:
import torch
import transformers
import datasets
import evaluate
import wandb

# Report the library versions and CUDA availability used for this run, one per line.
print(
    "\n".join(
        f"{label} {value}"
        for label, value in (
            ("PyTorch Version:", torch.__version__),
            ("CUDA Available:", torch.cuda.is_available()),
            ("CUDA Version:", torch.version.cuda),
            ("Transformers Version:", transformers.__version__),
            ("Evaluate Version:", evaluate.__version__),
        )
    )
)

PyTorch Version: 2.5.1+cu121
CUDA Available: True
CUDA Version: 12.1
Transformers Version: 4.47.0
Evaluate Version: 0.4.3


In [None]:
# Switch the model to evaluation mode; the returned module is echoed as the cell output.
model.eval()

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=