In [1]:
!pip install transformers datasets torch sacrebleu evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import wandb
import torch
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import warnings
from evaluate import load
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
!rm -rf ~/.cache/huggingface/transformers

In [4]:
model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)
from transformers import MBart50Tokenizer
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="ta_IN", tgt_lang="en_XX")

train_dataset = load_dataset("opus100", "en-ta", split="train[:10%]")
val_dataset = load_dataset("opus100", "en-ta", split="validation")

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/164k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/159k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/227014 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
def preprocess_function(examples):
    inputs = [ex['ta'] for ex in examples["translation"]]
    targets = [ex['en'] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/22701 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=1000,
    dataloader_num_workers=0
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [7]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mctarunvignesh[0m ([33mctarunvignesh-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.9574,0.182863
2,0.2863,0.174108
3,0.1924,0.179124
4,0.157,0.187566
5,0.1147,0.206644
6,0.0887,0.217495
7,0.077,0.229903
8,0.0518,0.238686
9,0.0457,0.246079
10,0.0345,0.249718


TrainOutput(global_step=14190, training_loss=0.1758386116619258, metrics={'train_runtime': 20737.4946, 'train_samples_per_second': 10.947, 'train_steps_per_second': 0.684, 'total_flos': 6.149501607739392e+16, 'train_loss': 0.1758386116619258, 'epoch': 10.0})

In [13]:
model.save_pretrained('./ta-en-fine-tuned-v1')
tokenizer.save_pretrained('./ta-en-fine-tuned-token-v1')

('./ta-en-fine-tuned-token-v1/tokenizer_config.json',
 './ta-en-fine-tuned-token-v1/special_tokens_map.json',
 './ta-en-fine-tuned-token-v1/sentencepiece.bpe.model',
 './ta-en-fine-tuned-token-v1/added_tokens.json')

In [15]:
!zip -r file.zip /kaggle/working/ta-en-fine-tuned-token-v1 /kaggle/working/ta-en-fine-tuned-v1
from IPython.display import FileLink
FileLink(r'file.zip')

  adding: kaggle/working/ta-en-fine-tuned-token-v1/ (stored 0%)
  adding: kaggle/working/ta-en-fine-tuned-token-v1/sentencepiece.bpe.model (deflated 49%)
  adding: kaggle/working/ta-en-fine-tuned-token-v1/tokenizer_config.json (deflated 92%)
  adding: kaggle/working/ta-en-fine-tuned-token-v1/special_tokens_map.json (deflated 61%)
  adding: kaggle/working/ta-en-fine-tuned-v1/ (stored 0%)
  adding: kaggle/working/ta-en-fine-tuned-v1/model.safetensors (deflated 7%)
  adding: kaggle/working/ta-en-fine-tuned-v1/config.json (deflated 60%)
  adding: kaggle/working/ta-en-fine-tuned-v1/generation_config.json (deflated 42%)


In [24]:
def translate_text(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
    outputs = model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return translated_text

text = "நீங்கள் என்ன எதிர்பார்த்தீர்கள்?"
english = translate_text(text)
print(english)

What are you expecting?


In [25]:
test_dataset = load_dataset("opus100", "en-ta", split="test")
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(test_results)

metric = load("sacrebleu")

def compute_metrics(pred):
  labels_ids = pred.label_ids
  pred_ids = pred.predictions

  decode_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  decode_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  bleu_score = metric.compute(predictions=decode_preds, references=[[label] for label in decode_labels])
  return bleu_score

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.26493462920188904, 'eval_runtime': 56.4175, 'eval_samples_per_second': 35.45, 'eval_steps_per_second': 4.431, 'epoch': 10.0}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [26]:
predictions = trainer.predict(test_dataset=tokenized_test_dataset, metric_key_prefix="test")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [27]:
metrics = compute_metrics(predictions)

In [28]:
metrics

{'score': 21.72636938840862,
 'counts': [7861, 3645, 2034, 1235],
 'totals': [15952, 13952, 12288, 10986],
 'precisions': [49.27908726178536,
  26.125286697247706,
  16.552734375,
  11.241580192972874],
 'bp': 0.9820462730048332,
 'sys_len': 15952,
 'ref_len': 16241}