In [1]:
!pip install transformers datasets torch sacrebleu evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import wandb
import torch
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import warnings
from evaluate import load
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
!rm -rf ~/.cache/huggingface/transformers

In [4]:
model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)
from transformers import MBart50Tokenizer
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX", tgt_lang="hi_IN")

train_dataset = load_dataset("opus100", "en-hi", split="train[:10%]")
val_dataset = load_dataset("opus100", "en-hi", split="validation")

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/65.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/534319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
def preprocess_function(examples):
    inputs = [ex['en'] for ex in examples["translation"]]
    targets = [ex['hi'] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/53432 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=1000,
    dataloader_num_workers=0
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [7]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mctarunvignesh[0m ([33mctarunvignesh-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3346,0.348037
2,0.2445,0.336834
3,0.1712,0.354283
4,0.1028,0.391814


TrainOutput(global_step=16695, training_loss=0.24042253337173486, metrics={'train_runtime': 24060.8199, 'train_samples_per_second': 11.104, 'train_steps_per_second': 0.694, 'total_flos': 7.23517944323113e+16, 'train_loss': 0.24042253337173486, 'epoch': 4.998652492888157})

In [10]:
model.save_pretrained('./en-hi-fine-tuned-v1')
tokenizer.save_pretrained('./en-hi-fine-tuned-token-v1')

('./en-hi-fine-tuned-token-v1/tokenizer_config.json',
 './en-hi-fine-tuned-token-v1/special_tokens_map.json',
 './en-hi-fine-tuned-token-v1/sentencepiece.bpe.model',
 './en-hi-fine-tuned-token-v1/added_tokens.json')

In [17]:
!zip -r file.zip /kaggle/working/en-hi-fine-tuned-token-v1 /kaggle/working/en-hi-fine-tuned-v1
!ls
from IPython.display import FileLink
FileLink(r'file.zip')

  adding: kaggle/working/en-hi-fine-tuned-token-v1/ (stored 0%)
  adding: kaggle/working/en-hi-fine-tuned-token-v1/special_tokens_map.json (deflated 61%)
  adding: kaggle/working/en-hi-fine-tuned-token-v1/tokenizer_config.json (deflated 92%)
  adding: kaggle/working/en-hi-fine-tuned-token-v1/sentencepiece.bpe.model (deflated 49%)
  adding: kaggle/working/en-hi-fine-tuned-v1/ (stored 0%)
  adding: kaggle/working/en-hi-fine-tuned-v1/generation_config.json (deflated 42%)
  adding: kaggle/working/en-hi-fine-tuned-v1/config.json (deflated 60%)
  adding: kaggle/working/en-hi-fine-tuned-v1/model.safetensors (deflated 7%)
en-hi-fine-tuned-token-v1  en-hi-fine-tuned-v1	file.zip  logs	results  wandb


In [18]:
def translate_text(text):
    torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
    outputs = model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return translated_text

text = "This is working!"
hindi = translate_text(text)
print(hindi)

यह काम कर रहा है!


In [19]:
test_dataset = load_dataset("opus100", "en-hi", split="test")
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(test_results)

metric = load("sacrebleu")

def compute_metrics(pred):
  labels_ids = pred.label_ids
  pred_ids = pred.predictions

  decode_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  decode_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  bleu_score = metric.compute(predictions=decode_preds, references=[[label] for label in decode_labels])
  return bleu_score

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.4219711422920227, 'eval_runtime': 56.7553, 'eval_samples_per_second': 35.239, 'eval_steps_per_second': 4.405, 'epoch': 4.998652492888157}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [20]:
predictions = trainer.predict(test_dataset=tokenized_test_dataset, metric_key_prefix="test")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [21]:
metrics = compute_metrics(predictions)

In [22]:
metrics

{'score': 16.617376561241933,
 'counts': [14466, 6234, 3289, 1832],
 'totals': [30210, 28210, 26320, 24547],
 'precisions': [47.88480635551142,
  22.098546614675648,
  12.496200607902736,
  7.463233796390598],
 'bp': 0.9375530279261092,
 'sys_len': 30210,
 'ref_len': 32158}