<a href="https://colab.research.google.com/github/shouvikcirca/LLMs/blob/main/HuggingFace_MachineTranslation_hi_to_en_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True re-mounts even when
# a mount from an earlier run of this cell is still present.
drive.mount('/content/drive', force_remount = True)

In [None]:
# ! pip install datasets

In [None]:
# Load the cfilt/iitb-english-hindi parallel corpus (all available splits)
from datasets import load_dataset

iitbdataset = load_dataset(path="cfilt/iitb-english-hindi")

In [None]:
# Number of examples per split, in (train, validation, test) order
tuple(len(iitbdataset[split_name]) for split_name in ("train", "validation", "test"))

In [None]:
from datasets import DatasetDict

# Working dataset for this demo: the corpus' "test" split is used as the
# training split, and the validation split is kept as-is.
demodataset = DatasetDict(
    {
        "train": iitbdataset["test"],
        "validation": iitbdataset["validation"],
    }
)

In [None]:
def get_tokenizer_training_corpus(batch_size=10, langs=("hi",), split="train"):
    """Yield batches of raw sentences from ``demodataset`` for tokenizer training.

    Called with no arguments this behaves exactly as before: it walks
    ``demodataset['train']`` ten rows at a time and yields the Hindi text of
    each row.

    Args:
        batch_size: Number of dataset rows read per yielded batch.
        langs: Keys of each ``translation`` record whose text is emitted, in
            order, for every row. NOTE(review): the tokenizer trained from this
            corpus is also used for the English targets in
            ``preprocess_function``; passing ``("hi", "en")`` may be what is
            actually wanted - confirm.
        split: Name of the ``demodataset`` split to read.

    Yields:
        list[str]: The selected texts for one batch of rows.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer (raised when
            the generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Reading a module-level name needs no `global` declaration.
    rows = demodataset[split]
    for start_idx in range(0, len(rows), batch_size):
        # Slicing a split returns a dict of columns; "translation" holds the
        # per-row {language: text} records.
        records = rows[start_idx : start_idx + batch_size]["translation"]
        yield [record[lang] for record in records for lang in langs]

In [None]:
# Create the batch generator. A generator can be consumed only once, so this
# cell has to be re-run before the corpus can be iterated a second time.
tokenizer_training_corpus = get_tokenizer_training_corpus()

In [None]:
# next(training_corpus)

In [None]:
# Load the pretrained t5-small tokenizer; the retrained tokenizer is derived from it
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="t5-small"
)

In [None]:
# Train a tokenizer of the same type as old_tokenizer on the corpus batches,
# asking for a vocabulary of 52000 entries.
tokenizer = old_tokenizer.train_new_from_iterator(
    text_iterator=tokenizer_training_corpus,
    vocab_size=52000,
)

In [None]:
# example = 'आपका नाम क्या है'
# tokens = tokenizer.tokenize(example)
# tokens

In [None]:
# Maximum number of tokens kept per source / target sentence
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of Hindi/English pairs for seq2seq training.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of records
            with ``"hi"`` (source) and ``"en"`` (target) texts.

    Returns:
        The tokenizer output for the batch: encoded sources plus the encoded
        targets produced via ``text_target``.
    """
    translations = examples["translation"]
    inputs = [ex["hi"] for ex in translations]
    targets = [ex["en"] for ex in translations]

    # Deliberately no padding here. Padding at this stage fills the labels with
    # the pad token id, which the loss then treats as real targets. The data
    # collator pads each training batch instead and uses -100 for label
    # padding so those positions are ignored by the loss.
    return tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )

In [None]:
from transformers import AutoModelForSeq2SeqLM

model_checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The tokenizer was retrained with its own vocabulary (up to 52000 entries),
# so its ids are not bounded by the checkpoint's embedding table. Resize the
# embeddings so every id the tokenizer can emit has a row; otherwise the
# embedding lookup can index out of range.
# NOTE(review): the pretrained embedding rows no longer correspond to the new
# tokenizer's ids, so they are effectively re-learned during fine-tuning.
model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the retrained tokenizer and the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Tokenize every split of demodataset in batches. The raw columns are dropped
# so only the tokenizer outputs remain; the column list is taken from the
# dataset that is actually being mapped rather than from iitbdataset.
tokenized_datasets = demodataset.map(
    preprocess_function,
    batched=True,
    remove_columns=demodataset["train"].column_names,
)

In [None]:
# batch = data_collator([tokenized_datasets["validation"][i] for i in range(1, 3)])
# batch.keys()

In [None]:
# batch['labels']

In [None]:
# batch["decoder_input_ids"] # To see if they are the shifted versions of batch['labels']

In [None]:
# ! pip uninstall tensorflow -y
# Install TensorFlow pinned to 2.14.
# NOTE(review): no visible cell imports tensorflow directly - presumably this
# pin works around a package conflict in the Colab image; confirm it is still needed.
! pip install tensorflow==2.14


In [None]:
# ! pip install evaluate --no-cache-dir
# ! pip install sacrebleu --no-cache-dir


import evaluate

# SacreBLEU scorer; compute_metrics calls metric.compute on decoded text
metric = evaluate.load(path="sacrebleu")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        dict: ``{"bleu": <score>}`` where the score is the ``"score"`` entry
        of the metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the predicted token ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels (by the data collator) and to
    # pad arrays gathered across evaluation batches. It is not a valid token
    # id, so swap it for the pad token before decoding. The previous check
    # against 0 left every -100 in place.
    pad_id = tokenizer.pad_token_id
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; each prediction gets a list of references
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
# Upgrade accelerate and transformers to their latest releases (unpinned).
# NOTE(review): transformers is already imported by earlier cells, so the
# runtime presumably has to be restarted for the upgrade to take effect - confirm.
! pip install accelerate -U
! pip install transformers -U

In [None]:
import os
assert os.environ['COLAB_TPU_ADDR']
# !pip install cloud-tpu-client==0.10 torch==1.13.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.13-cp38-cp38-linux_x86_64.whl
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION
import os
os.environ['LD_LIBRARY_PATH']='/usr/local/lib'
!echo $LD_LIBRARY_PATH

!sudo ln -s /usr/local/lib/libmkl_intel_lp64.so /usr/local/lib/libmkl_intel_lp64.so.1
!sudo ln -s /usr/local/lib/libmkl_intel_thread.so /usr/local/lib/libmkl_intel_thread.so.1
!sudo ln -s /usr/local/lib/libmkl_core.so /usr/local/lib/libmkl_core.so.1

!ldconfig
!ldd /usr/local/lib/python3.7/dist-packages/torch/lib/libtorch.so

In [None]:
# https://huggingface.co/settings/tokens --> Token generation page for HuggingFace

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub. The training
# arguments further down set push_to_hub=True, which presumably needs this login.
notebook_login()

In [None]:
from transformers import Seq2SeqTrainingArguments
import accelerate
import transformers

# Training configuration. Checkpoints go to ./multilingual_llm, one per epoch,
# keeping at most three.
# The explicit evaluation_strategy="no" argument was dropped. "no" is the
# library default, so behaviour is unchanged, and recent transformers releases
# reject the old `evaluation_strategy` keyword (renamed `eval_strategy`).
# Evaluation is still run by calling trainer.evaluate() explicitly.
args = Seq2SeqTrainingArguments(
    output_dir="multilingual_llm",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Generate full sequences during evaluation so compute_metrics gets token ids
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator and BLEU
# metric into a single trainer object.
# NOTE(review): newer transformers releases deprecate the `tokenizer` keyword
# in favour of `processing_class` - confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline evaluation on the validation split before any fine-tuning; the
# notebook-level max_length is forwarded as a keyword argument.
trainer.evaluate(max_length=max_length)

In [None]:
# Fine-tune the model using the configuration held by the trainer
trainer.train()

In [None]:
# Push the trainer's model to the Hugging Face Hub, tagged "translation",
# with the given commit message.
trainer.push_to_hub(tags="translation", commit_message="Pretrainedt5 epoch3")

In [None]:
from transformers import pipeline

# Build a Hindi-to-English translation pipeline. "multilingual_llm" is the same
# name used as the trainer's output directory.
# NOTE(review): presumably this resolves to that local directory - confirm.
translator = pipeline(task="translation_hi_to_en", model="multilingual_llm")

# Two sample Hindi sentences to translate
text = [
    "आपका नाम क्या है",
    "आपका घर कहाँ है",
]
translator(text)