In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA GeForce GTX 1650


# Login to Hugging Face
#### Before You can start using hugging face models and datasets, you need to Create a Token from the website. To generate token got to Settings > Access Tokens and generate and save the token with the required permissions. 
#### In the notebook run below code and provide the token that you created. Now you will be able to access the data and models present at hugging face.

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()

# Import Dataset
#### You can use load_dataset() to load datasets available on HuggingFace. It loads the data in a DatasetDict Format. When you print it you can see the meta data instead of the actual data. 
#### You can see that there is one key "train" which contains english_sentence and hindi_sentence which are our input text and labels

In [3]:
from datasets import load_dataset
raw_dataset = load_dataset("Aarif1430/english-to-hindi")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence'],
        num_rows: 127705
    })
})

#### You can look at the data by accessing the keys like a dictionary

In [4]:
raw_dataset["train"][100]

{'english_sentence': 'politicians do not have permission to do what needs to be done.',
 'hindi_sentence': 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .'}

# Preprocess Dataset

#### First step is to convert your data into numbers which can be done using AutoTokenizer class.
#### Almost all the models on hugging face have their own tokenizers that you can import using this library. 

In [55]:
from transformers import AutoTokenizer

In [56]:
tokenizer = AutoTokenizer.from_pretrained("barghavani/English_to_Hindi")

In [57]:
tokenizer("Hello My name is Priyam")

{'input_ids': [12110, 633, 300, 23, 34293, 363, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

#### Here we are using preprocess function for comverting text to numbers
#### At the end you will have the same imported DatasetDict with 2 added columns 1. input_ids (English sentences) 2. labels (Hindi Sentences)

In [8]:
max_length = 128
def preprocess(example):
    text = [en for en in example["english_sentence"]]
    labels = [hin for hin in example["hindi_sentence"]]
    model_input = tokenizer(text, max_length = max_length)
    with tokenizer.as_target_tokenizer():
        label = tokenizer(labels, max_length = max_length)

    model_input["labels"] = label["input_ids"]
    return model_input


In [9]:
sample_data = preprocess(raw_dataset["train"][100:102])
sample_data

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [[21770, 110, 36, 55, 2961, 7, 110, 117, 1874, 7, 42, 846, 3, 0], [56, 70, 232, 288, 7, 1169, 27, 195, 131, 295, 1075, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[40085, 86, 6, 173, 41, 358, 187, 236, 2, 49, 91, 25, 1611, 29, 5, 44, 3, 0], [6871, 383, 276, 58, 38, 929, 6, 207, 11, 8106, 44730, 3061, 2, 0]]}

In [10]:
tokenized_dataset = raw_dataset.map(preprocess, batched = True)

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 127705
    })
})

#### Splitting the data into train and test

In [12]:
dataset = tokenized_dataset['train'].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
train_dataset

Dataset({
    features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 114934
})

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 114934
    })
    test: Dataset({
        features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12771
    })
})

# Load Model

#### As our task is of Machine Translation which is a sequence to sequence task, we will use AutoModelForSeq2SeqLM. 
#### For different tasks you can use different such AutoModel functions.

In [14]:
from transformers import AutoModelForSeq2SeqLM

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained("barghavani/English_to_Hindi").to(device)

# Training Config

In [16]:
batch_size = 16
learning_rate = 0.00003
weight_decay = 0.01
number_of_train_epochs = 1

#### Data collator is used for supplying our input data in batches to avoid any memory issues.

In [17]:
from transformers import DataCollatorForSeq2Seq

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model, return_tensors = "pt", pad_to_multiple_of=128)

In [19]:
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    predict_with_generate=True
)



In [20]:
train_dataset

Dataset({
    features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 114934
})

# Train Model

In [21]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


# Inference Model

In [32]:
def translate(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    
    # Generate translation
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )
    
    # Decode the generated sequences
    translated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return translated_text

In [54]:
input_text = "i am a singer who dances well but sings bad"
translated_text = translate(input_text)
print("Translated text:", translated_text)

Translated text: मैं एक गायक हूँ जो अच्छी तरह से नृत्य करते हैं लेकिन बुरा गाता है


# Save Fine tuned model

In [None]:
# Define the directory where you want to save the model
save_directory = "./fine_tuned_model"

# Save the model
trainer.save_model(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print("Model and tokenizer saved successfully!")


# Load Fine Tuned Model

In [31]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

save_directory = r"D:\courses\NLP\Transformers\results\checkpoint-3000"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!
