In [1]:
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader, Dataset

# Custom Dataset
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]
        inputs = self.tokenizer(input_text, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        targets = self.tokenizer(target_text, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze()
        target_ids = targets['input_ids'].squeeze()
        return input_ids, target_ids



  from .autonotebook import tqdm as notebook_tqdm
2024-06-30 21:24:20.628606: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [8]:
from mbart.configuration_mbart import MBartConfig
from mbart.modeling_mbart import MBartModel, MBartForConditionalGeneration
from mbart.tokenization_mbart import MBartTokenizer

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import AlbertTokenizer, AutoTokenizer


from xlnet.modeling_xlnet import XLNetLMHeadModel
from xlnet.configuration_xlnet import XLNetConfig

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)



dec_only_config = XLNetConfig(vocab_size = 64014, bos_token_id= 64000, n_layer=6,pad_token_id=0,eos_token_id=64001)
model = XLNetLMHeadModel(config=dec_only_config)

# Example data
input_texts = ["I am a boy </s> <2en>"]
target_texts = ["<2hi> मैं एक लड़का हूँ </s>"]

# Prepare the dataset and dataloader
dataset = TranslationDataset(input_texts, target_texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Define training parameters
epochs = 3
learning_rate = 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(epochs):
    for input_ids, target_ids in dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=target_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Save the fine-tuned model
torch.save(model.state_dict(), "finetuned_xlnet.pt")

# Testing the model in inference time
model.eval()
test_input = "I am a boy </s> <2en>"
test_input_ids = tokenizer(test_input, add_special_tokens=True, return_tensors="pt").input_ids
generated_ids = model.generate(test_input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Generated text: {generated_text}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 0, Loss: 5.417823314666748
Epoch 1, Loss: 2.0797722339630127
Epoch 2, Loss: 0.9125195145606995


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (-1). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generated text: I am a boyळम्ळम्ळम्ळम्ळम्ळम्ळम्ळम्ळम्ळम्ळम् कधीहीळम् कधीहीळम् कधीहीळम् कधीही कधीही कधीहीळम् कधीहीळम्ळम् कधीहीळम्ळम्ळम्ळम् कधीहीळम् कधीहीळम् कधीहीळम् कधीहीळम् कधीहीळम्ळम् कधीहीळम्


In [2]:
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel, MBartConfig, MBartForConditionalGeneration

# Load the tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
dec_only_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')

# Example input and target texts
x = "I am a boy </s> <2en>"
y = "<2hi> मैं एक लड़का हूँ </s>"

# Tokenize the input and target texts
inp = tokenizer(x, add_special_tokens=True, return_tensors="pt", padding=True).input_ids
out = tokenizer(y, add_special_tokens=True, return_tensors="pt", padding=True).input_ids

# Generate output using the model
# output_dec = dec_only_model(input_ids=inp, labels=out)

# Decode the generated output
dec_only_out = tokenizer.decode(dec_only_model.generate(inp, max_length=50, num_return_sequences=1)[0], skip_special_tokens=True)

print(dec_only_out)


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (-1). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


I am a boy <2en> in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in
