In [1]:
import pandas as pd
import  numpy as np

In [2]:
PATH = "../datasets/wiki/"
import os
datatypes = os.listdir(PATH)
datatypes.remove(".DS_Store")

def get_contents(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            sentence = line.strip()
            sentences.append(sentence)
    return sentences

def get_duals(datatypes,lang):
    for i in datatypes:
        x = os.listdir(os.path.join(PATH,i))
        x.remove("domain.txt")
        if lang in x[0] or lang in x[1]:
            res_lang = get_contents(os.path.join(PATH,i,x[0]))
            res_en = get_contents(os.path.join(PATH,i,"train.eng_Latn"))
            return {lang:res_lang,"en":res_en}
        

In [3]:
data = get_duals(datatypes,"hi")

In [5]:
pd.DataFrame(data)[:400]

Unnamed: 0,hi,en
0,"""संयुक्त प्रतिस्पर्धा"" वा ""बहु प्रतिस्पर्धा "" ...","There are also ""combined events"" or ""multi eve..."
1,"आवेदन जमा करबाक बाद, अहाँकेँ किछु व्यावसायिक द...","After submitting your application, you should ..."
2,१९९० के दशक मे दमानिया शिपिंग द्वारा संचालित म...,There was also a short-lived catamaran service...
3,"एहन रोगीक लेल, जनिक संक्रमण आ कैंसरक नैदानिक स...",X-rays are a less costly initial option offere...
4,"एलर्जी सँ पीड़ित नेत्रश्लेष्माशोथक लेल, मुह के...","For allergic conjunctivitis, cool water poured..."
...,...,...
395,यहाँ के पौधों की कुछ प्रजातियों में यूपेटोरियम...,"Some of the plant species include eupatorium, ..."
396,यह आदिलाबाद जिले के नेरेडीगोंडा के पास तरनाम ख...,"It is approximately 5km from Tarnam Khurd, nea..."
397,सवातुला गुण्डम जलप्रपात भारत के तेलंगाना राज्य...,Savatula Gundam Waterfalls is one of many wate...
398,केम्पटी जलप्रपात को 1835 में ब्रिटिश अधिकारी ज...,Kempty Falls were developed as a tourist desti...


In [7]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import XLNetTokenizer, XLNetLMHeadModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from sklearn.model_selection import train_test_split

class TextToTextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

def preprocess_data(file_path, tokenizer):
    # Load the data into a pandas DataFrame
    df = pd.DataFrame(data)[:400]
    
    # Encode the data using the tokenizer
    encodings = tokenizer(df['hi'].tolist(), df['en'].tolist(), padding=True, truncation=True, return_tensors="pt")
    
    return encodings

def train(model, train_dataloader, val_dataloader, epochs, learning_rate):
    # Set up the optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Move the model to the device (GPU/CPU)
    device = torch.device("cpu") 
    model.to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch, labels=batch["input_ids"])
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Training loss: {avg_train_loss}")

        model.eval()
        total_eval_loss = 0
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch, labels=batch["input_ids"])
            loss = outputs.loss
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Validation loss: {avg_val_loss}")

def main():
    # Load the tokenizer and model
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')

    # Load and preprocess the data
    file_path = 'path_to_your_mt_data.tsv'
    encodings = preprocess_data(file_path, tokenizer)
    
    # Create a PyTorch dataset
    dataset = TextToTextDataset(encodings)

    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create data loaders
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    # Train the model
    train(model, train_dataloader, val_dataloader, epochs=3, learning_rate=2e-5)
    torch.save(model.state_dict(), "XLnet.pt")


if __name__ == "__main__":
    main()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3, Training loss: 2.599706707149744
Epoch 1/3, Validation loss: 0.08676095977425576
Epoch 2/3, Training loss: 0.11623244918882847
Epoch 2/3, Validation loss: 0.025831568986177444
Epoch 3/3, Training loss: 0.07209880966693163
Epoch 3/3, Validation loss: 0.019819531589746475


In [10]:
model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
model.load_state_dict(torch.load("XLnet.pt"))
model.eval()  # Set the model to evaluation mode


XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_loss): Linear(in_features=768, out_features=32000, bias=True)
)

In [14]:
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel

# Load the tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')

# Load the state dictionary from the file
model.load_state_dict(torch.load("XLnet.pt"))
model.eval()

# Prepare the input data
input_text = ""
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=50, num_return_sequences=1, temperature=1.0)
    print(outputs)

# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


tensor([[4036,    4,    3,   19,   19,   19,   19,   19,   19,   19,   19,   19,
           19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,
           19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,
           19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,   19,
           19,   19]])
Hi,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

# Or use tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

# Or use model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")

# Some initial mapping
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")
# To get lang_id use any of ['<2as>', '<2bn>', '<2en>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']

# First tokenize the input and outputs. The format below is how IndicBART was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>". 
inp = tokenizer("I am a boy </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[  466,  1981,    80, 25573, 64001, 64004]])

out = tokenizer("<2hi> मैं  एक लड़का हूँ </s>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[64006,   942,    43, 32720,  8384, 64001]])
# Note that if you use any language other than Hindi or Marathi, you should convert its script to Devanagari using the Indic NLP Library.

model_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])

# For loss
model_outputs.loss ## This is not label smoothed.

# For logits
model_outputs.logits

# For generation. Pardon the messiness. Note the decoder_start_token_id.

model.eval() # Set dropouts to zero

model_output=model.generate(inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2en>"))


# Decode to get output strings

decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(decoded_output) # I am a boy
# Note that if your output language is not Hindi or Marathi, you should convert its script from Devanagari to the desired language using the Indic NLP Library.

# What if we mask?

inp = tokenizer("I am [MASK] </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


I am a boy
I am happy.
मैं जानता हूँ
मला ओळखलं पाहिजे


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
sequences = pipeline(
   "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


  from .autonotebook import tqdm as notebook_tqdm
2024-07-01 16:39:09.050579: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versio

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`