In [1]:
%env CUDA_VISIBLE_DEVICES= 3

env: CUDA_VISIBLE_DEVICES=3


In [2]:
import os
os.environ['HF_HOME'] = '/home/sofia/cache_custom'

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransToolkit import IndicProcessor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
from torch.nn.functional import softmax


BATCH_SIZE = 8 # edited from 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None
print(DEVICE)

cuda


In [4]:
import importlib
import sys
sys.path.append('../')
import possible_indic_relations as poss_indic_rel
# Reload the module to reflect changes
importlib.reload(poss_indic_rel)

pir= poss_indic_rel.possible_relations
pir

ambiguos_words = list(pir.keys())

In [5]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model

In [6]:
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # ai4bharat/indictrans2-en-indic-dist-200M
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir,  quantization)

ip_en_ind = IndicProcessor(inference=True)

In [7]:
lang_script_list = [
                           'ory_Orya',
                     'pan_Guru', 'ben_Beng', 
                       'mal_Mlym',
                           'mar_Deva', 
                           'tam_Taml', 'guj_Gujr', 
                           'tel_Telu', 'hin_Deva', 
                           'kan_Knda', 
                           ]

In [8]:
# sents=['Along with him were his brother and a cousin.',
#        'She has admitted to murdering her husband with the help of her brother and his friend, claimed police.',
#         'She is the sister of his first wife, and the aunt of his daughter by his first wife.',
#         "I've become a grandmother, she says, but I have a mother's responsibilities too."
        
#  ]

In [9]:
# sents =['ଆଈ', 'ଦେଢ଼ଶୁର', 'ଦିଅର', 'ଜେଜେବାପା', 'ଜେଜେମା']

In [10]:
# word_trl=[]
span_encodings = {}
for lang in lang_script_list:
#   if lang == 'ory_Orya':
    span_encodings[lang] = {}
    # sents = ambiguos_words
    sents=[]
    for word in ambiguos_words:
        # insert into sents the description of all the keys / items of the pir[word][lang][keys]
        # sents.extend( key for key in pir[word][lang].keys())
        # keep prefixes of the key like key[0], key[1] and so on
        sents.extend( key[:i] for key in pir[word][lang].keys() for i in range(1, len(key)+1))

    print("sents: ", sents)
    for i in tqdm(range(0, len(sents), BATCH_SIZE)):
        batch = sents[i : i + BATCH_SIZE]
        print("Batch:", batch)  

        # batch = ip_en_ind.preprocess_batch(words_ids[lang].keys().tolist(), src_lang=lang, tgt_lang=lang)
        batch = ip_en_ind.preprocess_batch(batch, src_lang=lang, tgt_lang='hin_Deva')
        print("Batch edited:", batch)
        # # Tokenize the batch and generate input encodings
        with en_indic_tokenizer.as_target_tokenizer():  
            inputs = en_indic_tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ) 
        # print("Odia tokens inputs: ", inputs.input_ids)
        for sent, ids in zip(sents[i : i + BATCH_SIZE], inputs.input_ids):
            # till index of foound '2' in the list
            ids=ids.tolist()
            end_idx = ids.index(2)
            start_idx = ids. index(43144)
            # check if 2nd occurrence of 43144 is after start_idx
            if 'Deva' in lang:
                start_idx = ids.index(43144, start_idx+1)
            print(sent, ids [:   end_idx]) 
            span_encodings[lang][sent] = ids[start_idx+1:   end_idx]

span_encodings



sents:  ['ଜ', 'ଜେ', 'ଜେଜ', 'ଜେଜେ', 'ଜେଜେମ', 'ଜେଜେମା', 'ଆ', 'ଆଈ', 'ଜ', 'ଜେ', 'ଜେଜ', 'ଜେଜେ', 'ଜେଜେବ', 'ଜେଜେବା', 'ଜେଜେବାପ', 'ଜେଜେବାପା', 'ଅ', 'ଅଜ', 'ଅଜା', 'ବ', 'ବଡ', 'ବଡ଼', 'ବଡ଼ବ', 'ବଡ଼ବା', 'ବଡ଼ବାପ', 'ବଡ଼ବାପା', 'ଦ', 'ଦା', 'ଦାଦ', 'ଦାଦା', 'ମ', 'ମା', 'ମାମ', 'ମାମୁ', 'ମାମୁଁ', 'ପ', 'ପି', 'ପିଉ', 'ପିଉସ', 'ପିଉସା', 'ମ', 'ମଉ', 'ମଉସ', 'ମଉସା', 'ପ', 'ପି', 'ପିଉ', 'ପିଉସ', 'ପିଉସୀ', 'ମ', 'ମା', 'ମାଉ', 'ମାଉସ', 'ମାଉସୀ', 'ମ', 'ମା', 'ମାଇ', 'ମାଇଁ', 'ବ', 'ବଡ', 'ବଡ଼', 'ବଡ଼ମ', 'ବଡ଼ମା', 'ବଡ଼ମାଆ', 'ଖ', 'ଖୁ', 'ଖୁଡ', 'ଖୁଡ଼', 'ଖୁଡ଼ି', 'ବ', 'ବଡ', 'ବଡ଼', 'ବଡ଼ ', 'ବଡ଼ ଶ', 'ବଡ଼ ଶଳ', 'ବଡ଼ ଶଳା', 'ଶ', 'ଶଳ', 'ଶଳା', 'ଭ', 'ଭି', 'ଭିଣ', 'ଭିଣେ', 'ଭିଣେଇ', 'ଭ', 'ଭି', 'ଭିଣ', 'ଭିଣୋ', 'ଭିଣୋଇ', 'ଦ', 'ଦେ', 'ଦେଢ', 'ଦେଢ଼', 'ଦେଢ଼ଶ', 'ଦେଢ଼ଶୁ', 'ଦେଢ଼ଶୁର', 'ଦ', 'ଦି', 'ଦିଅ', 'ଦିଅର', 'ବ', 'ବଡ', 'ବଡ଼', 'ବଡ଼ ', 'ବଡ଼ ନ', 'ବଡ଼ ନଣ', 'ବଡ଼ ନଣନ', 'ବଡ଼ ନଣନ୍', 'ବଡ଼ ନଣନ୍ଦ', 'ନ', 'ନଣ', 'ନଣନ', 'ନଣନ୍', 'ନଣନ୍ଦ', 'ଭ', 'ଭା', 'ଭାଉ', 'ଭାଉଜ', 'ଭ', 'ଭା', 'ଭାଇ', 'ଭାଇବ', 'ଭାଇବୋ', 'ଭାଇବୋହ', 'ଭାଇବୋହୁ', 'ଦ', 'ଦେ', 'ଦେଢ', 'ଦେଢ଼', 'ଦେଢ଼ଶ', 'ଦେଢ଼ଶା', 'ଦେଢ଼ଶାସ', 'ଦେଢ଼ଶାସୁ', 'ଶ'

100%|██████████| 22/22 [00:00<00:00, 724.56it/s]


Batch: ['ଜ', 'ଜେ', 'ଜେଜ', 'ଜେଜେ', 'ଜେଜେମ', 'ଜେଜେମା', 'ଆ', 'ଆଈ']
Batch edited: ['ory_Orya hin_Deva ज', 'ory_Orya hin_Deva जे', 'ory_Orya hin_Deva जेज', 'ory_Orya hin_Deva जेजे', 'ory_Orya hin_Deva जेजेम', 'ory_Orya hin_Deva जेजेमा', 'ory_Orya hin_Deva आ', 'ory_Orya hin_Deva आई']
ଜ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 106]
ଜେ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 169]
ଜେଜ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 169, 256]
ଜେଜେ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 41445]
ଜେଜେମ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 41445, 143]
ଜେଜେମା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 41445, 241]
ଆ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 34]
ଆଈ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 740]
Batch: ['ଜ', 'ଜେ', 'ଜେଜ', 'ଜେଜେ', 'ଜେ

100%|██████████| 16/16 [00:00<00:00, 803.34it/s]


Batch: ['ਦ', 'ਦਾ', 'ਦਾਦ', 'ਦਾਦੀ', 'ਨ', 'ਨਾ', 'ਨਾਨ', 'ਨਾਨੀ']
Batch edited: ['pan_Guru hin_Deva द', 'pan_Guru hin_Deva दा', 'pan_Guru hin_Deva दाद', 'pan_Guru hin_Deva दादी', 'pan_Guru hin_Deva न', 'pan_Guru hin_Deva ना', 'pan_Guru hin_Deva नान', 'pan_Guru hin_Deva नानी']
ਦ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 409]
ਦਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 171]
ਦਾਦ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 15597]
ਦਾਦੀ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 29498]
ਨ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 80]
ਨਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 46]
ਨਾਨ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 5457]
ਨਾਨੀ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 8911]
Batch: ['ਦ', 'ਦਾ', 'ਦਾਦ', 'ਦਾਦਾ', 'ਨ', 'ਨਾ', 'ਨਾਨ', 'ਨਾਨਾ']
Batch edited: ['pan_Guru hin_Deva द', 'pan_Guru hin_Deva दा', 'pan_Guru hin_Deva दाद', 'pan_Guru hin_

100%|██████████| 29/29 [00:00<00:00, 843.29it/s]


Batch: ['ঠ', 'ঠা', 'ঠাক', 'ঠাকু', 'ঠাকুর', 'ঠাকুরম', 'ঠাকুরমা', 'দ']
Batch edited: ['ben_Beng hin_Deva ठ', 'ben_Beng hin_Deva ठा', 'ben_Beng hin_Deva ठाक', 'ben_Beng hin_Deva ठाकु', 'ben_Beng hin_Deva ठाकुर', 'ben_Beng hin_Deva ठाकुरम', 'ben_Beng hin_Deva ठाकुरमा', 'ben_Beng hin_Deva द']
ঠ [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 3712]
ঠা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 33566]
ঠাক [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 16434]
ঠাকু [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 58599]
ঠাকুর [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 9402]
ঠাকুরম [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 9402, 143]
ঠাকুরমা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 9402, 241]
দ [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 409]
Batch: ['দি', 'দিদ', 'দিদি', 'দিদি ', 'দিদি ম', 'দিদি মা', 'ঠ', 'ঠা']
Batch edited: ['ben_Beng hin_Deva दि', 'ben_Beng

100%|██████████| 33/33 [00:00<00:00, 804.71it/s]


Batch: ['അ', 'അമ', 'അമ്', 'അമ്മ', 'അമ്മൂ', 'അമ്മൂമ', 'അമ്മൂമ്', 'അമ്മൂമ്മ']
Batch edited: ['mal_Mlym hin_Deva अ', 'mal_Mlym hin_Deva अम', 'mal_Mlym hin_Deva अम्', 'mal_Mlym hin_Deva अम्म', 'mal_Mlym hin_Deva अम्मू', 'mal_Mlym hin_Deva अम्मूम', 'mal_Mlym hin_Deva अम्मूम्', 'mal_Mlym hin_Deva अम्मूम्म']
അ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 213]
അമ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 2039]
അമ് [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 19875]
അമ്മ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774]
അമ്മൂ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 208]
അമ്മൂമ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 8641]
അമ്മൂമ് [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 17244]
അമ്മൂമ്മ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 208, 1476]
Batch: ['അ', 'അപ', 'അപ്', 'അപ്പ'

100%|██████████| 26/26 [00:00<00:00, 829.18it/s]


Batch: ['आ', 'आज', 'आजी', 'म', 'मा', 'माव', 'मावस', 'मावस ']
Batch edited: ['mar_Deva hin_Deva आ', 'mar_Deva hin_Deva आज', 'mar_Deva hin_Deva आजी', 'mar_Deva hin_Deva म', 'mar_Deva hin_Deva मा', 'mar_Deva hin_Deva माव', 'mar_Deva hin_Deva मावस', 'mar_Deva hin_Deva मावस']
आ [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 34]
आज [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 755]
आजी [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 32967]
म [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 214]
मा [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 354]
माव [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 11701]
मावस [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 11701, 115]
मावस  [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 11701, 115]
Batch: ['मावस आ', 'मावस आज', 'मावस आजी', 'आ', 'आज', 'आजो', 'आजोब', 'आजोबा']
Batch edited: ['mar_De

  0%|          | 0/37 [00:00<?, ?it/s]

Batch: ['அ', 'அப', 'அப்', 'அப்ப', 'அப்பத', 'அப்பத்', 'அப்பத்த', 'அப்பத்தா']
Batch edited: ['tam_Taml hin_Deva अ', 'tam_Taml hin_Deva अप', 'tam_Taml hin_Deva अप्', 'tam_Taml hin_Deva अप्प', 'tam_Taml hin_Deva अप्पत', 'tam_Taml hin_Deva अप्पत्', 'tam_Taml hin_Deva अप्पत्त', 'tam_Taml hin_Deva अप्पत्ता']
அ [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 213]
அப [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 918]
அப் [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 8288]
அப்ப [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763]
அப்பத [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763, 119]
அப்பத் [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763, 183]
அப்பத்த [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763, 357]
அப்பத்தா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763, 2154]
Batch: ['அ', 'அம', 'அம்', 'அம்ம', 'அம்மத',

100%|██████████| 37/37 [00:00<00:00, 784.91it/s]


அக் [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 16591]
அக்க [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 4965]
அக்கா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 45390]
த [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 302]
தங [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 302, 9110]
தங் [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 302, 2129]
தங்க [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 11591]
தங்கை [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 11591, 463]
Batch: ['அ', 'அண', 'அண்', 'அண்ண', 'அண்ணா', 'த', 'தம', 'தம்']
Batch edited: ['tam_Taml hin_Deva अ', 'tam_Taml hin_Deva अण', 'tam_Taml hin_Deva अण्', 'tam_Taml hin_Deva अण्ण', 'tam_Taml hin_Deva अण्णा', 'tam_Taml hin_Deva त', 'tam_Taml hin_Deva तम', 'tam_Taml hin_Deva तम्']
அ [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 213]
அண [828

  0%|          | 0/15 [00:00<?, ?it/s]

Batch: ['દ', 'દા', 'દાદ', 'દાદી', 'ન', 'ના', 'નાન', 'નાની']
Batch edited: ['guj_Gujr hin_Deva द', 'guj_Gujr hin_Deva दा', 'guj_Gujr hin_Deva दाद', 'guj_Gujr hin_Deva दादी', 'guj_Gujr hin_Deva न', 'guj_Gujr hin_Deva ना', 'guj_Gujr hin_Deva नान', 'guj_Gujr hin_Deva नानी']
દ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 409]
દા [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 171]
દાદ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 15597]
દાદી [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 29498]
ન [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 80]
ના [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 46]
નાન [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 5457]
નાની [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 8911]
Batch: ['

100%|██████████| 15/15 [00:00<00:00, 707.93it/s]


નણ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 80, 266]
નણં [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 80, 1058]
નણંદ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 80, 266, 942]
ભ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 980]
ભા [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 4569]
ભાભ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 4569, 796]
ભાભી [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 51887]
સ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 116]
Batch: ['સા', 'સાળ', 'સાળી', 'દ', 'દી', 'દીક', 'દીકર', 'દીકરો']
Batch edited: ['guj_Gujr hin_Deva सा', 'guj_Gujr hin_Deva साळ', 'guj_Gujr hin_Deva साळी', 'guj_Gujr hin_Deva द', 'guj_Gujr hin_Deva दी', 'guj_Gujr hin_Deva दीक', 'guj_Gujr hin_Deva दीकर', 'guj_

  0%|          | 0/35 [00:00<?, ?it/s]

Batch: ['న', 'నా', 'నాన', 'నాన్', 'నాన్న', 'నాన్నమ', 'నాన్నమ్', 'నాన్నమ్మ']
Batch edited: ['tel_Telu hin_Deva न', 'tel_Telu hin_Deva ना', 'tel_Telu hin_Deva नान', 'tel_Telu hin_Deva नान्', 'tel_Telu hin_Deva नान्न', 'tel_Telu hin_Deva नान्नम', 'tel_Telu hin_Deva नान्नम्', 'tel_Telu hin_Deva नान्नम्म']
న [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 80]
నా [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 46]
నాన [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 5457]
నాన్ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 23830]
నాన్న [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539]
నాన్నమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539, 143]
నాన్నమ్ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539, 136]
నాన్నమ్మ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539, 1476]
Batch: ['అ', 'అమ', 'అమ్', 'అమ్మ', 'అమ్మమ', 'న', 'నా'

100%|██████████| 35/35 [00:00<00:00, 821.59it/s]


పెద్ద  నాన [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2224, 5457]
పెద్ద  నాన్ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2224, 23830]
పెద్ద  నాన్న [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2224, 30539]
పెద్ద  నాన్నమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2224, 30539, 143]
చ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 140]
చి [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 1501]
చిన [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 12090]
చిన్ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 1501, 117]
Batch: ['చిన్న', 'చిన్న ', 'చిన్న అ', 'చిన్న అమ', 'చిన్న అమ్', 'చిన్న అమ్మ', 'చిన్న అమ్మమ', 'ప']
Batch edited: ['tel_Telu hin_Deva चिन्न', 'tel_Telu hin_Deva चिन्न', 'tel_Telu hin_Deva चिन्न अ', 'tel_Telu hin_Deva चिन्न अम', 'tel_Telu hin_Deva चिन्न अम्', 'tel_Telu hin_Deva चिन्न अम्म', 'tel_Telu hin_Deva चिन्न अम

  0%|          | 0/19 [00:00<?, ?it/s]

Batch: ['द', 'दा', 'दाद', 'दादी', 'न', 'ना', 'नान', 'नानी']
Batch edited: ['hin_Deva hin_Deva द', 'hin_Deva hin_Deva दा', 'hin_Deva hin_Deva दाद', 'hin_Deva hin_Deva दादी', 'hin_Deva hin_Deva न', 'hin_Deva hin_Deva ना', 'hin_Deva hin_Deva नान', 'hin_Deva hin_Deva नानी']
द [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 409]
दा [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 171]
दाद [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 15597]
दादी [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 29498]
न [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 80]
ना [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 46]
नान [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 5457]
नानी [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 8911]
Batch: ['प', 'पि', 'पित', 'पिता', 'पिताम', 'पितामह', 'पितामही', 'द']
Batch edited: ['hin_Deva hin_Deva प'

100%|██████████| 19/19 [00:00<00:00, 802.05it/s]


फ [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 813]
फू [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 16845]
फूफ [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 16845, 495]
फूफा [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 16845, 3755]
म [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 214]
मौ [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 5545]
मौस [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 70877]
मौसा [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 5545, 964]
Batch: ['ब', 'बु', 'बुआ', 'म', 'मौ', 'मौस', 'मौसी', 'म']
Batch edited: ['hin_Deva hin_Deva ब', 'hin_Deva hin_Deva बु', 'hin_Deva hin_Deva बुआ', 'hin_Deva hin_Deva म', 'hin_Deva hin_Deva मौ', 'hin_Deva hin_Deva मौस', 'hin_Deva hin_Deva मौसी', 'hin_Deva hin_Deva म']
ब [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 336]
बु [105948, 5

100%|██████████| 20/20 [00:00<00:00, 823.24it/s]

Batch: ['ಅ', 'ಅಜ', 'ಅಜ್', 'ಅಜ್ಜ', 'ಅಜ್ಜಿ', 'ಅ', 'ಅಜ', 'ಅಜ್']
Batch edited: ['kan_Knda hin_Deva अ', 'kan_Knda hin_Deva अज', 'kan_Knda hin_Deva अज्', 'kan_Knda hin_Deva अज्ज', 'kan_Knda hin_Deva अज्जि', 'kan_Knda hin_Deva अ', 'kan_Knda hin_Deva अज', 'kan_Knda hin_Deva अज्']
ಅ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 213]
ಅಜ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565]
ಅಜ್ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565, 263]
ಅಜ್ಜ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 44928]
ಅಜ್ಜಿ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565, 35330]
ಅ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 213]
ಅಜ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565]
ಅಜ್ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565, 263]
Batch: ['ಅಜ್ಜ', 'ದ', 'ದೊ', 'ದೊಡ', 'ದೊಡಪ', 'ದೊಡಪ್', 'ದೊಡಪ್ಪ', 'ಚ']
Batch edited: ['kan_Knda




{'ory_Orya': {'ଜ': [106],
  'ଜେ': [169],
  'ଜେଜ': [169, 256],
  'ଜେଜେ': [41445],
  'ଜେଜେମ': [41445, 143],
  'ଜେଜେମା': [41445, 241],
  'ଆ': [34],
  'ଆଈ': [740],
  'ଜେଜେବ': [41445, 237],
  'ଜେଜେବା': [41445, 1007],
  'ଜେଜେବାପ': [41445, 1007, 246],
  'ଜେଜେବାପା': [41445, 1007, 1714],
  'ଅ': [213],
  'ଅଜ': [4565],
  'ଅଜା': [62200],
  'ବ': [336],
  'ବଡ': [6181],
  'ବଡ଼': [1111],
  'ବଡ଼ବ': [1111, 237],
  'ବଡ଼ବା': [1111, 1007],
  'ବଡ଼ବାପ': [1111, 1007, 246],
  'ବଡ଼ବାପା': [1111, 1007, 1714],
  'ଦ': [409],
  'ଦା': [171],
  'ଦାଦ': [15597],
  'ଦାଦା': [15588],
  'ମ': [214],
  'ମା': [354],
  'ମାମ': [9971],
  'ମାମୁ': [9971, 195],
  'ମାମୁଁ': [9971, 19212],
  'ପ': [205],
  'ପି': [449],
  'ପିଉ': [52157],
  'ପିଉସ': [52157, 115],
  'ପିଉସା': [52157, 964],
  'ମଉ': [63293],
  'ମଉସ': [63293, 115],
  'ମଉସା': [63293, 964],
  'ପିଉସୀ': [52157, 694],
  'ମାଉ': [30261],
  'ମାଉସ': [40150],
  'ମାଉସୀ': [30261, 694],
  'ମାଇ': [10859],
  'ମାଇଁ': [10859, 2304],
  'ବଡ଼ମ': [1111, 143],
  'ବଡ଼ମା': [1111, 241],
  'ବଡ଼ମାଆ': [11

In [11]:
import span_encodings as se
import importlib
importlib.reload(se)

fetched_span_encodings = se.span_encodings

fetched_span_encodings


{'ory_Orya': {"ଜେଜେମା'": [41445, 241, 30],
  'ଜେଜେବାପା ': [41445, 1007, 1714],
  'ମାମୁଁ। ': [9971, 19212, 6],
  'ମାଉସୀ। ': [30261, 694, 6],
  'ଶ୍ୱଶୁର-ଶ୍ୱଶୁର ': [21405, 699, 22252, 13, 21405, 699, 22252],
  'ଶ୍ୱଶୁର। ': [21405, 699, 22252, 6],
  'ସମ୍ପର୍କୀଯ଼ ଭାଇ। ': [60824, 3991, 6],
  'ଶିଶୁ ': [3442],
  'ପୁତୁରା ': [4300, 5686],
  'ଭାଣିଜୀ ': [980, 9742, 795],
  'None': [2],
  'ଜେଜେମା': [41445, 241],
  'ଆଈ': [740],
  'ଜେଜେବାପା': [41445, 1007, 1714],
  'ଅଜା': [62200],
  'ବଡ଼ବାପା': [1111, 1007, 1714],
  'ଦାଦା': [15588],
  'ମାମୁଁ': [9971, 19212],
  'ପିଉସା': [52157, 964],
  'ମଉସା': [63293, 964],
  'ପିଉସୀ': [52157, 694],
  'ମାଉସୀ': [30261, 694],
  'ମାଇଁ': [10859, 2304],
  'ବଡ଼ମାଆ': [1111, 241, 1109],
  'ଖୁଡ଼ି': [3617, 4405],
  'ବଡ଼ ଶଳା': [1111, 649, 1624],
  'ଶଳା': [649, 1624],
  'ଭିଣେଇ': [5442, 53872],
  'ଭିଣୋଇ': [5442, 1754, 89],
  'ଦେଢ଼ଶୁର': [57, 10861, 22252],
  'ଦିଅର': [305, 4093],
  'ବଡ଼ ନଣନ୍ଦ': [1111, 80, 266, 5766],
  'ନଣନ୍ଦ': [80, 266, 5766],
  'ଭାଉଜ': [4569, 25547],
  'ଭାଇବୋହୁ': [3991

In [12]:
# add span_encodings to the fetched_span_encodings and update the file
# fetched_span_encodings.update(span_encodings)
for key in fetched_span_encodings.keys():
    fetched_span_encodings[key].update(span_encodings[key])


In [13]:
fetched_span_encodings

{'ory_Orya': {"ଜେଜେମା'": [41445, 241, 30],
  'ଜେଜେବାପା ': [41445, 1007, 1714],
  'ମାମୁଁ। ': [9971, 19212, 6],
  'ମାଉସୀ। ': [30261, 694, 6],
  'ଶ୍ୱଶୁର-ଶ୍ୱଶୁର ': [21405, 699, 22252, 13, 21405, 699, 22252],
  'ଶ୍ୱଶୁର। ': [21405, 699, 22252, 6],
  'ସମ୍ପର୍କୀଯ଼ ଭାଇ। ': [60824, 3991, 6],
  'ଶିଶୁ ': [3442],
  'ପୁତୁରା ': [4300, 5686],
  'ଭାଣିଜୀ ': [980, 9742, 795],
  'None': [2],
  'ଜେଜେମା': [41445, 241],
  'ଆଈ': [740],
  'ଜେଜେବାପା': [41445, 1007, 1714],
  'ଅଜା': [62200],
  'ବଡ଼ବାପା': [1111, 1007, 1714],
  'ଦାଦା': [15588],
  'ମାମୁଁ': [9971, 19212],
  'ପିଉସା': [52157, 964],
  'ମଉସା': [63293, 964],
  'ପିଉସୀ': [52157, 694],
  'ମାଉସୀ': [30261, 694],
  'ମାଇଁ': [10859, 2304],
  'ବଡ଼ମାଆ': [1111, 241, 1109],
  'ଖୁଡ଼ି': [3617, 4405],
  'ବଡ଼ ଶଳା': [1111, 649, 1624],
  'ଶଳା': [649, 1624],
  'ଭିଣେଇ': [5442, 53872],
  'ଭିଣୋଇ': [5442, 1754, 89],
  'ଦେଢ଼ଶୁର': [57, 10861, 22252],
  'ଦିଅର': [305, 4093],
  'ବଡ଼ ନଣନ୍ଦ': [1111, 80, 266, 5766],
  'ନଣନ୍ଦ': [80, 266, 5766],
  'ଭାଉଜ': [4569, 25547],
  'ଭାଇବୋହୁ': [3991

In [14]:
# # word_trl=[]
# span_encodings = {}
# for lang in lang_script_list:
#   if lang == 'ory_Orya':
#     span_encodings[lang] = {}
#     # sents = ambiguos_words
#     # sents=[]
#     # for word in ambiguos_words:
#     #     # insert into sents the description of all the keys / items of the pir[word][lang][keys]
#     #     sents.extend(pir[word][lang][key]['description'] +' is my '+ word for key in pir[word][lang].keys())
#     print("sents: ", sents)
#     for i in tqdm(range(0, len(sents), BATCH_SIZE)):
#         batch = sents[i : i + BATCH_SIZE]
#         print("Batch:", batch)  

#         # batch = ip_en_ind.preprocess_batch(words_ids[lang].keys().tolist(), src_lang=lang, tgt_lang=lang)
#         batch = ip_en_ind.preprocess_batch(batch, src_lang='eng_Latn', tgt_lang=lang)
#         print("Batch edited:", batch)
#         # # Tokenize the batch and generate input encodings
#         inputs = en_indic_tokenizer(
#             batch,
#             truncation=True,
#             padding="longest",
#             return_tensors="pt",
#             return_attention_mask=True,
#         ).to(DEVICE)

#         with torch.no_grad():
#             # generated_tokens = model.generate(
#             outputs = en_indic_model.generate(
#                 **inputs,
#                 use_cache=True,
#                 min_length=0,
#                 max_length=256,
#                 num_beams=5,
#                 num_return_sequences=1, # TODO temp
#                 output_scores=True,
#                 output_logits=True,
#                 return_dict_in_generate=True,

#             )
#             # print("Length of outputs.logits actual", len(outputs.logits))
#             # print("Shape of outputs.logits actual", outputs.logits[0].shape)

#             # print("Length of outputs.beam_indices actual", len(outputs.beam_indices))
#             # print("Shape of outputs.beam_indices actual", outputs.beam_indices.shape)
            
#             outputs.beam_indices = outputs.beam_indices.cpu()
#             outputs.logits = tuple(logits.cpu() for logits in outputs.logits)               
#         # Decode the generated tokens into text
#         generated_tokens = outputs.sequences
#         # print("len generated_tokens: ", (generated_tokens[0]).shape)
#         print("1st generated token: ", generated_tokens[0])
#         vector = generated_tokens.detach().cpu().tolist()
#         # print("length of outputs vectors: ", len(vector), len(vector[0]))
#         # print("vector of generated_tokens: ", vector)
#         print("1st vector: ", vector[0])
        


#         with en_indic_tokenizer.as_target_tokenizer():
#             decoded_op = en_indic_tokenizer.batch_decode(
#                 vector,
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=True,
#             )

#         print("1st decoded_op: ", decoded_op[0])
#         # Postprocess the translations, including entity replacement
#         word_trl = ip_en_ind.postprocess_batch(decoded_op, lang=lang)

#         print("translations: ", word_trl)
#         for word in word_trl:
#             word_index = word_trl.index(word)
#             if word_index < len(vector):
#                 span_encodings[lang][word] = vector[word_index]
#                 # keep the items between '2' and '2' from span_encodings[lang][word]
#                 start_idx = vector[word_index].index(2)
#                 end_idx = vector[word_index].index(2, start_idx+1)
#                 span_encodings[lang][word] = vector[word_index][start_idx+1:end_idx]
#             else:
#                 print(f"Index {word_index} out of range for vector of length {len(vector)}")


# span_encodings

In [15]:
# write span_encoding into a file named "span_relations_encodings.json"
import json
with open("span_relations_encodings.json", "a") as f:
    json.dump(span_encodings, f)

    

# Indic - En

In [16]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor


model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

ip = IndicProcessor(inference=True)


tokenization_indictrans.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [17]:
input_sentences = [
        "दादी माँ " ,
        "दादा जी। " ,
        "चाचा " ,
        "चाची ",
        "बहनोई ",
        "ननद ",
        "चचेरा भाई ",
        "बच्चा। ",
        "भतीजे ",
        "भतीजी ",
]

In [18]:


src_lang, tgt_lang = "hin_Deva", "eng_Latn"

batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenize the sentences and generate input encodings
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
)
print("inputs: ", inputs)

inputs:  {'input_ids': tensor([[    1,     8,     4, 29524, 12100,     2],
        [    8,     4, 15613,   630,     7,     2],
        [    1,     1,     8,     4, 34086,     2],
        [    1,     1,     8,     4, 60712,     2],
        [    1,     8,     4,  4637, 49430,     2],
        [    1,     8,     4,  3564,    78,     2],
        [    8,     4, 49642,  2676,  3077,     2],
        [    1,     8,     4, 13258,     7,     2],
        [    1,     8,     4, 39163,  1404,     2],
        [    1,     8,     4, 39163,   812,     2]]), 'attention_mask': tensor([[0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1]])}


# Indic-Indic

In [19]:
model_name = "ai4bharat/indictrans2-indic-indic-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

ip = IndicProcessor(inference=True)


tokenization_indictrans.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

In [22]:
ip

<IndicTransToolkit.processor.IndicProcessor at 0x7f2b208ab2b0>

In [23]:

input_sentences = [
   "ବଡ଼ ଶଳା",
            "ଶଳା",
            "ଭିଣେଇ"  ,
            "ଭିଣୋଇ" ,
            "ଦେଢ଼ଶୁର" ,
            "ଦିଅର" ]

In [24]:


src_lang, tgt_lang = "ory_Orya", "ory_Orya"

batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)
print("batch: ", batch)


# Tokenize the sentences and generate input encodings
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
)
print("inputs: ", inputs)
print("ids", inputs.input_ids)

# Generate translations using the model
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs,
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

# Decode the generated tokens into text
with tokenizer.as_target_tokenizer():
    generated_tokens = tokenizer.batch_decode(
        generated_tokens.detach().cpu().tolist(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Postprocess the translations, including entity replacement
translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)

for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")

batch:  ['ory_Orya ory_Orya बड़ शळा', 'ory_Orya ory_Orya शळा', 'ory_Orya ory_Orya भिणेइ', 'ory_Orya ory_Orya भिणोइ', 'ory_Orya ory_Orya देढ़शुर', 'ory_Orya ory_Orya दिअर']
inputs:  {'input_ids': tensor([[   48,    48,  1129,   666,  1642,     2],
        [    1,    48,    48,   666,  1642,     2],
        [    1,    48,    48,  5465, 53900,     2],
        [   48,    48,  5465,  1773,   103,     2],
        [   48,    48,    71, 10885, 22277,     2],
        [    1,    48,    48,   322,  4114,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1]])}
ids tensor([[   48,    48,  1129,   666,  1642,     2],
        [    1,    48,    48,   666,  1642,     2],
        [    1,    48,    48,  5465, 53900,     2],
        [   48,    48,  5465,  1773,   103,     2],
        [   48,    48,    71, 10885, 22277,     2],
        [    1,    48,    48,   322,  4

