In [3]:
# Expose only GPUs 1, 5 and 7 to this kernel; this runs before the cell that imports torch.
%env CUDA_VISIBLE_DEVICES= 1,5,7

env: CUDA_VISIBLE_DEVICES=1,5,7


In [4]:
import os

# Point HF_HOME at a custom cache directory. This cell runs before the one
# that imports `transformers`.
# NOTE(review): the path is an absolute, user-specific location.
os.environ.update({'HF_HOME': '/home/sofia/cache_custom'})

In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransToolkit import IndicProcessor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
from torch.nn.functional import softmax


# Number of inputs handed to the tokenizer per batch (raised from the earlier value of 4).
BATCH_SIZE = 8

# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Quantization mode forwarded to initialize_model_and_tokenizer; None loads full weights.
quantization = None

print(DEVICE)

cuda


In [6]:
import importlib
import possible_indic_relations as poss_indic_rel
# Reload so that edits made to the module since its first import are picked up
importlib.reload(poss_indic_rel)

# Relation table exported by the module; later cells index it as pir[word][lang].
pir= poss_indic_rel.possible_relations
pir  # bare expression in the middle of the cell: evaluated but not displayed

# Top-level keys of the table (the spelling "ambiguos" is kept because later cells use this name).
ambiguos_words = list(pir.keys())

In [7]:
def initialize_model_and_tokenizer(ckpt_dir, quantization=None):
    """Load a seq2seq checkpoint and its tokenizer, ready for inference.

    Args:
        ckpt_dir: Model id or path handed to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the weights with a
            ``BitsAndBytesConfig``; any other value (including ``None``)
            loads the model without quantization.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is in eval mode. When no
        quantization config is used, the model is moved to ``DEVICE`` and,
        if ``DEVICE`` is ``"cuda"``, converted to half precision.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # The double-quant / compute-dtype options exist only for 4-bit
        # loading; the former bnb_8bit_* keywords are not parameters of
        # BitsAndBytesConfig, so they are no longer passed.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Device placement and the fp16 cast are applied only on the
    # unquantized path, as in the original code.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model = model.half()

    model.eval()

    return tokenizer, model

In [8]:
# Checkpoint used for the en-indic model; the trailing comment names an alternative checkpoint.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # alternative: ai4bharat/indictrans2-en-indic-dist-200M
# Load tokenizer and model with the quantization mode chosen in the config cell.
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir,  quantization)

# IndicTransToolkit processor; its preprocess_batch is used below and, per the printed
# "Batch edited" output, prefixes every input with the source and target language tags.
ip_en_ind = IndicProcessor(inference=True)

In [9]:
# Language_Script tags processed by the span-encoding loop, in processing order.
lang_script_list = [
    "ory_Orya",  # Odia
    "pan_Guru",  # Punjabi
    "ben_Beng",  # Bengali
    "mal_Mlym",  # Malayalam
    "mar_Deva",  # Marathi
    "tam_Taml",  # Tamil
    "guj_Gujr",  # Gujarati
    "tel_Telu",  # Telugu
    "hin_Deva",  # Hindi
    "kan_Knda",  # Kannada
]

In [14]:
# English example sentences containing kinship terms.
# NOTE: the span-encoding cell further down rebinds `sents` to per-language term lists.
sents = [
    "Along with him were his brother and a cousin.",
    "She has admitted to murdering her husband with the help of her brother and his friend, claimed police.",
    "She is the sister of his first wife, and the aunt of his daughter by his first wife.",
    "I've become a grandmother, she says, but I have a mother's responsibilities too.",
]

In [None]:
# sents =['ଆଈ', 'ଦେଢ଼ଶୁର', 'ଦିଅର', 'ଜେଜେବାପା', 'ଜେଜେମା']

In [39]:
# Build, for every language, a mapping  term -> list of sub-word ids  of that
# term as produced by the tokenizer in target mode.

# Target tag passed to the preprocessor for every source language.
TGT_LANG = 'hin_Deva'
# Id of the last sub-token of a language tag ending in "_Deva" (visible in the
# printed id lists: both "hin_Deva" and "mar_Deva" end in this id).
TAG_END_TOKEN_ID = 43144
# Id that closes each encoded sequence (presumably EOS — TODO confirm against the tokenizer).
SEQ_END_TOKEN_ID = 2

span_encodings = {}
for lang in lang_script_list:
    span_encodings[lang] = {}

    # Every key of pir[word][lang], over all words, is a term to encode.
    sents = []
    for word in ambiguos_words:
        sents.extend(pir[word][lang].keys())
    print("sents: ", sents)

    for i in tqdm(range(0, len(sents), BATCH_SIZE)):
        terms = sents[i : i + BATCH_SIZE]
        print("Batch:", terms)

        # Pass a copy so the raw terms used as dictionary keys below stay
        # intact whatever the preprocessor does with its argument.
        batch = ip_en_ind.preprocess_batch(list(terms), src_lang=lang, tgt_lang=TGT_LANG)
        print("Batch edited:", batch)

        # Tokenize the tagged batch with the tokenizer switched to target mode.
        with en_indic_tokenizer.as_target_tokenizer():
            inputs = en_indic_tokenizer(
                batch,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            )

        for sent, ids in zip(terms, inputs.input_ids):
            ids = ids.tolist()
            # The term's ids sit between the end of the language tags and the
            # sequence-end id.
            end_idx = ids.index(SEQ_END_TOKEN_ID)
            start_idx = ids.index(TAG_END_TOKEN_ID)
            # A source tag that itself ends in "Deva" contributes one more
            # marker, so the tag prefix ends at the second occurrence.
            if 'Deva' in lang:
                start_idx = ids.index(TAG_END_TOKEN_ID, start_idx + 1)
            print(sent, ids[:end_idx])
            span_encodings[lang][sent] = ids[start_idx + 1 : end_idx]

span_encodings



sents:  ['ଜେଜେମା', 'ଆଈ', 'ଜେଜେବାପା', 'ଅଜା', 'ବଡ଼ବାପା', 'ଦାଦା', 'ମାମୁଁ', 'ପିଉସା', 'ମଉସା', 'ପିଉସୀ', 'ମାଉସୀ', 'ମାଇଁ', 'ବଡ଼ମାଆ', 'ଖୁଡ଼ି', 'ବଡ଼ ଶଳା', 'ଶଳା', 'ଭିଣେଇ', 'ଭିଣୋଇ', 'ଦେଢ଼ଶୁର', 'ଦିଅର', 'ବଡ଼ ନଣନ୍ଦ', 'ନଣନ୍ଦ', 'ଭାଉଜ', 'ଭାଇବୋହୁ', 'ଦେଢ଼ଶାସୁ', 'ଶାଳୀ', 'ଭାଇ', 'ଦିଦି', 'ପୁଅ', 'ଝିଅ', 'ପିଲା', 'ପୁତୁରା', 'ଭଣଜା', 'ଝିଆରୀ', 'ଭାଣିଜୀ']


100%|██████████| 5/5 [00:00<00:00, 505.30it/s]


Batch: ['ଜେଜେମା', 'ଆଈ', 'ଜେଜେବାପା', 'ଅଜା', 'ବଡ଼ବାପା', 'ଦାଦା', 'ମାମୁଁ', 'ପିଉସା']
Batch edited: ['ory_Orya hin_Deva जेजेमा', 'ory_Orya hin_Deva आई', 'ory_Orya hin_Deva जेजेबापा', 'ory_Orya hin_Deva अजा', 'ory_Orya hin_Deva बड़बापा', 'ory_Orya hin_Deva दादा', 'ory_Orya hin_Deva मामुँ', 'ory_Orya hin_Deva पिउसा']
ଜେଜେମା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 41445, 241]
ଆଈ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 740]
ଜେଜେବାପା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 41445, 1007, 1714]
ଅଜା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 62200]
ବଡ଼ବାପା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 1111, 1007, 1714]
ଦାଦା [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 15588]
ମାମୁଁ [22347, 4621, 59836, 4429, 17301, 5689, 105948, 59836, 2134, 5172, 43144, 9971, 19212]
ପିଉସା [22347, 4621, 59836, 4429, 17301, 5689, 10

100%|██████████| 4/4 [00:00<00:00, 571.24it/s]


Batch: ['ਦਾਦੀ', 'ਨਾਨੀ', 'ਦਾਦਾ', 'ਨਾਨਾ', 'ਤਾਇਆ', 'ਚਾਚਾ', 'ਮਾਮਾ', 'ਫੁੱਫੜ']
Batch edited: ['pan_Guru hin_Deva दादी', 'pan_Guru hin_Deva नानी', 'pan_Guru hin_Deva दादा', 'pan_Guru hin_Deva नाना', 'pan_Guru hin_Deva ताइआ', 'pan_Guru hin_Deva चाचा', 'pan_Guru hin_Deva मामा', 'pan_Guru hin_Deva फुੱफड़']
ਦਾਦੀ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 29498]
ਨਾਨੀ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 8911]
ਦਾਦਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 15588]
ਨਾਨਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 2820]
ਤਾਇਆ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 302, 6438]
ਚਾਚਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 34059]
ਮਾਮਾ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 24501]
ਫੁੱਫੜ [73756, 59836, 2860, 74873, 105948, 59836, 2134, 5172, 43144, 7355, 19, 495, 1643]
Batch: ['ਮਾਸੜ', 'ਭੂਆ', 'ਮਾਸੀ', 'ਮਾਮੀ', 'ਤਾਈ', 'ਸਾਲਾ', 'ਜੀਜਾ', 'ਜੇਠ']
Batch edited: ['pan_Guru 

100%|██████████| 6/6 [00:00<00:00, 596.46it/s]


Batch: ['ঠাকুরমা', 'দিদি মা', 'ঠাকুরদা', 'দাদু', 'জেঠা মশাই', 'কাকু', 'মামা', 'পিশে মশাই']
Batch edited: ['ben_Beng hin_Deva ठाकुरमा', 'ben_Beng hin_Deva दिदि मा', 'ben_Beng hin_Deva ठाकुरदा', 'ben_Beng hin_Deva दादु', 'ben_Beng hin_Deva जेठा मशाइ', 'ben_Beng hin_Deva काकु', 'ben_Beng hin_Deva मामा', 'ben_Beng hin_Deva पिशे मशाइ']
ঠাকুরমা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 9402, 241]
দিদি মা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 48446, 354]
ঠাকুরদা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 9402, 450]
দাদু [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 409, 8599]
জেঠা মশাই [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 169, 9965, 15575, 635]
কাকু [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 63220]
মামা [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 24501]
পিশে মশাই [72274, 59836, 2422, 48275, 105948, 59836, 2134, 5172, 43144, 449, 1272, 15575, 635]
Batch: 

100%|██████████| 4/4 [00:00<00:00, 487.23it/s]


Batch: ['അമ്മൂമ്മ', 'അപ്പൂപ്പൻ', 'അമ്മാവൻ', 'മൂത്ത അച്ഛൻ', 'ചിറ്റപ്പൻ', 'മൂത്തമാമൻ', 'ഇളയമാമൻ', 'മാമൻ']
Batch edited: ['mal_Mlym hin_Deva अम्मूम्म', 'mal_Mlym hin_Deva अप्पूप्पൻ', 'mal_Mlym hin_Deva अम्मावൻ', 'mal_Mlym hin_Deva मूत्त अच्छൻ', 'mal_Mlym hin_Deva चिऱ्ऱप्पൻ', 'mal_Mlym hin_Deva मूत्तमामൻ', 'mal_Mlym hin_Deva इळयमामൻ', 'mal_Mlym hin_Deva मामൻ']
അമ്മൂമ്മ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 208, 1476]
അപ്പൂപ്പൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 2763, 208, 378, 28]
അമ്മാവൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1774, 529, 28]
മൂത്ത അച്ഛൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 16674, 7673, 28]
ചിറ്റപ്പൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 60069, 378, 28]
മൂത്തമാമൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172, 43144, 1398, 11549, 462, 28]
ഇളയമാമൻ [66990, 59836, 3082, 19327, 4839, 105948, 59836, 2134, 5172

100%|██████████| 5/5 [00:00<00:00, 565.61it/s]


Batch: ['आजी', 'मावस आजी', 'आजोबा', 'चुलत आजोबा', 'काका', 'मामा', 'मावसा', 'आत्या']
Batch edited: ['mar_Deva hin_Deva आजी', 'mar_Deva hin_Deva मावस आजी', 'mar_Deva hin_Deva आजोबा', 'mar_Deva hin_Deva चुलत आजोबा', 'mar_Deva hin_Deva काका', 'mar_Deva hin_Deva मामा', 'mar_Deva hin_Deva मावसा', 'mar_Deva hin_Deva आत्या']
आजी [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 32967]
मावस आजी [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 11701, 115, 32967]
आजोबा [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 56799]
चुलत आजोबा [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 6702, 119, 56799]
काका [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 41565]
मामा [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 24501]
मावसा [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 11701, 964]
आत्या [60726, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 2618, 53

100%|██████████| 5/5 [00:00<00:00, 336.77it/s]


Batch: ['அப்பத்தா', 'அம்மத்தா', 'சின்ன பாட்டி', 'பெரிய பாட்டி', 'பாட்டி', 'தாத்தா', 'சின்ன தாத்தா', 'பெரிய தாத்தா']
Batch edited: ['tam_Taml hin_Deva अप्पत्ता', 'tam_Taml hin_Deva अम्मत्ता', 'tam_Taml hin_Deva चिऩ्ऩ पाट्टि', 'tam_Taml hin_Deva पॆरिय पाट्टि', 'tam_Taml hin_Deva पाट्टि', 'tam_Taml hin_Deva तात्ता', 'tam_Taml hin_Deva चिऩ्ऩ तात्ता', 'tam_Taml hin_Deva पॆरिय तात्ता']
அப்பத்தா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2763, 2154]
அம்மத்தா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 1774, 2154]
சின்ன பாட்டி [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 13735, 511, 956]
பெரிய பாட்டி [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 2765, 511, 956]
பாட்டி [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 511, 956]
தாத்தா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 5172, 43144, 14552, 3286]
சின்ன தாத்தா [82829, 59836, 2526, 13607, 5573, 105948, 59836, 2134, 517

100%|██████████| 4/4 [00:00<00:00, 505.96it/s]


Batch: ['દાદી', 'નાની', 'દાદા', 'નાના', 'કાકા', 'મામા', 'ફોઈ', 'માસી']
Batch edited: ['guj_Gujr hin_Deva दादी', 'guj_Gujr hin_Deva नानी', 'guj_Gujr hin_Deva दादा', 'guj_Gujr hin_Deva नाना', 'guj_Gujr hin_Deva काका', 'guj_Gujr hin_Deva मामा', 'guj_Gujr hin_Deva फोई', 'guj_Gujr hin_Deva मासी']
દાદી [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 29498]
નાની [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 8911]
દાદા [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 15588]
નાના [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 2820]
કાકા [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 41565]
મામા [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 24501]
ફોઈ [57020, 18693, 59836, 2860, 7218, 18693, 7881, 105948, 59836, 2134, 5172, 43144, 1529, 1408]
માસી [57020, 18693, 59836, 2860, 7218, 18693, 7881, 10

100%|██████████| 5/5 [00:00<00:00, 580.14it/s]


Batch: ['నాన్నమ్మ', 'అమ్మమ', 'నాన్నమ', 'చిన్న నాన్నమ', 'పెద్ద  నాన్నమ', 'చిన్న అమ్మమ', 'పెద్ద అమ్మమ', 'తాతయ్యగారు']
Batch edited: ['tel_Telu hin_Deva नान्नम्म', 'tel_Telu hin_Deva अम्मम', 'tel_Telu hin_Deva नान्नम', 'tel_Telu hin_Deva चिन्न नान्नम', 'tel_Telu hin_Deva पॆद्द नान्नम', 'tel_Telu hin_Deva चिन्न अम्मम', 'tel_Telu hin_Deva पॆद्द अम्मम', 'tel_Telu hin_Deva तातय्यगारु']
నాన్నమ్మ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539, 1476]
అమ్మమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 1774, 143]
నాన్నమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 30539, 143]
చిన్న నాన్నమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2195, 30539, 143]
పెద్ద  నాన్నమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2224, 30539, 143]
చిన్న అమ్మమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 5172, 43144, 2195, 1774, 143]
పెద్ద అమ్మమ [76563, 59836, 2526, 9463, 7218, 105948, 59836, 2134, 

100%|██████████| 5/5 [00:00<00:00, 601.64it/s]


Batch: ['दादी', 'नानी', 'पितामही', 'दादा', 'नाना', 'पितामह', 'ताऊ', 'चाचा']
Batch edited: ['hin_Deva hin_Deva दादी', 'hin_Deva hin_Deva नानी', 'hin_Deva hin_Deva पितामही', 'hin_Deva hin_Deva दादा', 'hin_Deva hin_Deva नाना', 'hin_Deva hin_Deva पितामह', 'hin_Deva hin_Deva ताऊ', 'hin_Deva hin_Deva चाचा']
दादी [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 29498]
नानी [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 8911]
पितामही [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 30502, 412]
दादा [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 15588]
नाना [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 2820]
पितामह [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 53208]
ताऊ [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 261, 5200]
चाचा [105948, 59836, 2134, 5172, 43144, 105948, 59836, 2134, 5172, 43144, 34059]
Batch: ['मामा', 'फूफा', 'मौसा', 'बुआ',

100%|██████████| 4/4 [00:00<00:00, 541.34it/s]

Batch: ['ಅಜ್ಜಿ', 'ಅಜ್ಜ', 'ದೊಡಪ್ಪ', 'ಚಿಕ್ಕಪ್ಪ', 'ಮಾಮ', 'ಮಾವ', 'ದೊಡ್ಡಪ್ಪ', 'ಅತ್ತೆ']
Batch edited: ['kan_Knda hin_Deva अज्जि', 'kan_Knda hin_Deva अज्ज', 'kan_Knda hin_Deva दॊडप्प', 'kan_Knda hin_Deva चिक्कप्प', 'kan_Knda hin_Deva माम', 'kan_Knda hin_Deva माव', 'kan_Knda hin_Deva दॊड्डप्प', 'kan_Knda hin_Deva अत्तॆ']
ಅಜ್ಜಿ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 4565, 35330]
ಅಜ್ಜ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 44928]
ದೊಡಪ್ಪ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 39398, 123, 378]
ಚಿಕ್ಕಪ್ಪ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 2950, 378]
ಮಾಮ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 9971]
ಮಾವ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 11701]
ದೊಡ್ಡಪ್ಪ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 1945, 378]
ಅತ್ತೆ [84689, 59836, 5727, 40769, 5689, 105948, 59836, 2134, 5172, 43144, 213, 823]
Batch: ['




{'ory_Orya': {'ଜେଜେମା': [41445, 241],
  'ଆଈ': [740],
  'ଜେଜେବାପା': [41445, 1007, 1714],
  'ଅଜା': [62200],
  'ବଡ଼ବାପା': [1111, 1007, 1714],
  'ଦାଦା': [15588],
  'ମାମୁଁ': [9971, 19212],
  'ପିଉସା': [52157, 964],
  'ମଉସା': [63293, 964],
  'ପିଉସୀ': [52157, 694],
  'ମାଉସୀ': [30261, 694],
  'ମାଇଁ': [10859, 2304],
  'ବଡ଼ମାଆ': [1111, 241, 1109],
  'ଖୁଡ଼ି': [3617, 4405],
  'ବଡ଼ ଶଳା': [1111, 649, 1624],
  'ଶଳା': [649, 1624],
  'ଭିଣେଇ': [5442, 53872],
  'ଭିଣୋଇ': [5442, 1754, 89],
  'ଦେଢ଼ଶୁର': [57, 10861, 22252],
  'ଦିଅର': [305, 4093],
  'ବଡ଼ ନଣନ୍ଦ': [1111, 80, 266, 5766],
  'ନଣନ୍ଦ': [80, 266, 5766],
  'ଭାଉଜ': [4569, 25547],
  'ଭାଇବୋହୁ': [3991, 1137, 7131],
  'ଦେଢ଼ଶାସୁ': [57, 10861, 325, 14699],
  'ଶାଳୀ': [3175, 2651],
  'ଭାଇ': [3991],
  'ଦିଦି': [48446],
  'ପୁଅ': [23401],
  'ଝିଅ': [20076],
  'ପିଲା': [23000],
  'ପୁତୁରା': [4300, 5686],
  'ଭଣଜା': [30360, 2935],
  'ଝିଆରୀ': [2398, 80993],
  'ଭାଣିଜୀ': [980, 9742, 795]},
 'pan_Guru': {'ਦਾਦੀ': [29498],
  'ਨਾਨੀ': [8911],
  'ਦਾਦਾ': [15588],
  'ਨਾਨਾ': [2820],

In [43]:
# Imported under the alias `se`, so the notebook variable `span_encodings` is not rebound.
import span_encodings as se
import importlib
# Reload so that edits made to the module since its first import are picked up.
importlib.reload(se)

# Mapping exposed by the module; the output below shows it keyed by language tag.
fetched_span_encodings = se.span_encodings

fetched_span_encodings


{'ory_Orya': {"ଜେଜେମା'": [41445, 241, 30],
  'ଜେଜେବାପା ': [41445, 1007, 1714],
  'ମାମୁଁ। ': [9971, 19212, 6],
  'ମାଉସୀ। ': [30261, 694, 6],
  'ଶ୍ୱଶୁର-ଶ୍ୱଶୁର ': [21405, 699, 22252, 13, 21405, 699, 22252],
  'ଶ୍ୱଶୁର। ': [21405, 699, 22252, 6],
  'ସମ୍ପର୍କୀଯ଼ ଭାଇ। ': [60824, 3991, 6],
  'ଶିଶୁ ': [3442],
  'ପୁତୁରା ': [4300, 5686],
  'ଭାଣିଜୀ ': [980, 9742, 795],
  'None': [2]},
 'pan_Guru': {'ਦਾਦੀ ਮਾਂ ': [29498, 640],
  'ਦਾਦਾ ਜੀ ': [15588, 613],
  'ਚਾਚਾ ': [34059],
  'ਮਾਸੀ ਜੀ। ': [65770, 613, 6],
  'ਭਰਾ-ਸੱਸ ': [8327, 13, 116, 19, 115],
  'ਭਰਜਾਈ ': [1144, 40842],
  'ਚਚੇਰਾ ਭਰਾ ': [49615, 2656, 8327],
  'ਬੱਚਾ ': [336, 19, 317],
  'ਭਤੀਜੇ ': [39136, 1386],
  'ਭਤੀਜੀ ': [39136, 795],
  'None': [2]},
 'ben_Beng': {'দিদিমা ': [48446, 241],
  'দাদা। ': [15588, 6],
  'চাচা ': [34059],
  'আন্টি। ': [5745, 102, 6],
  'শ্যালক ': [649, 4692, 75],
  'শ্যালিকা ': [649, 4692, 1525],
  'চাচাত ভাই। ': [35407, 359, 3991, 6],
  'শিশু। ': [3442, 6],
  'ভাগ্নে ': [291, 27065],
  'ভাগ্নি ': [291, 17389],
  'None': [

In [44]:
# Merge the freshly computed span_encodings into fetched_span_encodings, in place.
# Iterating over span_encodings (instead of over the fetched keys) means a language
# present on only one side neither raises KeyError nor gets silently skipped.
# NOTE: only the in-memory dict is updated here; nothing is written to a file.
for lang, encodings in span_encodings.items():
    fetched_span_encodings.setdefault(lang, {}).update(encodings)


In [45]:
# Display the mapping after the in-place merge performed in the previous cell.
fetched_span_encodings

{'ory_Orya': {"ଜେଜେମା'": [41445, 241, 30],
  'ଜେଜେବାପା ': [41445, 1007, 1714],
  'ମାମୁଁ। ': [9971, 19212, 6],
  'ମାଉସୀ। ': [30261, 694, 6],
  'ଶ୍ୱଶୁର-ଶ୍ୱଶୁର ': [21405, 699, 22252, 13, 21405, 699, 22252],
  'ଶ୍ୱଶୁର। ': [21405, 699, 22252, 6],
  'ସମ୍ପର୍କୀଯ଼ ଭାଇ। ': [60824, 3991, 6],
  'ଶିଶୁ ': [3442],
  'ପୁତୁରା ': [4300, 5686],
  'ଭାଣିଜୀ ': [980, 9742, 795],
  'None': [2],
  'ଜେଜେମା': [41445, 241],
  'ଆଈ': [740],
  'ଜେଜେବାପା': [41445, 1007, 1714],
  'ଅଜା': [62200],
  'ବଡ଼ବାପା': [1111, 1007, 1714],
  'ଦାଦା': [15588],
  'ମାମୁଁ': [9971, 19212],
  'ପିଉସା': [52157, 964],
  'ମଉସା': [63293, 964],
  'ପିଉସୀ': [52157, 694],
  'ମାଉସୀ': [30261, 694],
  'ମାଇଁ': [10859, 2304],
  'ବଡ଼ମାଆ': [1111, 241, 1109],
  'ଖୁଡ଼ି': [3617, 4405],
  'ବଡ଼ ଶଳା': [1111, 649, 1624],
  'ଶଳା': [649, 1624],
  'ଭିଣେଇ': [5442, 53872],
  'ଭିଣୋଇ': [5442, 1754, 89],
  'ଦେଢ଼ଶୁର': [57, 10861, 22252],
  'ଦିଅର': [305, 4093],
  'ବଡ଼ ନଣନ୍ଦ': [1111, 80, 266, 5766],
  'ନଣନ୍ଦ': [80, 266, 5766],
  'ଭାଉଜ': [4569, 25547],
  'ଭାଇବୋହୁ': [3991

In [None]:
# # word_trl=[]
# span_encodings = {}
# for lang in lang_script_list:
#   if lang == 'ory_Orya':
#     span_encodings[lang] = {}
#     # sents = ambiguos_words
#     # sents=[]
#     # for word in ambiguos_words:
#     #     # insert into sents the description of all the keys / items of the pir[word][lang][keys]
#     #     sents.extend(pir[word][lang][key]['description'] +' is my '+ word for key in pir[word][lang].keys())
#     print("sents: ", sents)
#     for i in tqdm(range(0, len(sents), BATCH_SIZE)):
#         batch = sents[i : i + BATCH_SIZE]
#         print("Batch:", batch)  

#         # batch = ip_en_ind.preprocess_batch(words_ids[lang].keys().tolist(), src_lang=lang, tgt_lang=lang)
#         batch = ip_en_ind.preprocess_batch(batch, src_lang='eng_Latn', tgt_lang=lang)
#         print("Batch edited:", batch)
#         # # Tokenize the batch and generate input encodings
#         inputs = en_indic_tokenizer(
#             batch,
#             truncation=True,
#             padding="longest",
#             return_tensors="pt",
#             return_attention_mask=True,
#         ).to(DEVICE)

#         with torch.no_grad():
#             # generated_tokens = model.generate(
#             outputs = en_indic_model.generate(
#                 **inputs,
#                 use_cache=True,
#                 min_length=0,
#                 max_length=256,
#                 num_beams=5,
#                 num_return_sequences=1, # TODO temp
#                 output_scores=True,
#                 output_logits=True,
#                 return_dict_in_generate=True,

#             )
#             # print("Length of outputs.logits actual", len(outputs.logits))
#             # print("Shape of outputs.logits actual", outputs.logits[0].shape)

#             # print("Length of outputs.beam_indices actual", len(outputs.beam_indices))
#             # print("Shape of outputs.beam_indices actual", outputs.beam_indices.shape)
            
#             outputs.beam_indices = outputs.beam_indices.cpu()
#             outputs.logits = tuple(logits.cpu() for logits in outputs.logits)               
#         # Decode the generated tokens into text
#         generated_tokens = outputs.sequences
#         # print("len generated_tokens: ", (generated_tokens[0]).shape)
#         print("1st generated token: ", generated_tokens[0])
#         vector = generated_tokens.detach().cpu().tolist()
#         # print("length of outputs vectors: ", len(vector), len(vector[0]))
#         # print("vector of generated_tokens: ", vector)
#         print("1st vector: ", vector[0])
        


#         with en_indic_tokenizer.as_target_tokenizer():
#             decoded_op = en_indic_tokenizer.batch_decode(
#                 vector,
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=True,
#             )

#         print("1st decoded_op: ", decoded_op[0])
#         # Postprocess the translations, including entity replacement
#         word_trl = ip_en_ind.postprocess_batch(decoded_op, lang=lang)

#         print("translations: ", word_trl)
#         for word in word_trl:
#             word_index = word_trl.index(word)
#             if word_index < len(vector):
#                 span_encodings[lang][word] = vector[word_index]
#                 # keep the items between '2' and '2' from span_encodings[lang][word]
#                 start_idx = vector[word_index].index(2)
#                 end_idx = vector[word_index].index(2, start_idx+1)
#                 span_encodings[lang][word] = vector[word_index][start_idx+1:end_idx]
#             else:
#                 print(f"Index {word_index} out of range for vector of length {len(vector)}")


# span_encodings

sents:  ['Along with him were his brother and a cousin.', 'She has admitted to murdering her husband with the help of her brother and his friend, claimed police.', 'She is the sister of his first wife, and the aunt of his daughter by his first wife.', "I've become a grandmother, she says, but I have a mother's responsibilities too."]


  0%|          | 0/1 [00:00<?, ?it/s]

Batch: ['Along with him were his brother and a cousin.', 'She has admitted to murdering her husband with the help of her brother and his friend, claimed police.', 'She is the sister of his first wife, and the aunt of his daughter by his first wife.', "I've become a grandmother, she says, but I have a mother's responsibilities too."]
Batch edited: ['eng_Latn ory_Orya Along with him were his brother and a cousin .', 'eng_Latn ory_Orya She has admitted to murdering her husband with the help of her brother and his friend , claimed police .', 'eng_Latn ory_Orya She is the sister of his first wife , and the aunt of his daughter by his first wife .', "eng_Latn ory_Orya I 've become a grandmother , she says , but I have a mother 's responsibilities too ."]


100%|██████████| 1/1 [00:00<00:00,  1.11it/s]

1st generated token:  tensor([    2,  2313,  1201,  2313,  3991,    60,  4725, 60824,  6442,     6,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1], device='cuda:0')
1st vector:  [2, 2313, 1201, 2313, 3991, 60, 4725, 60824, 6442, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
1st decoded_op:  ताङ्क सहित ताङ्क भाइ एबं जणे सम्पर्कीय़ थिले । 
translations:  ['ତାଙ୍କ ସହିତ ତାଙ୍କ ଭାଇ ଏବଂ ଜଣେ ସମ୍ପର୍କୀଯ଼ ଥିଲେ। ', 'ସେ ତାଙ୍କ ଭାଇ ଏବଂ ତାଙ୍କ ବନ୍ଧୁଙ୍କ ସହାଯ଼ତାରେ ତାଙ୍କ ସ୍ୱାମୀଙ୍କୁ ହତ୍ଯ଼ା କରିଥିବା ସ୍ୱୀକାର କରିଛନ୍ତି ବୋଲି ପୋଲିସ ଦାବି କରିଛି। ', 'ସେ ତାଙ୍କ ପ୍ରଥମ ପତ୍ନୀଙ୍କ ଭଉଣୀ, ଏବଂ ତାଙ୍କ ପ୍ରଥମ ପତ୍ନୀଙ୍କ ଦ୍ୱାରା ତାଙ୍କ ଝିଅର ମାଉସୀ | ', 'ସେ କୁହନ୍ତି, ମୁଁ ଜଣେ ଜେଜେମା "ହୋଇଛି, କିନ୍ତୁ ମୋର ମଧ୍ଯ଼ ଜଣେ ମା" ର ଦାଯ଼ିତ୍ୱ ରହିଛି। ']





{'ory_Orya': {'ତାଙ୍କ ସହିତ ତାଙ୍କ ଭାଇ ଏବଂ ଜଣେ ସମ୍ପର୍କୀଯ଼ ଥିଲେ। ': [2313,
   1201,
   2313,
   3991,
   60,
   4725,
   60824,
   6442,
   6],
  'ସେ ତାଙ୍କ ଭାଇ ଏବଂ ତାଙ୍କ ବନ୍ଧୁଙ୍କ ସହାଯ଼ତାରେ ତାଙ୍କ ସ୍ୱାମୀଙ୍କୁ ହତ୍ଯ଼ା କରିଥିବା ସ୍ୱୀକାର କରିଛନ୍ତି ବୋଲି ପୋଲିସ ଦାବି କରିଛି। ': [20,
   2313,
   3991,
   60,
   2313,
   4247,
   435,
   67152,
   2313,
   2071,
   699,
   4751,
   1698,
   27179,
   21666,
   2071,
   699,
   46074,
   4329,
   3882,
   2904,
   5226,
   7227,
   6],
  'ସେ ତାଙ୍କ ପ୍ରଥମ ପତ୍ନୀଙ୍କ ଭଉଣୀ, ଏବଂ ତାଙ୍କ ପ୍ରଥମ ପତ୍ନୀଙ୍କ ଦ୍ୱାରା ତାଙ୍କ ଝିଅର ମାଉସୀ | ': [20,
   2313,
   398,
   1760,
   435,
   38982,
   5,
   60,
   2313,
   398,
   1760,
   435,
   5399,
   699,
   663,
   2313,
   27993,
   4093,
   30261,
   694,
   9182],
  'ସେ କୁହନ୍ତି, ମୁଁ ଜଣେ ଜେଜେମା "ହୋଇଛି, କିନ୍ତୁ ମୋର ମଧ୍ଯ଼ ଜଣେ ମା" ର ଦାଯ଼ିତ୍ୱ ରହିଛି। ': [20,
   54036,
   5,
   1149,
   4725,
   41445,
   241,
   7,
   5034,
   5,
   91,
   4433,
   1230,
   4725,
   354,
   7,
   118,
   33406,
   699,
   2784,
   6]}}

In [30]:
# Write span_encodings to "span_relations_encodings.json".
import json
# Opened with "w" rather than "a": appending a second JSON document to an
# existing file would leave content that json.load cannot parse.
with open("span_relations_encodings.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Indic-script keys readable in the file;
    # the data loaded back is the same.
    json.dump(span_encodings, f, ensure_ascii=False)

    

# Indic - En

In [None]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor


# Checkpoint for the "Indic - En" section of the notebook.
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# NOTE(review): unlike initialize_model_and_tokenizer, this load does not call
# .to(DEVICE) or .eval() — confirm that this is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# Processor used below to preprocess the input sentences.
ip = IndicProcessor(inference=True)


In [None]:
# Hindi kinship terms to tokenize; each string keeps its trailing space.
input_sentences = [
    "दादी माँ ",
    "दादा जी। ",
    "चाचा ",
    "चाची ",
    "बहनोई ",
    "ननद ",
    "चचेरा भाई ",
    "बच्चा। ",
    "भतीजे ",
    "भतीजी ",
]

In [None]:


# Language tags for this run.
src_lang = "hin_Deva"
tgt_lang = "eng_Latn"

# Run the processor over the raw strings before tokenization.
batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)

# Encode the preprocessed batch as PyTorch tensors, padded to the longest
# entry, and include the attention mask.
inputs = tokenizer(
    batch, truncation=True, padding="longest", return_tensors="pt", return_attention_mask=True
)
print("inputs: ", inputs)

inputs:  {'input_ids': tensor([[    1,     8,     4, 29524, 12100,     2],
        [    8,     4, 15613,   630,     7,     2],
        [    1,     1,     8,     4, 34086,     2],
        [    1,     1,     8,     4, 60712,     2],
        [    1,     8,     4,  4637, 49430,     2],
        [    1,     8,     4,  3564,    78,     2],
        [    8,     4, 49642,  2676,  3077,     2],
        [    1,     8,     4, 13258,     7,     2],
        [    1,     8,     4, 39163,  1404,     2],
        [    1,     8,     4, 39163,   812,     2]]), 'attention_mask': tensor([[0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1]])}


# Indic-Indic

In [None]:
# Checkpoint for the "Indic-Indic" section of the notebook.
# NOTE: this cell rebinds `model_name`, `tokenizer`, `model` and `ip`, replacing
# the objects created in the "Indic - En" section.
model_name = "ai4bharat/indictrans2-indic-indic-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# NOTE(review): no .to(DEVICE) / .eval() call follows this load — confirm that this is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# Processor used below for preprocess_batch / postprocess_batch.
ip = IndicProcessor(inference=True)


In [24]:

# Odia kinship terms fed to the Indic-Indic model below.
input_sentences = [
    "ବଡ଼ ଶଳା",
    "ଶଳା",
    "ଭିଣେଇ",
    "ଭିଣୋଇ",
    "ଦେଢ଼ଶୁର",
    "ଦିଅର",
]

In [27]:


# Source and target tags are identical (Odia) for this run.
src_lang = "ory_Orya"
tgt_lang = "ory_Orya"

# Run the processor over the raw strings and show the result.
batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
print("batch: ", batch)

# Encode the preprocessed batch as padded PyTorch tensors with an attention mask.
inputs = tokenizer(
    batch, truncation=True, padding="longest", return_tensors="pt", return_attention_mask=True
)
print("inputs: ", inputs)
print("ids", inputs.input_ids)

# Beam search with 5 beams, returning one sequence per input; gradients are not tracked.
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs, use_cache=True, min_length=0, max_length=256, num_beams=5, num_return_sequences=1
    )

# Turn the generated ids back into text with the tokenizer in target mode.
# `generated_tokens` is rebound from a tensor of ids to the decoded strings.
with tokenizer.as_target_tokenizer():
    generated_tokens = tokenizer.batch_decode(
        generated_tokens.detach().cpu().tolist(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Let the processor post-process the decoded strings for the target language.
translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)

# Print every input next to its output.
for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")

batch:  ['ory_Orya ory_Orya बड़ शळा', 'ory_Orya ory_Orya शळा', 'ory_Orya ory_Orya भिणेइ', 'ory_Orya ory_Orya भिणोइ', 'ory_Orya ory_Orya देढ़शुर', 'ory_Orya ory_Orya दिअर']
inputs:  {'input_ids': tensor([[   48,    48,  1129,   666,  1642,     2],
        [    1,    48,    48,   666,  1642,     2],
        [    1,    48,    48,  5465, 53900,     2],
        [   48,    48,  5465,  1773,   103,     2],
        [   48,    48,    71, 10885, 22277,     2],
        [    1,    48,    48,   322,  4114,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1]])}
ids tensor([[   48,    48,  1129,   666,  1642,     2],
        [    1,    48,    48,   666,  1642,     2],
        [    1,    48,    48,  5465, 53900,     2],
        [   48,    48,  5465,  1773,   103,     2],
        [   48,    48,    71, 10885, 22277,     2],
        [    1,    48,    48,   322,  4

