In [9]:
lang_script_list = [
                     'pan_Guru', 'ben_Beng', 
                       'mal_Mlym',
                           'mar_Deva', 
                           'tam_Taml', 'guj_Gujr', 
                           'tel_Telu', 'hin_Deva', 
                           'kan_Knda', 
                           'ory_Orya'
                           ]



# Translate using models


In [1]:

import os
import pandas as pd

test_csv_folder = 'custom_test_csv'

In [2]:
lang_code_map = {
    'eng_Latn': 'en',
    'hin_Deva': 'hi',
    'guj_Gujr': 'gu',
    'kan_Knda': 'kn',
    'mal_Mlym': 'ml',
    'mar_Deva': 'mr',
    'tam_Taml': 'ta',
    'tel_Telu': 'te',
    'pan_Guru': 'pa',
    'ben_Beng': 'bn',
    'ory_Orya': 'or'
}


## IndicTrans2

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
# from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer
from IndicTransToolkit import IndicProcessor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

BATCH_SIZE = 16 # edited from 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None
print(DEVICE)

cuda


In [None]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []


    from tqdm import tqdm
    for i in tqdm(range(0, len(input_sentences), BATCH_SIZE)):

    # for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1, # TODO temp
            )

        # Decode the generated tokens into text

        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [5]:
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # ai4bharat/indictrans2-en-indic-dist-200M
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir,  quantization)

ip_en_ind = IndicProcessor(inference=True)

In [None]:
# import the test_sentences_eng.txt file data as sents

sents = []
with open('test_sentences_eng.txt', 'r') as f:
    sents = f.readlines()

sents = [sent.strip() for sent in sents]

print(len(sents))

2010


In [10]:
src_lang = "eng_Latn"

for lang in lang_script_list:
    tgt_lang = lang
    print(lang)
    translations = batch_translate(sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip_en_ind)


    # save hindi translations to a file test_translations_hin.txt
    with open('test_translations/indic_trans2/test_transl_it2_'+lang+'.txt', 'w') as f:
        for sent in translations:
            f.write(sent + '\n')

pan_Guru


100%|██████████| 126/126 [01:46<00:00,  1.18it/s]


ben_Beng


100%|██████████| 126/126 [01:18<00:00,  1.61it/s]


mal_Mlym


100%|██████████| 126/126 [01:51<00:00,  1.13it/s]


mar_Deva


100%|██████████| 126/126 [01:22<00:00,  1.52it/s]


tam_Taml


100%|██████████| 126/126 [01:28<00:00,  1.42it/s]


guj_Gujr


100%|██████████| 126/126 [01:22<00:00,  1.53it/s]


tel_Telu


100%|██████████| 126/126 [01:25<00:00,  1.48it/s]


hin_Deva


100%|██████████| 126/126 [01:24<00:00,  1.49it/s]


kan_Knda


100%|██████████| 126/126 [01:39<00:00,  1.27it/s]


ory_Orya


100%|██████████| 126/126 [01:39<00:00,  1.26it/s]


In [None]:
to break run all and save sarvam money

## Sarvam AI

# TODO REMOVE CHILD from sentences before converting to Indic in Sarvam AI

In [12]:
import requests

url_sarvam = "https://api.sarvam.ai/translate"

In [13]:
indic_code_map = {
    'hin_Deva': 'hi-IN',
    'guj_Gujr': 'gu-IN',
    'kan_Knda': 'kn-IN',
    'mal_Mlym': 'ml-IN',
    'mar_Deva': 'mr-IN',
    'tam_Taml': 'ta-IN',
    'tel_Telu': 'te-IN',
    'pan_Guru': 'pa-IN',
    'ben_Beng': 'bn-IN',
    'ory_Orya': 'od-IN',
}

In [23]:
sents = []
with open('test_sentences_eng.txt', 'r') as f:
    sents = f.readlines()

sents = [sent.strip() for sent in sents]
# sents = sents[:1]
len(sents)    

2010

In [22]:
if len(sents) !=0:
    for lang in lang_script_list:
        transls = []
        for row in sents:
            payload = {
                "input": row,
                "source_language_code": "en-IN",
                "target_language_code": indic_code_map[lang],
                # "speaker_gender": "Male",
                # "mode": "formal",
                # "model": "mayura:v1",
                # "enable_preprocessing": True
            }
            headers = {"Content-Type": "application/json", 'api-subscription-key': 'b23a7c3b-71a0-4f19-b410-26f9d0bf5f2e'}

            response = requests.request("POST", url_sarvam, json=payload, headers=headers)
            if response.status_code != 200:
                    print(response.json())
            transls.append(response.json()['translated_text'])
        
        # save translations to a file test_translations
        with open('test_translations/sarvam_ai/test_transl_sarvam_'+lang+'.txt', 'w') as f:
            for sent in transls:
                f.write(sent + '\n')
            print(lang)

# print(len(transls))

            

pan_Guru
ben_Beng
mal_Mlym
mar_Deva
tam_Taml
guj_Gujr
tel_Telu
hin_Deva
kan_Knda
ory_Orya


## Microsoft Phi

In [24]:
%pip install requests uuid



In [25]:
import requests, uuid, json

# Add your key and endpoint
key = "f1d5da8e40cc4634b631f7ec807676e1"
endpoint = "https://api.cognitive.microsofttranslator.com"

# location, also known as region.
# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
location = "centralindia"

path = '/translate'
constructed_url = endpoint + path

In [26]:


headers = {
    'Ocp-Apim-Subscription-Key': key,
    # location required if you're using a multi-service or regional (not global) resource.
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

In [30]:
sents = []
with open('test_sentences_eng.txt', 'r') as f:
    sents = f.readlines()

sents = [sent.strip() for sent in sents]
# sents = sents[:1]
# sents 
len(sents)

2010

In [32]:
  
for lang in lang_script_list:
    # transl=[]
    print(lang)

    # make batches of 100 size of sents
    for i in range(0, len(sents), 100):
        body=[]
        for text in sents[i:i+100]:
            text_object = {'text': text}
            body.append(text_object)

        params = {
            'api-version': '3.0',
            'from': 'en',
            'to': lang_code_map[lang]
        }
        request = requests.post(constructed_url, params=params, headers=headers, json=body)
        response = request.json()
        if request.status_code != 200:
            print(response)
        # save the response[i]['translations'][0]['text'] to file test_transl_ms_lang.txt
        with open('test_translations/ms_phi/test_transl_ms_'+lang+'.txt', 'w') as f:
            for i in range(len(response)):
                f.write(response[i]['translations'][0]['text'] + '\n')
    

           



pan_Guru
{'error': {'code': 429001, 'message': 'The server rejected the request because the client has exceeded request limits.'}}


KeyError: 0

In [None]:
if len(sents) !=0:
    for lang in lang_script_list:
        transls = []
        for row in sents:
            payload = {
                "input": row,
                "source_language_code": "en-IN",
                "target_language_code": indic_code_map[lang],
                # "speaker_gender": "Male",
                # "mode": "formal",
                # "model": "mayura:v1",
                # "enable_preprocessing": True
            }
            headers = {"Content-Type": "application/json", 'api-subscription-key': 'b23a7c3b-71a0-4f19-b410-26f9d0bf5f2e'}

            response = requests.request("POST", url_sarvam, json=payload, headers=headers)
            if response.status_code != 200:
                    print(response.json())
            transls.append(response.json()['translated_text'])
        
        # save translations to a file test_translations
        with open('test_translations/sarvam_ai/test_transl_sarvam_'+lang+'.txt', 'w') as f:
            for sent in transls:
                f.write(sent + '\n')
            print(lang)

# print(len(transls))

            

## Google Translate API


In [33]:
# pip install googletrans
%pip install googletrans




In [34]:
from googletrans import Translator


translator = Translator(service_urls=['translate.googleapis.com'])
translator

<googletrans.client.Translator at 0x7f39aca9ed90>

In [35]:
translator.translate('hello', src='en', dest='hi').text

'नमस्ते'

In [None]:
for lang in lang_script_list:
    print(lang)
#    sents.apply(lambda x: translator.translate(x, src='en', dest=lang_code_map[lang]).text) do the same lambda for sets array
    transls=  [translator.translate(sent, src='en', dest=lang_code_map[lang]).text for sent in sents]
    
    # save translations to a file test_translations
    with open('test_translations/google_translate/test_transl_googlet_'+lang+'.txt', 'w') as f:
        for sent in transls:
            f.write(sent + '\n')

            



pan_Guru
ben_Beng
mal_Mlym
mar_Deva
tam_Taml
guj_Gujr
tel_Telu
hin_Deva
kan_Knda
ory_Orya


# Conclusion

The translated sentences are kept model wise into the respective files.
Next steps will be to compare the translations and evaluate the models' bias based on the count of the relations in translations.
1. Combine the outputs of all models into a single Dataframe as below:
        df = { word: {lang: { m1:{transl_word : count}}}}
2. to calculate the count, instead of searching the whole file for occurrences, we can use an intermediate dictionary to store the count of each word in the translation  like below structure:



# Segregating the Translations

In [5]:
import pandas as pd

In [6]:
import importlib
import possible_indic_relations as poss_indic_rel
# Reload the module to reflect changes
importlib.reload(poss_indic_rel)

pir= poss_indic_rel.possible_relations
pir

ambiguos_words = list(pir.keys())

In [25]:
def segregate_files(eng_sents, model, model_path, resultant, others_res):
    # create a df with key as language and value as the list of sentences from respective languag file in same order
    df = pd.DataFrame({lang:[] for lang in lang_script_list})

    for lang in lang_script_list:
        with open(f'{model_path}/test_transl_{model}_{lang}.txt', 'r') as f:
            df[lang] = f.readlines()
    

    for i in range(len(eng_sents)):
        for lang in lang_script_list:
                    
            # try below below if any error or exception then print it in exception block
            try:
                    # check for the ith sentence in english sentences and the common ambiguous_word. the root_word will be the ambiguous_word present in the ith sentece
                    root_word = [word for word in ambiguos_words if word in eng_sents[i]][0]

                    # check which indic_word from pir[ambiguous_word] is present in the ith sentence of the language file. it will be transl_word
                    transl_word = [word for word in pir[root_word][lang].keys() if word in df[lang][i]][0]

                    # print(root_word, transl_word, eng_sents[i], df[lang][i])

            # if any exception occurs then print the exception
            except Exception as e:
                # print("got exp:", e)
                root_word=transl_word='others'
                continue
            # if not new add one, else initialise with 1
            if root_word not in resultant:
                resultant[root_word]={}
            if lang not in resultant[root_word]:
                resultant[root_word][lang]={}
            if model not in resultant[root_word][lang]:
                resultant[root_word][lang][model]={}
            if transl_word not in resultant[root_word][lang][model]:
                resultant[root_word][lang][model][transl_word]=0
            resultant[root_word][lang][model][transl_word]+=1

            # if transl_word is others then add the sentence to others_res
            if transl_word=='others':
                others_res[root_word][lang][model][transl_word].append([eng_sents[i], df[lang][i]])

            




In [26]:
eng_sents=[]
with open('test_sentences_eng.txt', 'r') as f:
        eng_sents = f.readlines()

others_res={}
resultant={}

segregate_files(eng_sents, 'it2', 'test_translations/indic_trans2', resultant, others_res)
segregate_files(eng_sents, 'googlet', 'test_translations/google_translate', resultant, others_res)
print(resultant)

got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: list index out of range
got exp: l

In [27]:
resultant

{'grandmother': {'pan_Guru': {'it2': {'ਦਾਦੀ': 201}, 'googlet': {'ਦਾਦੀ': 201}},
  'mar_Deva': {'it2': {'आजी': 200}, 'googlet': {'आजी': 201}},
  'tam_Taml': {'it2': {'பாட்டி': 201}, 'googlet': {'பாட்டி': 201}},
  'guj_Gujr': {'it2': {'દાદી': 201}, 'googlet': {'દાદી': 201}},
  'tel_Telu': {'it2': {'అమ్మమ': 201}, 'googlet': {'అమ్మమ': 201}},
  'hin_Deva': {'it2': {'दादी': 201}, 'googlet': {'दादी': 198, 'नानी': 3}},
  'kan_Knda': {'it2': {'ಅಜ್ಜಿ': 201}, 'googlet': {'ಅಜ್ಜಿ': 201}},
  'ory_Orya': {'it2': {'ଜେଜେମା': 201}, 'googlet': {'ଜେଜେମା': 181}},
  'mal_Mlym': {'googlet': {'അമ്മൂമ്മ': 18}},
  'ben_Beng': {'googlet': {'ঠাকুরমা': 3}}},
 'grandfather': {'pan_Guru': {'it2': {'ਦਾਦਾ': 201}, 'googlet': {'ਦਾਦਾ': 201}},
  'ben_Beng': {'it2': {'দাদু': 122}},
  'mar_Deva': {'it2': {'आजोबा': 201}, 'googlet': {'आजोबा': 201}},
  'tam_Taml': {'it2': {'தாத்தா': 201}, 'googlet': {'தாத்தா': 201}},
  'guj_Gujr': {'it2': {'દાદા': 201}, 'googlet': {'દાદા': 201}},
  'hin_Deva': {'it2': {'दादा': 201}, 'googlet': 

In [28]:
# save resultant to a file
import json

with open('resultant.json', 'w') as f:
    json.dump(resultant, f)

with open('others_res.json', 'w') as f:
    json.dump(others_res, f)
        