<h1> This notebook translates SQuAD 1.1 to Bangla then corrects and aligns answer spans.</h1>

*   In the translation section select train.json or dev.json to translate train or dev split.

*   Select `low` and `high` to translate only a part of the dataset: with low=0 and high=40, the first 40 top-level entries (`data['data'][0:40]`, labelled "Paragraphs" in the progress bar) are translated. The translated dictionary will be saved as "squad1_translated_"+str(low)+"_"+str(high)+"_dict.json"

*   In the merge translation section, merge the translated dataset parts: set the correct filename and run the "merge parts" subsection once for each part generated in the previous section. The full dataset is saved as "squad1_translated_merged_temp.json"

*   Finally run the answer correction and alignment section.




In [None]:
!pip install transformers=="4.25.1" sentencepiece=="0.1.97" fasttext=="0.9.2" utoken=="0.1.8" nltk=="3.8.1" torch=="1.13.1+cu116" numpy=="1.21.6" tqdm=="4.64.1" --quiet

In [None]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import random
import numpy as np
import torch
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
import numpy as np
from tqdm.notebook import tqdm, trange
import json
import nltk

In [None]:
!mkdir squad1_data
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O ./squad1_data/train.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O ./squad1_data/dev.json

In [None]:
# Load the split to translate (point this at dev.json for the dev split).
# The context manager closes the file as soon as the JSON has been parsed.
with open("./squad1_data/train.json", encoding="utf-8") as f:
    data = json.load(f)

# **Translation**

In [None]:
len(data['data'])

In [None]:
data['data'][0]['paragraphs'][0]['context']

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

2022-12-29 22:42:15.672467: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-29 22:42:15.775746: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-29 22:42:15.793412: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-29 22:42:16.153518: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [None]:
# Tokenizer for NLLB-200 (3.3B), configured with English (Latin script) as source
# and Bengali (Bengali script) as target language.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B", src_lang="eng_Latn", tgt_lang="ben_Beng"
)

# NOTE(review): the weights are read from a local "pretrained_models/" directory while the
# tokenizer is loaded by hub id — presumably a pre-downloaded copy of the same checkpoint; confirm.
model = AutoModelForSeq2SeqLM.from_pretrained("pretrained_models/nllb-200-3.3B")
# English -> Bengali translation pipeline on device "cuda:0".
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang="ben_Beng", device="cuda:0")

In [None]:
# nltk.sent_tokenize needs the Punkt model. Download it here so that this function
# works on a fresh kernel: it is first called before the later cell that downloads punkt.
nltk.download('punkt', quiet=True)

def get_c_sent_list(c_src):
  """Split an English context into sentences and translate each one.

  Args:
    c_src: the English context string.

  Returns:
    (c_tran_sent_list, c_src_sent_list): the translated sentences and the
    source sentences, index-aligned.
  """
  c_src_sent_list = nltk.sent_tokenize(c_src)
  c_tran_sent_list = []

  # One translator call per sentence, output capped at max_length=400.
  for sent in c_src_sent_list:
    c_tran_sent = translator(sent, max_length=400)[0]['translation_text']
    c_tran_sent_list.append(c_tran_sent)

  return c_tran_sent_list, c_src_sent_list

In [None]:
# Test Run
output,_ = get_c_sent_list(data['data'][0]['paragraphs'][0]['context'])
print(output)

['সুপার বোল ৫০ ছিল ২০১৫ সালের মৌসুমে ন্যাশনাল ফুটবল লিগের (এনএফএল) চ্যাম্পিয়ন নির্ধারণের জন্য একটি আমেরিকান ফুটবল খেলা।', 'আমেরিকান ফুটবল কনফারেন্স (এএফসি) চ্যাম্পিয়ন ডেনভার ব্রঙ্কোস ন্যাশনাল ফুটবল কনফারেন্স (এনএফসি) চ্যাম্পিয়ন ক্যারোলিনা প্যান্থারসকে ২৪ঃ১০ দিয়ে পরাজিত করে তাদের তৃতীয় সুপার বোল খেতাব অর্জন করে।', 'খেলাটি ২০১৬ সালের ৭ ফেব্রুয়ারি ক্যালিফোর্নিয়ার সান্তা ক্লারাতে সান ফ্রান্সিসকো বে এরিয়ার লেভি স্টেডিয়ামে অনুষ্ঠিত হয়।', 'যেহেতু এটি ছিল ৫০তম সুপার বোল, তাই লিগটি বিভিন্ন স্বর্ণ-থিমযুক্ত উদ্যোগের সাথে "সোনার বার্ষিকী" এর উপর জোর দিয়েছিল, পাশাপাশি প্রতিটি সুপার বোল গেমকে রোমান সংখ্যা দিয়ে নামকরণের ঐতিহ্যকে সাময়িকভাবে স্থগিত করেছিল (যার অধীনে গেমটি "সুপার বোল এল" নামে পরিচিত হত), যাতে লোগোতে আরবি সংখ্যা 50 এর বৈশিষ্ট্যটি সুস্পষ্টভাবে প্রদর্শিত হতে পারে।']


In [None]:
import nltk
nltk.download('punkt')
import re
def word_tokenize(text):
    """Tokenize `text` with NLTK, then split off a trailing danda and any dashes.

    A token that still ends in the Bengali danda ("।") is split into word + danda.
    A token containing a hyphen, en dash or em dash is split around every dash,
    with each dash kept as its own token.

    Returns:
        list of token strings; the dash split never introduces empty strings.
    """
    dashes = ("-", "–", "—")
    new_tokens = []
    for token in nltk.word_tokenize(text):
        if len(token) > 1 and token[-1] == "।":
            new_tokens.append(token[:-1])
            new_tokens.append(token[-1])
        elif len(token) > 1 and any(dash in token for dash in dashes):
            for dash in dashes:
                token = token.replace(dash, " " + dash + " ")
            # split() without an argument drops the empty strings that a leading,
            # trailing or doubled dash would otherwise leave behind.
            new_tokens.extend(token.split())
        else:
            new_tokens.append(token)
    return new_tokens

def detokenize(words):
    """
    Join `words` with single spaces, then pull punctuation back against its
    neighbours: quotes, ellipsis, brackets, dashes, sentence punctuation,
    the Bengali danda and percent signs. This is the approximate inverse of
    tokenization; the result is stripped of surrounding whitespace.
    """
    def swap_all(s, pairs):
        # Apply plain-text replacements strictly in the given order.
        for old, new in pairs:
            s = s.replace(old, new)
        return s

    joined = ' '.join(words)
    # Quotes, ellipsis, parentheses and dashes.
    joined = swap_all(joined, (
        ("`` ", '"'), (" ''", '"'), ('‘ ', '‘'), (' ’', '’'), ('. . .',  '...'),
        ("( ", "("), (" )", ")"), (" - ", "-"), (" – ", "–"), (" — ", "—"),
    ))
    # Punctuation runs followed by a space/quote, then punctuation at the very end.
    joined = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", joined)
    joined = re.sub(r' ([.,:;?!%]+)$', r"\1", joined)
    # Apostrophes, ampersands, curly quotes, danda, percent and remaining brackets.
    joined = swap_all(joined, (
        (" ' ", "'"), (" & ", "&"),
        (" ` ", " '"), ("“ ", "“"), (" ”", "”"),
        (" ।", "।"),
        (" %", "%"), ("[ ", "["), (" ]", "]"), ("{ ", "{"), (" }", "}"),
    ))
    return joined.strip()


[nltk_data] Downloading package punkt to /home/dlpc01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Convert any ASCII digits left in a translation to Bengali digits
digits_en = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
digits_bn = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
def translate_digits(s):
  """Return `s` with each ASCII digit replaced by the Bengali digit of the same value."""
  en_to_bn = dict(zip(digits_en, digits_bn))
  return "".join(en_to_bn.get(ch, ch) for ch in s)

In [None]:
def get_index_c_tran_with_ans(c_src_tokens, c_tran_tokens, a_ch_start_pos_src):
  """Return the index of the source sentence containing character offset `a_ch_start_pos_src`.

  Sentence lengths are accumulated with one extra character per sentence (the
  separating space). The first sentence whose running total exceeds the offset
  is selected; if none does, the last index is returned.
  `c_tran_tokens` is accepted but not used.
  """
  consumed = 0
  for ind, src_sent in enumerate(c_src_tokens):
    consumed += len(src_sent) + 1  # +1 for the space after the sentence
    if consumed > a_ch_start_pos_src:
      break
  return ind

In [None]:
# a_src = data['data'][0]['paragraphs'][3]['qas'][1]['answers'][0]['text']
import string
import re
# ASCII punctuation plus em dash, danda and en dash. Raw string: the backslash is a
# literal character, and a non-raw "\]" is an invalid escape sequence.
punc = r'''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—।–'''
def get_a_tran(c_tran_with_ans, a_src):
  """Produce the answer text to align against the translated sentence.

  - If `a_src` already occurs verbatim in `c_tran_with_ans`, it is returned unchanged.
  - A single-token answer is returned unchanged, except that one consisting only of
    ASCII digits (after removing punctuation) has its digits converted to Bengali.
  - Any other answer is machine-translated after removing "," and "." that sit
    between two digits; leftover ASCII digits in the translation are then converted.

  NOTE(review): the separator removal also deletes decimal points ("3.5" -> "35");
  confirm that this is intended.
  """
  if a_src in c_tran_with_ans:
    return a_src
  if len(word_tokenize(a_src))==1:
    clean_ans_text = a_src.translate(str.maketrans('', '', punc))
    if re.match('^[0-9]*$', clean_ans_text):
      return translate_digits(a_src)
    return a_src
  a_src = re.sub(r'(?<=\d)[,\.](?=\d)','',a_src)
  a_tran = translator(a_src, max_length=400)[0]['translation_text']
  return translate_digits(a_tran)

# print(c_tran_token)
# print(a_tran_token)

In [None]:
low = 0
high = 40
save_filename = "squad1_translated_"+str(low)+"_"+str(high)+"_dict"
save_loc = "./squad1_data"

In [None]:
# Translate entries low..high-1 of data['data'] in place. Each paragraph gains
# 'bangla_context' (list of translated sentences), each question gains 'q_tran',
# and each answer gains 'a_tran' plus 'index_c_tran_with_ans' (index of the
# sentence that holds the answer).
for di in tqdm(range(low,high),desc="Paragraphs"):
  for p in tqdm(data['data'][di]['paragraphs'],desc="Contexts"):
    c_tran_sent_list, c_src_sent_list = get_c_sent_list(p['context'])
    p['bangla_context'] = c_tran_sent_list
    for qas in p['qas']:
      qas['q_tran'] = translator(qas['question'], max_length=512)[0]['translation_text']
      for ans in qas['answers']:
        # Locate the sentence from the English character offset, then
        # derive the answer text relative to that translated sentence.
        index_c_tran_with_ans = get_index_c_tran_with_ans(c_src_sent_list, c_tran_sent_list, ans['answer_start'])
        ans['a_tran'] = get_a_tran(c_tran_sent_list[index_c_tran_with_ans], ans['text'])
        ans['index_c_tran_with_ans'] = index_c_tran_with_ans
 

In [None]:
# Save the whole dataset; only entries low..high-1 are translated, the parts are merged later.
# The context manager closes the file even if json.dump raises.
with open(save_loc+'/'+save_filename+'.json', "w") as out_file:
    json.dump(data, out_file, indent = 4)

In [None]:
data['data'][341]['paragraphs'][0]

# **Merge Translated dataset parts**

## **merge parts**

In [None]:
low = 0
high = 40
# Must match the name the translation section saves parts under
# ("squad1_translated_<low>_<high>_dict"); otherwise the part file is not found.
data_part_filename = "squad1_translated_"+str(low)+"_"+str(high)+"_dict"
save_loc = "./squad1_data"

In [None]:
# Load one translated part and copy its entries low..high-1 into `data`, checking that
# every paragraph, question and answer carries its translation keys.
# The file is closed right after parsing, also when an assertion below fails.
with open(save_loc+'/'+data_part_filename+'.json','r') as part_dict_f:
    part_dict = json.load(part_dict_f)
for di in tqdm(range(low,high),desc="Paragraphs"):
    data['data'][di] = part_dict['data'][di]
    d = data['data'][di]
    for p in tqdm(d['paragraphs'],desc="Contexts"):
        assert 'bangla_context' in p, 'bangla_context key not found'
        for qas in p['qas']:
            assert 'q_tran' in qas, 'q_tran key not found'
            for ans in qas['answers']:
                assert 'a_tran' in ans, 'a_tran key not found'
                assert 'index_c_tran_with_ans' in ans, 'index_c_tran_with_ans key not found'

## **save full dataset**

In [None]:
# Check if all data if there
# Sanity check: every paragraph, question and answer must carry its translation keys.
for d in tqdm(data['data'],desc="Paragraphs"):
    for p in tqdm(d['paragraphs'],desc="Contexts"):
        assert 'bangla_context' in p, 'bangla_context key not found'
        for qas in p['qas']:
            assert 'q_tran' in qas, 'q_tran key not found'
            for ans in qas['answers']:
                for key in ('a_tran', 'index_c_tran_with_ans'):
                    assert key in ans, key + ' key not found'

In [None]:
# Write the merged dataset; the context manager closes the file even if json.dump raises.
with open('./squad1_data/squad1_translated_merged_temp.json', "w") as final_out_file:
    json.dump(data, final_out_file, indent = 4)

# **Answer span correction and alignment**

In [None]:
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz"
!gunzip './cc.bn.300.bin.gz'

In [None]:
import io
# from gensim.models import KeyedVectors
import fasttext
fasttext_model = fasttext.load_model("./cc.bn.300.bin")



In [None]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

2022-12-30 02:36:22.142240: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-30 02:36:22.243125: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-30 02:36:22.260275: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-30 02:36:22.646514: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [None]:
bn_to_en_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
bn_to_en_model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")

In [None]:
bn_to_en_translator = pipeline('translation', model=bn_to_en_model, tokenizer=bn_to_en_tokenizer, device="cuda:0")

In [None]:
import string
# ASCII punctuation plus em dash, danda and en dash. Raw string: the backslash is a
# literal character, and a non-raw "\]" is an invalid escape sequence.
punc = r'''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—।–'''
uw = 0.001
# Each punctuation mark gets its own small random 300-d vector, drawn from U(-uw, uw),
# instead of a fastText lookup.
punc_embed = {}
for p in punc:
  punc_embed[p] = np.random.uniform(-uw,uw,300)

# Shared vectors for dot runs ("..", "...", "....") and for "()".
sp_punc1 = np.random.uniform(-uw,uw,300)
sp_punc2 = np.random.uniform(-uw,uw,300)

def get_embedding(word):
    """Return the vector used to compare `word` during answer alignment.

    Leading/trailing "@" characters are removed first. Dot runs and "()" map to
    their shared random vectors, a single punctuation character maps to its own
    random vector, and anything else is looked up in the fastText model after
    all punctuation characters have been removed from it.
    """
    word = word.strip("@")
    if word in ("....", "...", ".."):
        return sp_punc1
    if word=="()":
        return sp_punc2
    if len(word)==1 and word in punc:
        return punc_embed[word]
    word = word.translate(str.maketrans('', '', punc))
    return fasttext_model.get_word_vector(word)

In [None]:
from utoken import utokenize
from utoken import detokenize
tok = utokenize.Tokenizer(lang_code='ben')
detok = detokenize.Detokenizer(lang_code='ben')

# print(tok.utokenize_string("Dont worry!"))

In [None]:
# re.sub("([১-২][০-৯][০-৯][০-৯])([১-২]?[০-৯]?[০-৯][০-৯])",r"\1-\2", "১৯১৭১৫")

In [None]:
import json

f = open('./squad1_data/squad1_translated_merged_temp.json', 'r')
data = json.load(f)
f.close()

In [None]:
import re
regex_year = re.compile("([০-৯]+)-([০-৯]+)")
digits_dict = {"0":'০', "1": '১', "2": '২', "3": '৩', "4": '৪', "5": '৫', "6": '৬', "7": '৭', "8": '৮', "9": '৯'}
def digit_to_bn(digit):
    """re.sub callback: map the matched ASCII digit to its Bengali digit."""
    return digits_dict[digit.group()]

def _normalize_bn_numerals(text):
    """Apply the four numeral clean-up substitutions, in order, to `text`.

    1. ASCII digits -> Bengali digits.
    2. "<digits>th" -> "<digits>তম".
    3. Insert a space between a digit run and directly following Bengali letters.
    4. Insert "-" between a 4-digit group starting with ১ or ২ and a directly
       following group of 2-4 digits.
    """
    text = re.sub("[0-9]",digit_to_bn, text)
    text = re.sub("([০-৯]+)(th)",r"\1তম", text)
    text = re.sub("([০-৯]+)([অ-য়]+)",r"\1 \2", text)
    return re.sub("([১-২][০-৯][০-৯][০-৯])([১-২]?[০-৯]?[০-৯][০-৯])",r"\1-\2", text)

for d in tqdm(data['data'],desc="Paragraphs"):
    for p in d['paragraphs']:
        p['bangla_context'] = [_normalize_bn_numerals(c_tran) for c_tran in p['bangla_context']]

        # Glue a sentence onto the previous one when the previous one ends in "." or the
        # sentence is shorter than 30 characters, and shift the answers' sentence indices.
        merged = []
        n_merged = 0
        for i, c_tran in enumerate(p['bangla_context']):
            if i == 0 or not (merged[-1][-1]=='.' or len(c_tran)<30):
                merged.append(c_tran)
                continue
            merged[-1] += (" "+c_tran)
            # i - n_merged is this sentence's index in the compacted list before the merge.
            for qas in p['qas']:
                for ans in qas['answers']:
                    if ans['index_c_tran_with_ans'] >= (i-n_merged):
                        ans['index_c_tran_with_ans'] -= 1
            n_merged += 1
        p['bangla_context'] = merged

        for qas in p['qas']:
            qas['q_tran'] = _normalize_bn_numerals(qas['q_tran'])
            for ans in qas['answers']:
                ans['a_tran'] = _normalize_bn_numerals(ans['a_tran'])


Paragraphs:   0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
digits_en = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
digits_bn = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']

def _has_en_digit(text):
    """True when `text` still contains at least one ASCII digit."""
    return any(n in text for n in digits_en)

# Print every context, question and answer that still contains an ASCII digit.
for d in tqdm(data['data'],desc="Paragraphs"):
    for p in d['paragraphs']:
        c_tran_final = "".join(c_tran_sent + " " for c_tran_sent in p['bangla_context'])
        if _has_en_digit(c_tran_final):
            print('c_tran: ', c_tran_final)
        for qas in p['qas']:
            if _has_en_digit(qas['q_tran']):
                print('q_tran: ', qas['q_tran'])
            for ans in qas['answers']:
                if _has_en_digit(ans['a_tran']):
                    print('a_tran: ', ans['a_tran'])

Paragraphs:   0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
def get_answer_start(index_c_tran_with_ans, a_tran, c_tran, debug=False):
  """Align a translated answer to a token span of its context sentence.

  The sentence `c_tran[index_c_tran_with_ans]` and `a_tran` are tokenized with `tok`
  and every token is embedded with `get_embedding`. Windows of len(answer),
  len(answer)+1 and len(answer)+2 consecutive sentence tokens are slid over the
  sentence. A window's score is the mean, over answer tokens, of a greedy one-to-one
  matching on cosine similarity. The shortest window size is kept unless a longer
  one beats it by the margins coded below.

  Args:
    index_c_tran_with_ans: index into `c_tran` of the sentence holding the answer.
    a_tran: translated answer text.
    c_tran: list of translated context sentences.
    debug: print the tokens, the chosen window size and the final span.

  Returns:
    (a_tran_final, ans_start_final, align_score): the detokenized span taken from
    the sentence, its character offset in " ".join(c_tran), and the score of the
    chosen window (-1 when the sentence is too short for the smallest window).
  """
  c_tran_with_ans = c_tran[index_c_tran_with_ans]
  c_tran_with_ans_token = tok.utokenize_string(c_tran_with_ans).split(" ")
  a_tran_token = tok.utokenize_string(a_tran).split(" ")
  if debug:
      print(c_tran_with_ans_token)
      print(a_tran_token)
  if len(a_tran_token)==0:
    return a_tran, 0, 0

  ans_vec = np.array([np.array(get_embedding(a_word)) for a_word in a_tran_token])
  ans_norm = np.apply_along_axis(np.linalg.norm,1,ans_vec)

  def cosine_to_answer(c_word):
    # Cosine similarity of one sentence token against every answer token.
    # A zero-norm embedding would give 0/0 = NaN, and NaN wins every argmax below,
    # so such pairs are scored 0 instead.
    cw_vec = np.array(get_embedding(c_word))
    cw_norm = np.linalg.norm(cw_vec)
    with np.errstate(divide='ignore', invalid='ignore'):
      sims = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
    return np.nan_to_num(sims, nan=0.0, posinf=0.0, neginf=0.0)

  # Window size is len(a_tran_token) + length + 1, i.e. equal, equal+1, equal+2.
  lengths = [-1, 0, 1]
  sim_lists = []
  for length in lengths:
    if len(a_tran_token)+length>=len(c_tran_with_ans_token):
        # Sentence too short for this window size: sentinel score.
        sim_lists.append(np.array([-1]))
        continue
    # Rows of the first window, minus its last row (added in the loop below).
    sim_mat = np.array([cosine_to_answer(c_tran_with_ans_token[i]) for i in range(len(a_tran_token)+length)])
    sim_list = []

    for i in range(len(a_tran_token)+length,len(c_tran_with_ans_token)):
        sim_measures = cosine_to_answer(c_tran_with_ans_token[i])
        if(sim_mat.shape[0]==0):
            sim_mat = np.array([sim_measures])
        else:
            sim_mat = np.append(sim_mat,np.array([sim_measures]),0)
        sim_mat_cal = sim_mat

        # Greedy matching: repeatedly take the best remaining (sentence token, answer token)
        # pair, then remove its row and column.
        similarity = 0
        for j in range(len(a_tran_token)):
            max_index = np.unravel_index(sim_mat_cal.argmax(),sim_mat_cal.shape)
            similarity+= sim_mat_cal[max_index]
            sim_mat_cal = np.delete(sim_mat_cal,max_index[0],0)
            sim_mat_cal = np.delete(sim_mat_cal,max_index[1],1)

        sim_list.append(similarity)
        # Slide the window: drop its oldest row.
        sim_mat = np.delete(sim_mat,0,0)

    sim_list = np.divide(sim_list,len(a_tran_token))
    sim_lists.append(sim_list)

  # sim_lists[k][s] is the score of the window starting at token s with size equal+k.
  if(max(sim_lists[2]) - max(sim_lists[0]) >= 0.08 and max(sim_lists[2]) - max(sim_lists[1]) >= 0.03):
    align_score = max(sim_lists[2])
    best_ans_start = sim_lists[2].argmax()
    best_ans_length = len(a_tran_token)+2

    if debug:
        print('best in equal+2')
  elif(max(sim_lists[1]) - max(sim_lists[0]) >= 0.06):
    align_score = max(sim_lists[1])
    best_ans_start = sim_lists[1].argmax()
    best_ans_length = len(a_tran_token)+1

    if debug:
        print('best in equal+1')
  else:
    align_score = max(sim_lists[0])
    best_ans_start = sim_lists[0].argmax()
    best_ans_length = len(a_tran_token)

    if debug:
        print('best in equal')

  # Clip the span to the end of the sentence.
  if best_ans_start+best_ans_length>len(c_tran_with_ans_token):
      best_ans_length=len(c_tran_with_ans_token)-best_ans_start

  a_tran_final = detok.detokenize_string(" ".join(c_tran_with_ans_token[best_ans_start:best_ans_start+best_ans_length]))
  a_tran_final = re.sub("([০-৯]+)([অ-য়]+)",r"\1 \2", a_tran_final)

  # NOTE(review): find() returns -1 when the detokenized span is not a verbatim substring
  # of the sentence; the offset computed below is then one short of the sentence start.
  ans_start_ref = c_tran_with_ans.find(a_tran_final)
  if debug:
      print(a_tran_final)

  # Offset in the space-joined context: lengths of the preceding sentences (+1 each for
  # the joining space) plus the offset inside the answer sentence.
  ans_start_final = 0
  for i in range(len(c_tran)):
    c_tran_sent = c_tran[i]
    if i == index_c_tran_with_ans:
      ans_start_final+= ans_start_ref
    if i < index_c_tran_with_ans:
      ans_start_final += len(c_tran_sent)+1

  return a_tran_final, ans_start_final,align_score

In [None]:
import re
# Punctuation sets for answer clean-up; `align_punc2` is `align_punc` without '"' and '.'.
# Raw strings keep the backslash literal ("\^" is not a valid escape sequence).
align_punc = r'''!"#&'*,./:;<=>?@\^_`-|~—।–'''
align_punc2 = r'''!#&'*,/:;<=>?@\^_`-|~—।–'''

# Matches any ASCII letter. The range "A-z" would also match the six characters
# between "Z" and "a" in ASCII: [ \ ] ^ _ `
regexp = re.compile(r'[A-Za-z]')

In [None]:
regexp_bn = re.compile(r'[অ-য়]')
def translate_bn_en(text):
    """Translate `text` with `bn_to_en_translator` if it contains a character matched by `regexp_bn`.

    Text without such a character is returned unchanged, without calling the translator.
    """
    if regexp_bn.search(text) is None:
        return text
    return bn_to_en_translator(text)[0]['translation_text']

In [None]:
def get_back_trans_answer_start(c_tran_with_ans_token, a_tran_token, debug=False):
  """Find the window of sentence tokens that best matches the answer tokens.

  Uses the same sliding-window, greedy cosine matching as `get_answer_start`, but on
  token lists passed in directly, and only compares window sizes len(answer) and
  len(answer)+1.

  Args:
    c_tran_with_ans_token: list of sentence tokens.
    a_tran_token: list of answer tokens.
    debug: print which window size was chosen.

  Returns:
    (best_ans_start, align_score): start token index of the chosen window and its
    score. (0, 0) is returned for an empty answer; the score is -1 when the sentence
    is too short for the smallest window.
  """
  if len(a_tran_token)==0:
    # Same arity as the normal return, so callers can always unpack two values.
    return 0, 0

  ans_vec = np.array([np.array(get_embedding(a_word)) for a_word in a_tran_token])
  ans_norm = np.apply_along_axis(np.linalg.norm,1,ans_vec)

  def cosine_to_answer(c_word):
    # Cosine similarity of one sentence token against every answer token.
    # A zero-norm embedding would give 0/0 = NaN, and NaN wins every argmax below,
    # so such pairs are scored 0 instead.
    cw_vec = np.array(get_embedding(c_word))
    cw_norm = np.linalg.norm(cw_vec)
    with np.errstate(divide='ignore', invalid='ignore'):
      sims = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
    return np.nan_to_num(sims, nan=0.0, posinf=0.0, neginf=0.0)

  # Window size is len(a_tran_token) + length + 1, i.e. equal and equal+1.
  # (The equal+2 size is not used by the decision below, so it is not computed.)
  lengths = [-1, 0]
  sim_lists = []
  for length in lengths:
    if len(a_tran_token)+length>=len(c_tran_with_ans_token):
        # Sentence too short for this window size: sentinel score.
        sim_lists.append(np.array([-1]))
        continue
    # Rows of the first window, minus its last row (added in the loop below).
    sim_mat = np.array([cosine_to_answer(c_tran_with_ans_token[i]) for i in range(len(a_tran_token)+length)])
    sim_list = []

    for i in range(len(a_tran_token)+length,len(c_tran_with_ans_token)):
        sim_measures = cosine_to_answer(c_tran_with_ans_token[i])
        if(sim_mat.shape[0]==0):
            sim_mat = np.array([sim_measures])
        else:
            sim_mat = np.append(sim_mat,np.array([sim_measures]),0)
        sim_mat_cal = sim_mat

        # Greedy matching: repeatedly take the best remaining pair, then remove its
        # row and column.
        similarity = 0
        for j in range(len(a_tran_token)):
            max_index = np.unravel_index(sim_mat_cal.argmax(),sim_mat_cal.shape)
            similarity+= sim_mat_cal[max_index]
            sim_mat_cal = np.delete(sim_mat_cal,max_index[0],0)
            sim_mat_cal = np.delete(sim_mat_cal,max_index[1],1)

        sim_list.append(similarity)
        # Slide the window: drop its oldest row.
        sim_mat = np.delete(sim_mat,0,0)

    sim_list = np.divide(sim_list,len(a_tran_token))
    sim_lists.append(sim_list)

  if(max(sim_lists[1]) - max(sim_lists[0]) >= 0.06):
    align_score = max(sim_lists[1])
    best_ans_start = sim_lists[1].argmax()

    if debug:
        print('best in equal+1')
  else:
    align_score = max(sim_lists[0])
    best_ans_start = sim_lists[0].argmax()

    if debug:
        print('best in equal')

  return best_ans_start ,align_score

In [None]:
# First alignment pass: align every answer with get_answer_start and queue the
# low-confidence ones for token-wise back-translation.
import copy
# `tran_data` is the same object as `data` (no copy), so the updates below modify `data` too.
tran_data = data
# Flat token queues for the answers that need back-translation. The *_len lists hold
# cumulative boundaries: entry k-1 .. entry k delimits the tokens of the k-th queued answer.
c_tran_tokens = []
c_tran_tokens_len = [len(c_tran_tokens)]
a_tran_tokens = []
a_tran_tokens_len = [len(a_tran_tokens)]

for di in tqdm(range(len(tran_data['data']))):
  d = tran_data['data'][di]
  for p in d['paragraphs']:
    # Keep the sentence list under 'bangla_context_list' and replace 'bangla_context'
    # by the space-joined string.
    # NOTE(review): after this, 'bangla_context' is a string and 'a_tran' is overwritten
    # below, so this cell is not safe to run twice on the same `data`.
    c_tran_sent_list = copy.deepcopy(p['bangla_context'])
    c_tran_final = " ".join(copy.deepcopy(p['bangla_context']))
    p['bangla_context_list'] = c_tran_sent_list
    p['bangla_context'] = c_tran_final
    for qas in p['qas']:
      for ans in qas['answers']:
        a_tran = ans['a_tran']
        index_c_tran_with_ans = ans['index_c_tran_with_ans']
        a_src = ans['text']
        # print('banga question -> ',qas['q_tran'])
        # print('translated bangla answer -> ', a_tran)
        # print('english answer -> ', a_src)
        # Strip at most one punctuation character from each end of the answer.
        if len(a_tran)>1 and a_tran[0] in align_punc2:
            a_tran = a_tran[1:]
        if len(a_tran)>1 and a_tran[-1] in align_punc2:
            a_tran = a_tran[:-1]
        a_tran_final, ans_start_final, align_score = get_answer_start(index_c_tran_with_ans, a_tran, c_tran_sent_list,debug=False)
        
        
        # Queue the answer when `regexp` matches it or when the alignment score is below 0.6.
        if regexp.search(a_tran) or align_score<0.6: # or  "'" in a_tran_final or (a_tran_final[0] in align_punc or a_tran_final[-1] in align_punc):
            c_tran_with_ans_token = tok.utokenize_string(c_tran_sent_list[index_c_tran_with_ans]).split(" ")
            c_tran_tokens.extend(c_tran_with_ans_token)
            c_tran_tokens_len.extend([len(c_tran_tokens)])

            a_tran_token = tok.utokenize_string(a_tran).split(" ")
            a_tran_tokens.extend(a_tran_token)
            a_tran_tokens_len.extend([len(a_tran_tokens)])

        # 'a_tran_temp' keeps the stripped, pre-alignment answer for the second pass.
        ans['a_tran_temp'] = a_tran
        ans['a_tran'] = a_tran_final
        ans['a_tran_start'] = ans_start_final
        ans['align_score'] = float(align_score)

  0%|          | 0/48 [00:00<?, ?it/s]

  sim_measures = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
  sim_measure = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm


In [None]:
import torch
class TextDataset(torch.utils.data.Dataset):
    """Dataset over a sequence of strings; items are returned with "@" stripped from both ends."""

    def __init__(self, text):
        self.text = text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        item = self.text[idx]
        return item.strip("@")


num_classes = 3

# convert our tokenized data into a torch Dataset
context_dataset = TextDataset(c_tran_tokens)
ans_dataset = TextDataset(a_tran_tokens)

In [None]:
c_back_tran_tokens = []
for out in tqdm(bn_to_en_translator(context_dataset, batch_size=32), total=len(context_dataset)):
    c_back_tran_tokens.append(out)

  0%|          | 0/350484 [00:00<?, ?it/s]

In [None]:
ans_back_tran_tokens = []
for out in tqdm(bn_to_en_translator(ans_dataset, batch_size=32), total=len(ans_dataset)):
    ans_back_tran_tokens.append(out)

  0%|          | 0/21807 [00:00<?, ?it/s]

In [None]:
c_back_tran_tokens = [t[0]['translation_text'] for t in c_back_tran_tokens]

In [None]:
ans_back_tran_tokens = [t[0]['translation_text'] for t in ans_back_tran_tokens]

In [None]:
# Second alignment pass: repeat the embedding alignment and, for the answers queued in the
# first pass, also align on the back-translated tokens; keep the back-translation result
# when its score is more than 0.05 higher.
ij=1  # index of the next queued answer in c_tran_tokens_len / a_tran_tokens_len
for di in tqdm(range(len(tran_data['data']))):
  d = tran_data['data'][di]
  for p in d['paragraphs']:

    c_tran_sent_list = p['bangla_context_list']
    for qas in p['qas']:
      for ans in qas['answers']:
        # 'a_tran_temp' already had its boundary punctuation stripped in the first pass.
        # It must be used unchanged: stripping again could remove a further character,
        # change the queue condition below and shift `ij` for every later answer.
        a_tran = ans['a_tran_temp']
        index_c_tran_with_ans = ans['index_c_tran_with_ans']
        a_tran_final, ans_start_final, align_score = get_answer_start(index_c_tran_with_ans, a_tran, c_tran_sent_list,debug=False)

        # Same condition as in the first pass, so the queue is consumed in the same order.
        if regexp.search(a_tran) or align_score<0.6:
            c_tran_with_ans_token = tok.utokenize_string(c_tran_sent_list[index_c_tran_with_ans]).split(" ")
            a_tran_token = tok.utokenize_string(a_tran).split(" ")

            ans_start_final2 ,align_score2 = get_back_trans_answer_start(c_back_tran_tokens[c_tran_tokens_len[ij-1]:c_tran_tokens_len[ij]], ans_back_tran_tokens[a_tran_tokens_len[ij-1]:a_tran_tokens_len[ij]], debug=False)
            ij+=1
            # The start index found on the back-translated tokens is applied to the original
            # sentence tokens; the span always has len(a_tran_token) tokens.
            a_tran_final2 = detok.detokenize_string(" ".join(c_tran_with_ans_token[ans_start_final2:ans_start_final2+len(a_tran_token)]))
            a_tran_final2 = re.sub("([০-৯]+)([অ-য়]+)",r"\1 \2", a_tran_final2)
            # NOTE(review): find() returns -1 when the span is not a verbatim substring.
            ans_start_ref = c_tran_sent_list[index_c_tran_with_ans].find(a_tran_final2)
            ans_start_final2 = 0
            for i in range(len(c_tran_sent_list)):
                c_tran_sent = c_tran_sent_list[i]
                if i == index_c_tran_with_ans:
                    ans_start_final2+= ans_start_ref
                if i < index_c_tran_with_ans:
                    ans_start_final2 += len(c_tran_sent)+1

            if align_score2>align_score+0.05:
                align_score = align_score2
                ans_start_final = ans_start_final2
                a_tran_final = a_tran_final2

        ans['a_tran_temp'] = a_tran
        ans['a_tran'] = a_tran_final
        ans['a_tran_start'] = ans_start_final
        ans['align_score'] = float(align_score)

# Every queued answer must have been consumed exactly once.
assert ij == len(c_tran_tokens_len), 'back-translation queue out of sync with the first pass'

  0%|          | 0/48 [00:00<?, ?it/s]

  sim_measures = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
  sim_measures = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
  sim_measure = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm
  sim_measure = (np.dot(ans_vec,cw_vec)/ans_norm)/cw_norm


In [None]:
import os

# No visible cell creates the output directory, so create it here.
os.makedirs('./translated_squad', exist_ok=True)
# The context manager closes the file even if json.dump raises.
with open('./translated_squad/squad1_translated_final_aligned.json', "w") as final_out_file:
    json.dump(data, final_out_file, indent = 4)

# **Check files and functions**

In [None]:
#check

# Spot check: print the answers of entries 400-408 whose alignment score is below 0.6.
# The range is clipped to the dataset size, so the cell also runs on a dataset with
# fewer than 409 entries (it then prints nothing).
n_entries = len(tran_data['data'])
for di in range(min(400, n_entries), min(409, n_entries)):
    d = tran_data['data'][di]
    for p in d['paragraphs']:
        for qas in p['qas']:
            for ans in qas['answers']:
                if ans['align_score']<0.6:
                    print('bangla context with ans -> ', p['bangla_context_list'][ans['index_c_tran_with_ans']])
                    print('banga question -> ',qas['q_tran'])
                    print('align_score -> ', ans['align_score'])
                    print('final bangla answer span -> ', ans['a_tran'])
                    print('english answer -> ', ans['text'])

In [None]:
#check

# Print the joined context and its sentence list for the first paragraph only.
# `data_temp` is not defined in any visible cell, so the pairing with it is dropped;
# it was only used to zip the paragraphs and none of its values were read.
for di in tqdm(range(len(tran_data['data']))):
  d = tran_data['data'][di]
  for p1 in d['paragraphs']:
    print(p1['bangla_context'])
    print(p1['bangla_context_list'])
    break
  break

In [None]:
import numpy as np

qw_vec = fasttext_model.get_word_vector("pulse")
aw_vec = fasttext_model.get_word_vector("pulses")
qw_norm = np.linalg.norm(qw_vec)
aw_norm = np.linalg.norm(aw_vec)
print((((np.dot(qw_vec,aw_vec))/(qw_norm*aw_norm)))/1)

In [None]:
# Cosine similarity between two Bengali word vectors.
# `embeddings` is not defined in any visible cell; the loaded fastText model is queried
# instead, as in the English example in the previous cell.
qw_vec = fasttext_model.get_word_vector("পর")
aw_vec = fasttext_model.get_word_vector("টপশপের")
qw_norm = np.linalg.norm(qw_vec)
aw_norm = np.linalg.norm(aw_vec)
print(np.dot(qw_vec,aw_vec)/(qw_norm*aw_norm))