### Bengali sentences > code-mixing > Romanization

#### useful imports

In [0]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd

In [37]:
import sys
!{sys.executable} -m pip install googletrans



In [38]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk import pos_tag
from nltk import word_tokenize
from datetime import datetime
import unicodedata
import random
import time

from googletrans import Translator
translator = Translator()

In [0]:
def strip_accents(text):
    """Return `text` without combining marks.

    The text is first decomposed with Unicode NFKD normalisation, then every
    character whose Unicode category is 'Mn' is dropped and the remaining
    characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

#### data loading

In [0]:
#### loading POS tagged bangla sentences

# Each line of the file is one sentence written as space-separated
# "word|TAG" tokens; `temp` keeps those raw tokens, one list per line.
with open("bengali_pos_tagged_sents.txt", "r", encoding="utf-8") as file:
    temp = [line.replace("\n", "").split(" ") for line in file]

# Split every "word|TAG" token on "|" so each sentence becomes a list of
# mutable [word, TAG] lists.
tagged_tokenize_sentence = [
    [word_pos.split("|") for word_pos in sentence_tokens]
    for sentence_tokens in temp
]

In [42]:
# Peek at the raw "word|TAG" tokens of the first sentence.
temp[0]

['আশা|NN',
 'করি|VF',
 'আপনারা|NN',
 'এখানে|NST',
 'এসে|VNF',
 'খুব|INTF',
 'মজা|NN',
 'করেছেন|VF',
 '।|PUNC']

In [43]:
# Independent working copy of the tagged sentences: every [word, TAG] pair is
# re-created, so edits made to `bn_pos_sents` never reach
# `tagged_tokenize_sentence`.
# A plain nested list replaces np.array(...).copy() for two reasons:
#   * sentences generally differ in length, and NumPy >= 1.24 raises
#     ValueError when asked to build an array from such ragged input;
#   * .copy() on an object array is shallow, so the inner lists would still be
#     shared with the source data.
# Indexing (`bn_pos_sents[i]`) works exactly as before.
bn_pos_sents = [[list(word_pos) for word_pos in sent] for sent in tagged_tokenize_sentence]
bn_pos_sents[0]

[['আশা', 'NN'],
 ['করি', 'VF'],
 ['আপনারা', 'NN'],
 ['এখানে', 'NST'],
 ['এসে', 'VNF'],
 ['খুব', 'INTF'],
 ['মজা', 'NN'],
 ['করেছেন', 'VF'],
 ['।', 'PUNC']]

In [44]:
### loading actual bengali and english sentences

def load_data(file):
    """Read a UTF-8 text file and return one cleaned sentence per line.

    Cleaning removes, in this order, every "-", every "..." and every "..",
    then strips leading/trailing whitespace (including the newline).

    Parameters
    ----------
    file : path of the text file to read.

    Returns
    -------
    list of str, one entry per line of the file (blank lines become "").
    """
    cleaned = []
    with open(file, 'r', encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line
            # Order matters: "..." must be removed before "..".
            for junk in ("-", "...", ".."):
                text = text.replace(junk, "")
            cleaned.append(text.strip())
    return cleaned

# Load the two plain-text corpora with the same cleaning applied to each.
# NOTE(review): the synthesis loop further down pairs en_sents[i] with
# bn_sents[i], so the two files are presumably line-aligned — confirm.
en_sents = load_data("english_sents.txt")
bn_sents = load_data("bengali_sents.txt")

# Show the first sentence of each corpus.
print(bn_sents[0])
print(en_sents[0])

আশা করি আপনারা এখানে এসে খুব মজা করেছেন ।
I hope you have had a lot of fun here.


#### code_mixing function (replaces certain Bangla words with their corresponding English words)

In [0]:
# English (NLTK) tags accepted as a match for each Bengali tag handled below.
_ADJECTIVE_TAGS = ("JJ",)
_NOUN_TAGS = ("NN", "NNP", "NNS", "NNPS")


def _translate_and_tag(bn_text, lowercase=False):
    """Translate `bn_text` (bn -> en) and return NLTK (token, tag) pairs for the result."""
    en_text = translator.translate(bn_text, src="bn", dest="en").text
    if lowercase:
        en_text = en_text.lower()
    return nltk.pos_tag(word_tokenize(en_text))


def _pick_english_word(en_word_pos, accepted_tags, last_if_all_match=False):
    """Return the English token to substitute, or None when no tag matches.

    The first token whose tag is in `accepted_tags` wins. When
    `last_if_all_match` is set and *every* token matches, the last token is
    returned instead. An empty translation yields None.
    """
    matches = [word for word, tag in en_word_pos if tag in accepted_tags]
    if not matches:
        return None
    if last_if_all_match and len(matches) == len(en_word_pos):
        return matches[-1]
    return matches[0]


def _is_literal_token(token):
    """Return True when `token` parses as a Python literal (e.g. a number)."""
    import ast  # function-scope import keeps this cell self-contained

    # SECURITY: this check used to be a bare eval() on corpus text, which would
    # execute arbitrary expressions found in the data and also treated any
    # name that happened to be in scope as "not a word". literal_eval only
    # accepts literals and never executes code.
    try:
        ast.literal_eval(token)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return False
    return True


def _mid_rule_indices(tags, target):
    """Return the mid-sentence positions tagged `target` that should be replaced.

    The last position is excluded, position 1 is dropped when positions 0 and 1
    both carry the tag, and a position is kept only if it lies more than two
    places after the previously kept one (counting from a virtual -2, which
    also means position 0 is never selected here).
    """
    last = len(tags) - 1
    candidates = [i for i, tag in enumerate(tags) if tag == target and i != last]
    if len(candidates) > 1 and candidates[0] == 0 and candidates[1] == 1:
        del candidates[1]

    selected = []
    previous = -2
    for idx in candidates:
        if idx - previous > 2:
            selected.append(idx)
            previous = idx
    return selected


def _apply_edge_rules(exp_sent, target, accepted_tags):
    """Replace a `target`-tagged word at the start and/or end of the sentence."""
    if len(exp_sent) < 2:
        return

    # Start of sentence: translate the first word together with its right neighbour.
    if exp_sent[0][1] == target and exp_sent[1][1] != target:
        pair = exp_sent[0][0] + " " + exp_sent[1][0]
        word = _pick_english_word(_translate_and_tag(pair), accepted_tags)
        if word is not None:
            exp_sent[0][0] = word

    # End of sentence: translate the last word together with its left neighbour.
    # The two words are joined with a space (they used to be glued into a
    # single token, unlike every other rule).
    if exp_sent[-1][1] == target and exp_sent[-2][1] != target:
        pair = exp_sent[-2][0] + " " + exp_sent[-1][0]
        word = _pick_english_word(_translate_and_tag(pair), accepted_tags)
        if word is not None:
            exp_sent[-1][0] = word


def _apply_mid_rule(exp_sent, target, accepted_tags):
    """Replace selected mid-sentence `target`-tagged words with an English word."""
    tags = [w_pos[1] for w_pos in exp_sent]
    for idx in _mid_rule_indices(tags, target):
        if _is_literal_token(exp_sent[idx][0]):
            continue  # numbers and other literals are left untouched
        # idx is always >= 1 here, so idx - 1 is the real left neighbour.
        pair = exp_sent[idx - 1][0] + " " + exp_sent[idx][0]
        word = _pick_english_word(
            _translate_and_tag(pair, lowercase=True),
            accepted_tags,
            last_if_all_match=True,
        )
        if word is not None:
            exp_sent[idx][0] = word


def code_mix(exp_sent):
    """Build a code-mixed sentence by swapping some Bengali words for English.

    Parameters
    ----------
    exp_sent : sequence of [word, TAG] pairs for one POS-tagged sentence.

    Returns
    -------
    str : the words joined by single spaces, with these replacements applied
    in order:
      0. every word tagged "CCS" whose translation is a single token;
      1. words tagged "JJ" (start of sentence, end of sentence, then mid);
      2. words tagged "NN" (start of sentence, end of sentence, then mid).

    Notes
    -----
    * Uses the module-level `translator`, `word_tokenize` and `nltk.pos_tag`;
      translation errors propagate to the caller.
    * The input is no longer modified: all edits happen on a private copy, so
      calling the function twice on the same sentence gives the same result.
    """
    # Private copy so the caller's tagged sentence stays pristine.
    exp_sent = [list(w_pos) for w_pos in exp_sent]

    # 0. Rules for CCS
    for w_pos in exp_sent:
        if w_pos[1] == "CCS":
            en_tokens = word_tokenize(
                translator.translate(w_pos[0], src="bn", dest="en").text
            )
            if len(en_tokens) == 1:
                w_pos[0] = en_tokens[0]

    # 1. Rules for JJ, 2. Rules for NN (same procedure, different tag sets).
    for target, accepted_tags in (("JJ", _ADJECTIVE_TAGS), ("NN", _NOUN_TAGS)):
        _apply_edge_rules(exp_sent, target, accepted_tags)
        _apply_mid_rule(exp_sent, target, accepted_tags)

    return " ".join(w_pos[0] for w_pos in exp_sent).strip()

#### sentence romanization function (romanizes code-mix sentences)

In [0]:
def roman_cm(cm_sentence):
    """Romanize a code-mixed sentence.

    The sentence is sent to the module-level `translator` with destination
    "bn"; the `pronunciation` of the result is passed through `strip_accents`
    and every apostrophe is removed.

    Returns
    -------
    str : the romanized sentence, or the literal "TRANSLATION ERROR" when the
    translation or post-processing fails (including a missing pronunciation).
    """
    try:
        t_raw = translator.translate(cm_sentence, dest="bn").pronunciation
        return strip_accents(t_raw).replace("'", "")
    except Exception:
        # Best-effort by design: one failed sentence must not abort a long run.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so the run can still be stopped by the user.
        return "TRANSLATION ERROR"

#### data synthesis (generating code-mixed romanized sentences)

In [48]:
# Build [Bengali, romanized code-mixed, English] triples, one per sentence in
# bn_sents, printing each triple as soon as it is produced.
bn_ro_en_sents = []

for i, bn_sentence in enumerate(bn_sents):
    romanized = roman_cm(code_mix(bn_pos_sents[i]))
    triple = [bn_sentence, romanized, en_sents[i]]
    bn_ro_en_sents.append(triple)

    print(i)
    print("Bangla: ", triple[0])
    print("Romanized: ", triple[1])
    print("English: ", triple[2], "\n\n")

0
Bangla:  আশা করি আপনারা এখানে এসে খুব মজা করেছেন ।
Romanized:  Hope kari apanara ekhane ese khuba maja karechena.
English:  I hope you have had a lot of fun here. 


1
Bangla:  এর স্ফটিক ;
Romanized:  Era crystal;
English:  Its crystals; 


2
Bangla:  আমি আপনাকে সাহায্য করবো ।
Romanized:  Ami apanake help karabo.
English:  I will help you 


3
Bangla:  গেট খোলো .
Romanized:  Geta kholo.
English:  Open the gate 


4
Bangla:  আমি বিশ্বাস করি এটা আমার জন্য ভালো কিছু এনে দিয়েছে , আর আমি বলবো , ঈশ্বর এর মঙ্গল করুন !
Romanized:  Ami i kari eta amara janya good kichu ene diyeche, ara ami balabo, god era mangala karuna!
English:  I believe it has brought something good to me, and I will say, God bless it! 


5
Bangla:  আমরা দ্বিতীয় ধাপ করিনি !
Romanized:  Amara dbitiya step karini!
English:  We didn't do the second step! 


6
Bangla:  তোমার মুখটা দেখার মত ।
Romanized:  Tomara face dekhara mata.
English:  Your face is like a watch 


7
Bangla:  আমার লোকেরা আমাকে পথ দেখাবে .
Romanized:  Amara 

In [0]:
# with open('result_file_100_sents.txt', 'w',encoding="utf-8") as f:
#     for bn_en_ro in bn_ro_en_sents:
#         f.write(bn_en_ro[0])
#         f.write("\n")
#         f.write(bn_en_ro[1])
#         f.write("\n")
#         f.write(bn_en_ro[2])
        
#         f.write("\n\n\n")

In [0]:
# with open('en.txt', 'w',encoding="utf-8") as f:
#     for en in en_sents[:len(rom_temp)]:
#         f.write(en)
#         f.write("\n")


In [0]:
# with open('bn.txt', 'w',encoding="utf-8") as f:
#     for en in bn_sents[:len(rom_temp)]:
#         f.write(en)
#         f.write("\n")

In [0]:
# with open('ro.txt', 'w',encoding="utf-8") as f:
#     for en in rom_temp[:len(rom_temp)]:
#         f.write(en)
#         f.write("\n")