# Ilokano to Tagalog Basic Translator

## Ilokano Rule-Based Modeling

### Initialization of Data Sets

#### Opening and processing the Source Document

In [None]:
import pandas as pd

# Read the Ilokano source text and the Tagalog reference text.
# Context managers close each file handle as soon as its content is loaded
# (the bare open(...).read() form left both handles open).
with open("../../src/text data/testing data/Ilokano/il_test_data_bible.txt", encoding='utf-8') as il_file:
    test_doc = il_file.read()
with open("../../src/text data/testing data/Tagalog/tl_test_data_bible.txt", encoding='utf-8') as tl_file:
    target_op = tl_file.read()

# Split the raw source text into sentences, one per line of the file.
parsed_test_doc = test_doc.split("\n")

##### Cleaning the Source Document

In [None]:
import string

def remove_punct(pText):
    """Return ``pText`` with every character of ``string.punctuation`` removed.

    All other characters are kept in their original order.
    """
    kept_chars = []
    for ch in pText:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from every sentence of the source document.
cleaned_test_doc = list(map(remove_punct, parsed_test_doc))

In [None]:
import re

def tokenize(text):
    """Lower-case ``text`` and split it into word tokens.

    The text is split on runs of non-word characters and empty strings are
    discarded, so empty or whitespace-only input yields an empty list.

    Parameters:
        text: the sentence to tokenize.

    Returns:
        A list of lower-cased tokens in their original order.
    """
    # Filter into a new list instead of removing from the list being
    # iterated: removing during iteration skips elements, which left a stray
    # '' behind for input such as "   ". The raw string avoids the invalid
    # escape sequence '\W' in a normal string literal.
    return [token for token in re.split(r'\W+', text.lower()) if token]


# Tokenize every cleaned sentence; the result is one token list per sentence.
tokenized_test_doc = list(map(tokenize, cleaned_test_doc))

In [None]:
# One row per sentence; the 'Tokenized' column holds that sentence's token list.
dict_test_doc = pd.DataFrame(data={'Tokenized': tokenized_test_doc})
dict_test_doc.head()

#### Setting up the important lists and variables

In [None]:
"""
Determiner Lists
"""
# Words that, when they precede or follow a token, are used by the checker
# functions as a hint that a noun is nearby.
noun_dtmn_list = ["dagiti", "ti", "cadagiti", "kadagiti", "ni", "ken", "ni", "coma", "koma", "a", "iti"] # Noun determiners ('ket' was removed from this list)

# Determiners that introduce an adverb.
adv_dtmn_list = ["idi", "iti"]

prepo_dtmn_list = ["ti", "addaak", "iti"] # Preposition determiners ('adda' removed for now)
# added "iti" to the list (eg. nagtignay iti = ay sumasa)

# Adverbs of time.
# NOTE(review): entries containing spaces (e.g. 'inton bigat') are compared
# against single tokens by the checkers, so they presumably never match -- confirm.
adv_time_list = ['madamdama', 'ita', 'kalman', 'inton bigat', 'ditoy', 'idiay', 'ita a rabii', 'iti kaaldawantayo', 'idi rabii', 'sumaruno a lawas', 'nga', 'nabiit pay', 'nasapa', 'dagus', 'pay laeng', 'pay', 'napalabas']

# Adverbs of place.
adv_place_list = ['ditoy', 'sadiay', 'iti labesna', 'iti sadinoman a lugar', 'sadinoman', 'balay', 'pabuya']

# Adverbs of manner.
adv_manner_list = ['naan-anay', 'medio', 'napartak', 'narigat', 'napartak', 'sibabannayat', 'kasla saan', 'dandani amin', 'dandani', 'awan pagpambarna', 'sangsangkamaysa', 'agmaymaysa']

# Adverbs of frequency.
adv_freq_list = ['masansan', 'kadawyan', 'maminsan', 'sagpaminsan', 'manmanon']

In [None]:
"""
Affixes
"""
# Verb prefixes tested with str.startswith() by check_verb_affixes().
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously fused 'ma' and 'para' into the
# single entry 'mapara'.
PREFIX_SET = [
    'na', 'ag', 'ka', 'ca', 'nag', 'im', 'maipa',
    'maki', 'panna', 'maka', 'naki', 'naka', 'nang', 'makapag',
    'mang', 'agan', 'agay', 'pananga', 'agam', 'nagpa', 'magpa',
    'ipa', 'pag', 'pam', 'taga', 'i', 'napa', 'in', 'manang',
    'ma',  # a translation for 'ma'
    'para', 'pang', 'panag', 'nai', 'manag', 'man', 'kina',
    'nai', 'nai', 'nagpa', 'mapag',  # nangi
]

# Adjective prefixes.
Adj_Prefix =[
'ka', 
'na' # a translation for 'ma'
]

# Verb infixes tested with a substring check by check_verb_affixes().
INFIX_SET = ['in'] # eg. 'in' in 'kinunana' (sinabi)
# English gloss of the note below: an infix in Tagalog is a prefix in
# Ilokano, e.g. sumigaw = inpukaw.
"""
infix sa tagalog ay prefix sa ilokano
sumigaw = inpukaw
"""

# Verb suffixes tested with str.endswith() by check_verb_affixes().
# The commas after 'ak' and 'en' were missing, so Python concatenated
# 'ak' 'en' 'na' into the single entry 'akenna'.
SUFFIX_SET = [
    'to', 'nto', 'ak', 'en',
    'na', 'an', 'm', 'nyo',
    'cayo', 'tayo',
    'anda',
]

# Adjective suffixes.
Adj_Suffix = [
'an'
]

# Words treated as prepositions by the checker functions.
# Missing commas previously fused 'sanguanan' + 'likud' and 'umuna' + 'ngudo'
# into single entries, so none of those four words was recognised on its own.
PREPO_SET = [
    'tengnga',
    'rabaw', 'rabao', 'baba', 'babaen',
    'ngatuen', 'ngato', 'sirok', 'sidong',
    'sango', 'sarang', 'saklang', 'sanguanan', 'likud',
    'ruar', 'uneg',
    'baet', 'sango', 'umuna',
    'ngudo', 'ungto', 'abay', 'igid',
]

# Words treated as conjunctions by isConj() and the other checkers.
# Order and repeated entries are kept exactly as they were originally listed.
CONJ_SET = [
    'ken', 'ket', 'gapu', 'ta', 'agsipud', 'laeng', 'ngem',
    'nupay kasta',
    'bayat', 'uray', 'intono', 'no', 'ta', 'ngamin', 'kaso',
    'gapuna', 'ngem', 'idi', 'nga', 'ni', 'wenno', 'para',
    'tapno', 'agraman',
    'numpay kasta',
    'ken', 'ket', 'kabayatanna', 'bayat', 'kada', 'cas',
]

# Personal pronouns; the checkers treat any word in this list as a noun.
# Missing commas previously fused 'kaykayo' + 'dinak' and 'nyo' + 'na' into
# single entries, so 'dinak' and 'na' were never recognised as pronouns.
PER_PRONOUN = [
    'siak', 'sika', 'isu', 'dakami', 'datayo', 'dakayo', 'kayo', 'da', 'caycayo', 'kaykayo',
    'dinak', 'diak', 'kaniak',
    'kadakami', 'kami',
    'kadakayo', 'dakayo', 'kayo',
    'ida', 'da',
    'ko',
    'kukuami', 'kadatayo', 'kukuatayo', 'tayo',
    'kata', 'mo',
    'cadacuada',
    'kenkuana', 'kencuana', 'mi',
    'yo', 'nyo',
    'na',
]

In [None]:
""" 
    Other Lists
"""
# The five vowel letters.
# NOTE(review): not referenced by any code in the visible part of this file.
vowels = ['a', 'e', 'i', 'o', 'u']

### Setting up the checker functions

In [None]:
"""
    Determiner Checker Function
"""
def isDtmn(word):
    """
    Report whether ``word`` is a determiner.

    A word counts as a determiner when it appears in any of the noun,
    adverb or preposition determiner lists, or in the adverb-of-time list.
    Returns True or False.
    """
    determiner_words = noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + adv_time_list
    return word in determiner_words

In [None]:
"""
    Verb Affixer Checker Function
"""
def check_verb_affixes(word, prev_word, isTagged, hasVerbAffixes):
    """
    Report whether ``word`` carries a verb affix.

    The word matches when it starts with an entry of PREFIX_SET, contains an
    entry of INFIX_SET (eg. kinunana = sinabi), or ends with an entry of
    SUFFIX_SET. When ``isTagged`` is already true, or nothing matches, the
    incoming ``hasVerbAffixes`` value is returned unchanged.
    ``prev_word`` is accepted but not used.
    """
    if isTagged:
        return hasVerbAffixes

    matches_prefix = any(word.startswith(prefix) for prefix in PREFIX_SET)
    matches_infix = any(infix in word for infix in INFIX_SET)
    matches_suffix = any(word.endswith(suffix) for suffix in SUFFIX_SET)

    if matches_prefix or matches_infix or matches_suffix:
        return True
    return hasVerbAffixes
# end of check_verb_affixes()

In [None]:
"""
    Verb Checker Function
"""

def isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as a verb.

    The rules below are evaluated top to bottom. Most are guarded by
    ``isDone`` so that an earlier decision is not overridden, but several
    later rules carry no such guard and can still set the result to True.
    Words listed in PREPO_SET always yield False.

    Parameters:
        word: the token being classified.
        prev_word: the token immediately before ``word``.
        prev2_word: the token two positions before ``word``.
        next_word: the token immediately after ``word``.
        next2_word: the token two positions after ``word`` (not read by any rule here).
        hasVerbAffixes: flag saying whether ``word`` carries a verb affix
            (tag() obtains it from check_verb_affixes()).

    Returns:
        True when a verb rule matched, otherwise False.
    """
    # isDone marks that a decision has been made; the local name isVerb holds
    # the result (it shadows the function name inside this body only).
    isDone = False
    isVerb = False
    
    if word not in PREPO_SET:    
        # --- word-specific exceptions ---
        if word == 'espiritu' and not isDone:
            """
            if the word is 'espiritu' then it is not a verb
            eg. 'Esperitu' = 'spirit' (ti espiritu ti Dios)
            issue: there might be more words that vae a previous word ti and next word ti that is not a verb
            maybe noun database will solve this issue
            """
            isVerb = False
            isDone = True
           
        if word == 'naimbag' and not isDone:
            """
            naimbag = 'maganda', 'na maganda'
            """
            # NOTE(review): the body of this inner `if` is only a string
            # literal, so the next_word test has no effect; the two
            # assignments below run for every 'naimbag'.
            if next_word == 'iti':
                """
                if the next word is 'nga' then it means
                naimbag nga = "magandang", "maganda ang"
                """
            isVerb = False
            isDone = True
           
        if word == 'amin' and not isDone:
            isVerb = False
            isDone = True
    
        
        # Any word containing 'adda' is a verb.
        if (word.find("adda") != -1) and not isDone:
            isVerb = True
            isDone = True
            
        if word == 'nagtignay' and not isDone:
            if next_word == 'iti':
                """
                if the word is 'nagtignay' and next word is 'iti' then it is a verb
                then it is a propositional determiner
                eg. nagtignay iti = sumasa / ay sumasa
                """
                isVerb = False
                isDone = True
    
        if word == 'ninagananna' and not isDone:
            """
            ninagananna = "tinawag", "tinawag + niya"
            "pinangalan", "pinangalan + niya"
            ninaganna = "tinawag", "tinawag + ng"
            "pinangalan", "pinangalan + ng"
            """
            # NOTE(review): can2Viterbi is assigned here but never read in this function.
            can2Viterbi = True
            isVerb = True
            isDone = True
        
        # NOTE(review): this literal contains spaces; tokens produced by
        # tokenize() are split on non-word characters, so this comparison
        # presumably never matches -- confirm.
        if word == 'naaramid a casta' and not isDone:
            """
            naaramid a casta = 'nagkagayon'
            """
            isVerb = True
            isDone = True
    
        # --- context rules for words that are not prepositions, pronouns or conjunctions ---
        if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET) and not isDone:
            if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list): # if the previous word is not a determiner
                if next_word in (noun_dtmn_list): 
                    """
                    if the previous word is not in the noun, adverb, and preposition determiner and 
                    the next word is a noun determiner
                    """
                    if hasVerbAffixes:
                        """
                        if the current word has a verb affix/es, then it is a verb
                        """
                        isVerb = True
                        isDone = True
            
                if next_word in PER_PRONOUN and not isDone:
                    """
                    if the next word is a personal pronoun
                    eg. (insert an example sentence)
                    issue: check if there's an issue
                    """
                    isVerb = True
                    isDone = True                
        
            # The next three rules have no `not isDone` guard; they can only
            # set the result to True.
            if word.startswith('pa') and (word.endswith('en') or word.endswith('in')):
                """
                if word starts with pa and ends with en or in then it is verb
                eg. patuboen
                """
                isVerb = True
                isDone = True                
        
            if word.startswith('ag') and prev_word == 'nga' and next_word in ('nga', 'a'):
                isVerb = True
                isDone = True                
        
            if word.startswith('ag') and (word.endswith('kayo') or word.endswith('cayo')):
                isVerb = True
                isDone = True

            if prev_word == 'ti' and next_word in (noun_dtmn_list) and (not next_word in ('a','iti', 'ken')) and not isDone:
                """
                if the previous word is 'ti' and the next word is a noun determiner
                eg. ti aramid ti dios (Nilalang ng Dios)
                """
                if word == 'aramid' and next_word == 'ti':
                    """
                    aramid which means gawa that is a noun is being used as a verb translation of 'nilalang'
                    """
                    isVerb = True
                    isDone = True

                elif next_word != 'ti':
                    isVerb = True
                    isDone = True
            
            if prev_word in CONJ_SET and hasVerbAffixes and next_word in noun_dtmn_list and not isDone:
                isVerb = True
                isDone = True

            # 'ag' followed by a repeated three-letter syllable.
            if word.startswith("ag") and word[2:5] == word[5:8] and not isDone:
                """
                if the word is an adjective it repeats the next 3 letters after 'ag'
                eg. 'agcarcaryam' = umuusad
                """
                isVerb = True
                isDone = True

            # The next four rules are unguarded as well.
            if prev_word == 'nga' and next_word =='a':
                isVerb = True
                isDone = True

            # 'aguy' followed by 'uyas': both tokens are tagged as verbs
            # (this rule and the next one).
            if word == 'aguy' and next_word == 'uyas':
                isVerb = True
                isDone = True

            if prev_word == 'aguy' and word == 'uyas':
                isVerb = True
                isDone = True

            if prev_word == 'iti' and next_word == 'ken' and word.endswith('da') and hasVerbAffixes:
                isVerb = True
                isDone = True

            if prev2_word == 'ti' and not isDone:
                if next_word in (noun_dtmn_list) and not next_word == 'a':
                    """
                    if the previous of previous word is 'ti' and the next word is a noun determiner
                    eg. ti Dios pinarsuana dagiti (ay nilikha ng Dios)
                    """
                    isVerb = True
                    isDone = True
                
                if hasVerbAffixes and not isDone:
                    """
                    if the current word has a verb affix/es, then it is a verb
                    """
                    isVerb = True
                    isDone = True
        
        # NOTE(review): tag() initialises prev_word to "" rather than None, so
        # when called from tag() this rule presumably never fires -- confirm intent.
        if hasVerbAffixes and prev_word == None and not isDone:
            """
            if the current word has a verb affix/es and the previous word is None
            """
            isVerb = True
            isDone = True
    
    return isVerb

# end of function

In [None]:
"""
    Noun Checker Function
"""
def isNoun(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as a noun.

    Personal pronouns (outside PREPO_SET) are accepted immediately. Otherwise
    a sequence of context rules is evaluated top to bottom; only some of them
    are guarded by ``isDone``, so later unguarded rules can still set the
    result to True.

    Parameters:
        word: the token being classified.
        prev_word: the token immediately before ``word``.
        prev2_word: the token two positions before ``word``.
        next_word: the token immediately after ``word`` (must support ``.find``).
        next2_word: the token two positions after ``word`` (must support ``.startswith``).
        hasVerbAffixes: flag saying whether ``word`` carries a verb affix.

    Returns:
        True when a noun rule matched, otherwise False.
    """
    # isDone marks that a decision has been made; the local name isNoun holds
    # the result (it shadows the function name inside this body only).
    isDone = False
    isNoun = False
    
    if word in PER_PRONOUN and word not in PREPO_SET:
        """
        if the word is a personal pronoun, then it is a noun
        """
        isNoun = True
        isDone = True

    # Everything below runs only for a non-empty word that was not already
    # accepted as a pronoun above.
    if word and not isDone:
        # --- word directly preceded by a noun determiner ---
        if prev_word in (noun_dtmn_list) and word not in (PREPO_SET + CONJ_SET + noun_dtmn_list) and not isDone:
            isNoun = True
            
            # Ordinals starting with 'maica' are not nouns. English gloss of
            # the example below: in "aldaw a maicadua", 'maicadua' would get
            # tagged as a noun without this condition.
            if not word.startswith("maica") and not isDone:
                """
                if previous word is a and the word does not start with maica, then it is a noun
                e.g. aldaw a maicadua -> nattag kasi maicadua pag wala tong condition
                """
                isNoun = True
                isDone = True
                
            elif word.startswith("maica"):
                isNoun = False
                isDone = True

            # English gloss of the example below: 'aldaw' would otherwise not
            # be tagged because no noun determiner precedes it.
            if next2_word.startswith("maica") and next_word == "a" and not isDone:
                """
                if next next word starts with maic prefix and next word is a, then it is a noun
                e.g. aldaw a maicadua -> di nattag aldaw since wala siyang noun_dtmn before aldaw
                """
                isNoun = True
                isDone = True

            # Repeated first two letters (e.g. dadackel, which should be an adjective).
            if word[:2] == word[2:4]:
                if prev_word in (noun_dtmn_list) and next_word not in ("ti", "nga", "a"):
                    """
                    if the first two letters of a word is repeated and next_word is not ti/nga/a, then it is a noun
                    e.g. dadackel -> adjective dapat
                    """
                    isNoun = True
                    isDone = True
                else:
                    isNoun = False
                    isDone = False
            
            # Repeated first three letters (e.g. dacdackel).
            if word[:3] == word[3:6]:
                # untags adjs such as dacdackel
                if prev_word in (noun_dtmn_list) and next_word not in (noun_dtmn_list):
                    isNoun = False

                # NOTE(review): tag() passes "" (not None) for a missing next
                # word, so this branch presumably never fires from tag().
                elif prev_word in (noun_dtmn_list) and next_word == None:
                    isNoun = True

            # The determiner branch always ends decided, whatever the inner
            # rules set above.
            isDone = True

        if prev_word == 'idi' and  not hasVerbAffixes and not isDone:
            isNoun = True
            isDone = True           
        
        # NOTE(review): the string below describes a different rule (pa...en
        # verbs); the code tests for a ka-/ca- prefix combined with a -tayo ending.
        if (word.startswith('ka') or word.startswith('ca')) and word.endswith('tayo'):
            """
            if word starts with pa and ends with en or in then it is verb
            eg. caaspingtayo = sa ating wangis
            """
            isNoun = True
            isDone = True 

        if next_word in CONJ_SET and not hasVerbAffixes and not isDone:
            isNoun = True
            isDone = True
        
        # The rules from here on (except the prev2_word one) are not guarded
        # by isDone and can only set the result to True.
        if prev_word in noun_dtmn_list and (next_word.find("adda") != -1):
            isNoun = True
            isDone = True
        
        if next_word =='a' and prev2_word == 'nga':
            isNoun = True
            isDone = True

        # The previous token equals the first two letters of this word.
        if prev_word == word[:2]:
            """
            eg.  an-animal
            """
            isNoun = True
            isDone = True

        if prev_word == 'nga' and next_word == 'ti':
            isNoun = True
            isDone = True

        if prev2_word in noun_dtmn_list and not isDone:
            isNoun = True
            isDone = True

        if word.endswith('um') or word.endswith('en'):
            isNoun = True
            isDone = True

        # Reached only when the first pronoun check did not fire, i.e. for a
        # pronoun that is also listed in PREPO_SET.
        if word in PER_PRONOUN:
            """
            if the word is a personal pronoun, then it is a noun
            """
            isNoun = True
            isDone = True
    return isNoun
# end of function

In [None]:
"""
    Adjective Checker Function
"""
def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as an adjective.

    Determiners, prepositions, personal pronouns and conjunctions are never
    adjectives. For every other word the rules below are evaluated top to
    bottom; most are guarded by ``isDone``, and the two unguarded ones
    ('amin' after a pronoun, 'maika'/'maica' prefix) can only set True.

    Parameters:
        word: the token being classified.
        prev_word: the token immediately before ``word``.
        prev2_word: the token two positions before ``word``.
        next_word: the token immediately after ``word``.
        hasVerbAffixes: flag saying whether ``word`` carries a verb affix.

    Returns:
        True when an adjective rule matched, otherwise False.
    """
    # isDone marks that a decision has been made; the local name isAdj holds
    # the result (it shadows the function name inside this body only).
    isDone = False
    isAdj = False
        
    if word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + PREPO_SET + PER_PRONOUN + CONJ_SET):
            
        # --- 'na' prefix rules ---
        if word.startswith("na") and (next_word in noun_dtmn_list or next_word == 'a' or prev_word == 'ti') and  not hasVerbAffixes and not isDone:
            """
            if the word is an adjective it has an adjective prefix 'na' and the next word is noun determiner
            eg. napintas ti balay (maganda ang bahay)
            eg. naimbag a bigat (magandang umaga)
            """
            isAdj = True
            isDone = True

        if word.startswith("na") and word[:3] != 'nag' and prev2_word in noun_dtmn_list and (next_word in noun_dtmn_list or next_word == 'ket') and not isDone:
            """
            if the word is sandwiched between two nouns
            eg.
            """
            isAdj = True
            isDone = True

        # 'na' words (not 'nag...') after ti/nga/a, excluding words that
        # contain 'biag' or end in 'sua'.
        if word.startswith("na") and not word.startswith("nag") and (prev_word in ("ti", "nga", "a")) and (word.find("biag") == -1) and not word.endswith('sua') and not isDone:
            """
            if the adjective is at the end
            eg.
            """
            isAdj = True
            isDone = True 

        # --- superlative: ka-...-an ---
        if word.startswith("ka") and word.endswith("an") and not isDone:
            """
            if the word is an adjective it has an adjective prefix 'ka' and adjective suffix 'an' and its a superlative adjective
            eg. kadakkelan (pinakamalaki)
            """
            isAdj = True
            isDone = True 
    
        if (word.find("una") != -1) and (next_word == 'a' or next_word == 'nga') and  not hasVerbAffixes and not isDone:
            """
            if the word is an adjective it has a word 'una' and next word is 'a' or 'nga'
            eg. umuna a bilin (unang bilin)
            eg. immuna nga arida (unang hari)
            """
            isAdj = True
            isDone = True

        # --- word-specific rules ---
        if word == 'awan' and next_word in noun_dtmn_list and not isDone:
            """
            if the word is an adjective it is awan followed by noun
            eg. awan (walang)
            """
            isAdj = True
            isDone = True
        
        if word == 'awan' and prev_word in noun_dtmn_list and not isDone:
            isAdj = True
            isDone = True

        # Unguarded: applies even when an earlier rule already decided.
        if word == 'amin' and prev_word in PER_PRONOUN:
            isAdj = True
            isDone = True
    
        if word == 'maysa' and (next_word == 'a' or next_word == 'nga') and  not hasVerbAffixes and not isDone:
            """
            if the word is an adjective it is maysa followed by nga or a and its an ordinal adjective
            eg. maysa (unang or isang)
            """
            isAdj = True
            isDone = True 

        # Unguarded: ordinal prefix always yields an adjective.
        if word.startswith("maika") or word.startswith("maica"):
            """
            if the word is an adjective it has an adjective prefix 'maika' or 'maica' and its an ordinal adjective
            eg. maicadua (ikalawang)
            """
            isAdj = True
            isDone = True 

        # --- reduplication rules ---
        if word[:3] == word[3:6] and not word.endswith('aw') and (next_word in noun_dtmn_list or prev_word == 'a') and  not hasVerbAffixes and not isDone:
            """
            if the word is an adjective it repeats the first 3 letters to make it comparative
            eg. dakdakkel, basbassit
            """
            isAdj = True
            isDone = True 

        if word[:2] == word[2:4] and (next_word in noun_dtmn_list or prev_word in ('a', 'dagiti')) and not hasVerbAffixes and not isDone:
            if word =='lalaki' or word == 'babai':
                isAdj = False
                # NOTE: isNoun here is only a local name inside isAdj; the
                # assignment is never read and does not affect the result.
                isNoun = True 
                isDone= False
            else:
                """
                if the word is an adjective it repeats the first 2 letters
                eg. dadakkel (malalaking), babassit (maliliit)
                """
                isAdj = True
                isDone = True

        if word.startswith("na") and word[2:5] == word[5:8] and not isDone:
            """
            if the word is an adjective it repeats the next 3 letters after 'na' to make it comparative
            eg. nalaklaka, napinpintas
            """
            isAdj = True
            isDone = True
        
        if word.startswith("na") and word[2:6] == word[6:10] and not isDone:
            """
            if the word is an adjective it repeats the next 4 letters after 'na' to make it comparative
            eg. nasingsingpet
            """
            isAdj = True
            isDone = True

    return isAdj
# end of function

In [None]:
"""
    Adverb Checker Function
"""
def isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as an adverb.

    Personal pronouns and prepositions are never adverbs. Otherwise the first
    matching rule below wins.

    The previous version returned True for every word that was not a pronoun
    or preposition, for two reasons:
      * ``not isVerb`` / ``isNoun`` tested the checker *functions* themselves
        (always truthy) instead of calling them;
      * mixed ``and`` / ``or`` conditions lacked parentheses, so ``or``
        split them in the wrong place.

    Parameters:
        word: the token being classified.
        prev_word: the token immediately before ``word``.
        next_word: the token immediately after ``word``.
        next2_word: the token two positions after ``word``; only forwarded to
            isVerb() / isNoun().
        hasVerbAffixes: flag saying whether ``word`` carries a verb affix;
            only forwarded to isVerb() / isNoun().

    Returns:
        True when an adverb rule matched, otherwise False.
    """
    if word in PER_PRONOUN or word in PREPO_SET:
        return False

    # Words starting with 'idi' or 'di', unless they follow the linker 'nga'.
    if word.startswith(('idi', 'di')) and prev_word != 'nga':
        return True

    # Closed-class lookups: adverbs of time, manner, frequency and place.
    if (word in adv_time_list or word in adv_manner_list
            or word in adv_freq_list or word in adv_place_list):
        return True

    # A word after an adverb determiner is an adverb when neither the verb
    # nor the noun rules claim it. This function does not receive the token
    # two positions back, so None is passed for prev2_word.
    if prev_word in adv_dtmn_list:
        claimed_as_verb = isVerb(word, prev_word, None, next_word, next2_word, hasVerbAffixes)
        claimed_as_noun = isNoun(word, prev_word, None, next_word, next2_word, hasVerbAffixes)
        if not claimed_as_verb and not claimed_as_noun:
            return True

    # 'na' word followed by the linker 'nga' or 'a'
    # eg. napartak nga iyaadu = mabilis na pagdami, Napigsa a tudo = malakas na ulan
    if next_word in ('nga', 'a') and word.startswith('na'):
        return True

    # 'na' word that is not followed by a noun determiner
    # eg. mabilis na naglalaho = napartak a mapukpukaw
    if word.startswith('na') and next_word not in noun_dtmn_list:
        return True

    # 'awan' when the next word is neither a noun determiner nor a personal
    # pronoun (following the original note: "next word is not a noun or pronoun").
    if word == 'awan' and next_word not in noun_dtmn_list and next_word not in PER_PRONOUN:
        return True

    return False
# end of function

In [None]:
"""
    Preposition Checker Function
"""
def isPrepo(word, prev_word):
    """
    Report whether ``word`` is a preposition.

    A word is a preposition when it is listed in PREPO_SET or contains
    'ruar' (eg. makinruar = dakong labas).

    The previous version overwrote ``prev_word`` with "" and set the result
    to True for every word *not* in PREPO_SET, so it returned True for all
    input. The remaining rules all reduce to the membership test below, which
    is why ``prev_word`` is no longer consulted.

    Parameters:
        word: the token being classified.
        prev_word: the token immediately before ``word``; kept for
            call compatibility, not used.

    Returns:
        True or False.
    """
    return word in PREPO_SET or "ruar" in word
# end of function

In [None]:
"""
    Conjunction Checker Function
"""

def isConj(word):
    """
    Report whether ``word`` is a conjunction, i.e. listed in CONJ_SET.
    Returns True or False.
    """
    return word in CONJ_SET
# end of function

### Setting up the models

In [None]:
""""
    Ilokano to tagalog Dictionaries
"""

# Single Words Dictionary
# Ilokano -> Tagalog example-based dictionaries, one JSON file per word class.
# The column names read from each frame further down in this file identify
# the class: sw = single words, vb = verbs, nn = nouns, jj = adjectives,
# rb = adverbs, cc = conjunctions, pr = prepositions, dt = determiners.
dict_sw = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_sw.json')
dict_vb = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_vb.json')
dict_nn = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_nn.json')
dict_jj = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_jj.json')
dict_rb = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_rb.json')
dict_cc = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_cc.json')
dict_pr = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_pr.json')
dict_dt = pd.read_json('../../src/json data/Ilokano to Tagalog/Example-Based/dict_il_dt.json')

In [None]:
"""
    Putting the columns in a list
"""
# Each dictionary frame is unpacked into two lists, one per column:
# *_il_list holds the Ilokano column and *_tl_list the Tagalog column.
# NOTE(review): presumably the two lists of a pair are aligned by position
# (same row -> same index) -- confirm against the translator code.
sw_il_list = dict_sw['Ilokano Single Words'].tolist()
sw_tl_list = dict_sw['Tagalog Single Words'].tolist()
vb_il_list = dict_vb['Ilokano Verb'].tolist()
vb_tl_list = dict_vb['Tagalog Verb'].tolist()
nn_il_list = dict_nn['Ilokano Noun'].tolist()
nn_tl_list = dict_nn['Tagalog Noun'].tolist()
jj_il_list = dict_jj['Ilokano Adjective'].tolist()
jj_tl_list = dict_jj['Tagalog Adjective'].tolist()
rb_il_list = dict_rb['Ilokano Adverb'].tolist()
rb_tl_list = dict_rb['Tagalog Adverb'].tolist()
cc_il_list = dict_cc['Ilokano Conjunction'].tolist()
cc_tl_list = dict_cc['Tagalog Conjunction'].tolist()
pr_il_list = dict_pr['Ilokano Preposition'].tolist()
pr_tl_list = dict_pr['Tagalog Preposition'].tolist()
dt_il_list = dict_dt['Ilokano Determiner'].tolist()
dt_tl_list = dict_dt['Tagalog Determiner'].tolist()



### Tagger

In [None]:
def tag(sentence_list):
    """
    Assign a part-of-speech tag to every token of every sentence.

    For each token the checkers are tried in a fixed order and the first one
    that accepts the token decides its tag:
    'SW' (the sentence has exactly one token), 'DT' determiner,
    'CC' conjunction, 'VB' verb, 'JJ' adjective, 'NN' noun, 'RB' adverb,
    'PR' preposition, otherwise 'UNK'.

    Neighbouring tokens are looked up by position. The previous version used
    ``sentence.index(word)``, which returns the *first* occurrence of a word,
    so any repeated word (e.g. 'ti') was given the neighbours of its first
    occurrence; its ``index - 1`` lookup also wrapped around to the last
    token of the sentence for the first word.

    Parameters:
        sentence_list: iterable of sentences, each an indexable sequence of
            token strings.

    Side effects:
        Stores the list of per-sentence tag lists in ``dict_test_doc['POS']``.
        Returns None.
    """
    pos_sen_list = []

    for sentence in sentence_list:
        pos_list = []
        sen_len = len(sentence)

        for idx, word in enumerate(sentence):
            # Missing neighbours are represented by "" so the checkers can
            # call string methods on them.
            prev_word = sentence[idx - 1] if idx >= 1 else ""
            prev2_word = sentence[idx - 2] if idx >= 2 else ""
            next_word = sentence[idx + 1] if idx + 1 < sen_len else ""
            next2_word = sentence[idx + 2] if idx + 2 < sen_len else ""

            # The token is untagged at this point, so isTagged/hasVerbAffixes
            # both start out False.
            hasVerbAffixes = check_verb_affixes(word, prev_word, False, False)

            if sen_len == 1:
                # a one-word sentence is tagged as a single word
                pos = 'SW'
            elif isDtmn(word):
                pos = 'DT'
            elif isConj(word):
                pos = 'CC'
            elif isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
                pos = 'VB'
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
                pos = 'JJ'
            elif isNoun(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
                pos = 'NN'
            elif isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes):
                pos = 'RB'
            elif isPrepo(word, prev_word):
                pos = 'PR'
            else:
                # no checker accepted the word
                pos = 'UNK'

            pos_list.append(pos)

        pos_sen_list.append(pos_list)

    dict_test_doc['POS'] = pos_sen_list

tag(dict_test_doc['Tokenized'])

In [None]:
# Preview the first rows, now including the 'POS' column written by tag().
dict_test_doc.head()

#### Token Combiner

In [None]:
def combine_tokens(sen_translation_list):
    """
    Join each sentence's translated tokens into one space-separated string.

    The previous version located the last token with ``list.index()``, which
    returns the first occurrence; when the final token also appeared earlier
    in the sentence a trailing space was appended. ``str.join`` has no such
    problem.

    Parameters:
        sen_translation_list: iterable of sentences, each an iterable of
            token strings.

    Returns:
        A list with one string per sentence; an empty sentence yields ''.
    """
    return [' '.join(sen_translation) for sen_translation in sen_translation_list]

In [None]:
# Fixed phrases: each CSV row pairs an Ilokano phrase with a Tagalog phrase.
# Both columns are stripped of punctuation and tokenized, giving one token
# list per phrase.
f_phrases = pd.read_csv('../../src/csv data/f_phrases.csv')

il_phrases = [tokenize(remove_punct(phrase)) for phrase in f_phrases['Ilokano'].to_list()]

tl_phrases = [tokenize(remove_punct(phrase)) for phrase in f_phrases['Tagalog'].to_list()]

In [None]:
def inFPhrases(word, word2, word3, word4, word5, word6, word7, il_phrases):
    """Find the first phrase that matches the tokens starting at ``word``.

    Args:
        word: the current token.
        word2 .. word7: the next six tokens, or None where the sentence has
            already ended.
        il_phrases: list of phrases, each a sequence of tokens.

    Returns:
        A tuple ``(found, il_phrase, w_used)``:
        found     -- True if some phrase matched.
        il_phrase -- the matching phrase exactly as stored in ``il_phrases``,
                     or ``[]`` when nothing matched.
        w_used    -- number of tokens the phrase covers (0 when no match).

    Phrases are tried in list order and the first match wins. Empty phrases
    and phrases longer than seven tokens are never matched.
    """
    window = (word, word2, word3, word4, word5, word6, word7)

    for phrase in il_phrases:
        length = len(phrase)
        if not 1 <= length <= len(window):
            continue
        # zip stops at the end of the phrase, so only the first ``length``
        # window slots are compared. A None slot never equals a real token,
        # which keeps phrases from matching past the end of the sentence.
        if all(token == expected for token, expected in zip(window, phrase)):
            # Every length must report the phrase through this same slot:
            # callers treat an empty il_phrase as "no match".
            return True, phrase, length

    return False, [], 0
# end of function
# end of function

### Translator

In [None]:
def _lookup_translation(word, il_list, tl_list):
    """Return the first Tagalog candidate for ``word``, else ``word`` itself.

    ``il_list`` holds Ilokano words and ``tl_list`` is index-aligned with it;
    only element ``[0]`` of the matching ``tl_list`` entry is used. The word
    is returned unchanged when it is not in ``il_list`` or when that first
    candidate is the string ``'None'``.
    """
    if word not in il_list:
        return word
    candidate = tl_list[il_list.index(word)][0]
    return word if candidate == 'None' else candidate


def translate(sen_poss_list):
    """Translate every tagged sentence token by token, preferring phrases.

    Args:
        sen_poss_list: one POS-tag list per sentence,
            e.g. ['VB', 'DT', 'NN', 'DT', 'NN']. Entry ``i`` must describe
            ``dict_test_doc['Tokenized'][i]``.

    Returns:
        A list with one list of output tokens per sentence.

    At each position the phrase table is consulted first (via ``inFPhrases``
    with the current token and up to six following tokens). On a phrase hit
    the aligned ``tl_phrases`` entry is emitted and the covered tokens are
    skipped. Otherwise the token is looked up in the word list that belongs
    to its POS tag (SW, VB, NN, JJ, RB, CC, PR, DT); tokens with any other
    tag, or without a dictionary entry, are copied through unchanged.

    Reads the module-level names ``dict_test_doc``, ``il_phrases``,
    ``tl_phrases`` and the ``*_il_list`` / ``*_tl_list`` pairs.
    """
    sen_translation_list = []

    for sp_index, sen_poss in enumerate(sen_poss_list):
        sen_translation = []
        next_index = 0  # position of the next token not yet consumed

        for wp_index, word_pos in enumerate(sen_poss):
            if wp_index != next_index:
                # This token was already emitted as part of a phrase.
                continue

            tokens = dict_test_doc['Tokenized'][sp_index]
            word = tokens[wp_index]

            # Up to six look-ahead tokens, padded with None past the end of
            # the sentence. Slicing never raises, so no exception handling
            # is needed here.
            following = list(tokens[wp_index + 1:wp_index + 7])
            following += [None] * (6 - len(following))

            inFPDict, il_phrase, w_used = inFPhrases(word, *following, il_phrases)

            if inFPDict and il_phrase != []:
                # Phrase hit: emit the Tagalog phrase stored at the same
                # index and jump past the tokens it covers.
                p_index = il_phrases.index(il_phrase)
                sen_translation.extend(tl_phrases[p_index])
                next_index = wp_index + w_used
                continue

            next_index = wp_index + 1

            # Single-word translation. The chain is kept explicit so each
            # word list is only looked up when its tag actually occurs.
            if word_pos == 'SW':
                translated = _lookup_translation(word, sw_il_list, sw_tl_list)
            elif word_pos == 'VB':
                translated = _lookup_translation(word, vb_il_list, vb_tl_list)
            elif word_pos == 'NN':
                translated = _lookup_translation(word, nn_il_list, nn_tl_list)
            elif word_pos == 'JJ':
                translated = _lookup_translation(word, jj_il_list, jj_tl_list)
            elif word_pos == 'RB':
                translated = _lookup_translation(word, rb_il_list, rb_tl_list)
            elif word_pos == 'CC':
                translated = _lookup_translation(word, cc_il_list, cc_tl_list)
            elif word_pos == 'PR':
                translated = _lookup_translation(word, pr_il_list, pr_tl_list)
            elif word_pos == 'DT':
                translated = _lookup_translation(word, dt_il_list, dt_tl_list)
            else:
                # Any other tag (e.g. 'UNK'): keep the source word.
                translated = word

            sen_translation.append(translated)

        sen_translation_list.append(sen_translation)

    return sen_translation_list

# Translate every tagged sentence, then put the tokens of each translation
# together in one sentence.
sen_translation_list = translate(dict_test_doc['POS'])
temp_sen_list = combine_tokens(sen_translation_list)

# Source sentences and system translations side by side.
dict_op_ex = pd.DataFrame(
    {
        'Source Text': cleaned_test_doc,
        'System Output': temp_sen_list,
    }
)

In [None]:
# Preview the first rows of source text next to the system output.
dict_op_ex.head()

In [None]:
# Run the reference Tagalog text through the same steps as the source text
# (split into lines, strip punctuation, tokenize, re-join) and attach it as a
# new column next to the system output.
# NOTE(review): this rebinds `parsed_test_doc`, which earlier held the Ilokano
# sentences; re-running the earlier cleaning cell after this one would pick up
# the Tagalog lines instead. Confirm nothing depends on the old value.
parsed_test_doc = target_op.split("\n")
cleaned_target_op = list(map(remove_punct, parsed_test_doc))
tokenized_target_op = list(map(tokenize, cleaned_target_op))
combine_tokens_target_op = combine_tokens(tokenized_target_op)

dict_op_ex['Target Output'] = combine_tokens_target_op

In [None]:
# Preview again, now that the 'Target Output' column has been added.
dict_op_ex.head()

In [None]:
import json

# One dict per DataFrame row, so the result serialises as a JSON array.
dict_il_tl_result = dict_op_ex.to_dict('records')

try:
    with open("../../src/json data/Ilokano to Tagalog/Standard Translator/dict_il_tl_test.json", "w") as outfile:
        json.dump(dict_il_tl_result, outfile)
except (OSError, TypeError, ValueError) as err:
    # OSError: the file could not be opened or written.
    # TypeError / ValueError: json could not serialise one of the values.
    # Saving stays best-effort, but the cause is reported instead of hidden.
    print("Error in saving the dict_il_tl_test.json file:", err)
else:
    print("successfully saved the dict_il_tl_test.json file")