# Ilokano Part of Speech Tagger

### INITIALIZATION

#### Opening and Processing Data

In [None]:
import pandas as pd

# Read the whole training corpus. The context manager guarantees the file
# handle is closed as soon as the text has been read.
with open("../../src/text data/training/Bible_Ilokano.txt", encoding='utf-8') as corpus_file:
    sample_il_raw = corpus_file.read()

# One entry per line of the corpus file.
parsed_sp_il_raw = sample_il_raw.split("\n")

# Working table: one row per corpus line; later cells add columns to it.
dict_sm_il = pd.DataFrame(parsed_sp_il_raw, columns = ['Sentence'])

In [None]:
import string

def remove_punct(Text):
    """Return ``Text`` with every character in ``string.punctuation`` removed."""
    kept_chars = []
    for symbol in Text:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)

# Punctuation-stripped copy of every raw corpus line.
# NOTE(review): nothing else visible in this notebook reads cleaned_sp_il;
# tokenization further down works on parsed_sp_il_raw instead.
cleaned_sp_il = [remove_punct(word) for word in parsed_sp_il_raw]

#### Lists and Fixes

In [None]:
"""
Determiner Lists
"""
# Words the checker functions treat as noun markers: isDtmn() tags them as
# determiners, and isNoun()/isVerb()/isAdj() look for them next to a token.
# "ni" is listed twice, which is harmless for membership tests.
noun_dtmn_list = ["dagiti", "ti", "cadagiti", "kadagiti", "ni", "ken", "ni", "coma", "koma", "a", "iti"]

# Markers consulted by isDtmn(), isVerb(), isAdj() and isAdv().
adv_dtmn_list = ["idi", "iti"]

# Markers consulted by isDtmn(), isVerb(), isAdj() and isPrepo().
prepo_dtmn_list = ["ti", "addaak", "iti"]

# Adverb word lists used by isAdv(). adv_time_list is also part of the isDtmn()
# lookup, and tag() consults isDtmn() before isAdv(), so its entries are tagged
# as determiners first.
# NOTE(review): multi-word entries (e.g. 'inton bigat') cannot equal a single
# token produced by tokenize(), which splits on non-word characters.
adv_time_list = ['madamdama', 'ita', 'kalman', 'inton bigat', 'ditoy', 'idiay', 'ita a rabii', 'iti kaaldawantayo', 'idi rabii', 'sumaruno a lawas', 'nga', 'nabiit pay', 'nasapa', 'dagus', 'pay laeng', 'pay', 'napalabas']

adv_place_list = ['ditoy', 'sadiay', 'iti labesna', 'iti sadinoman a lugar', 'sadinoman', 'balay', 'pabuya']

adv_manner_list = ['naan-anay', 'medio', 'napartak', 'narigat', 'napartak', 'sibabannayat', 'kasla saan', 'dandani amin', 'dandani', 'awan pagpambarna', 'sangsangkamaysa', 'agmaymaysa']

adv_freq_list = ['masansan', 'kadawyan', 'maminsan', 'sagpaminsan', 'manmanon']

In [None]:
# Prefixes tested with str.startswith() by check_verb_affixes().
# Some entries ('nai', 'nagpa') are repeated; repetition does not change the
# result of the prefix test.
PREFIX_SET = [
'na', 'ag', 'ka', 'ca', 'nag', 'im', 'maipa',
'maki', 'panna', 'maka', 'naki', 'naka', 'nang', 
'makapag','mang', 'agan', 'agay', 'pananga', 'agam', 
'nagpa', 'magpa', 'ipa', 'pag', 'pam', 'taga', 'i', 
'napa', 'in', 'manang','ma', 'para', 'pang', 'panag', 
'nai', 'manag', 'man', 'kina', 'nai', 'nai', 'nagpa', 'mapag'
]

# NOTE(review): ADJ_PREFIX is not referenced by any code visible in this notebook.
ADJ_PREFIX =[
'ka', 'na'
]

# check_verb_affixes() tests these with substring containment, so a match
# anywhere in the word (including its start or end) counts.
INFIX_SET = ['in']

# Suffixes tested with str.endswith() by check_verb_affixes().
# Every entry must be followed by a comma: Python concatenates adjacent string
# literals, and the missing commas previously fused 'ak', 'en' and 'na' into
# the single bogus suffix 'akenna'.
SUFFIX_SET = [
    'to', 'nto', 'ak', 'en',
    'na', 'an', 'm', 'nyo',
    'cayo', 'tayo', 'anda',
]

# NOTE(review): ADJ_SUFFIX is not referenced by any code visible in this notebook.
ADJ_SUFFIX = [
'an'
]

# Words recognised as prepositions by isPrepo(); the verb, noun, adjective and
# adverb checkers also use this list to exclude words.
PREPO_SET = [
    'tengnga', 'rabaw', 'rabao', 'baba', 'babaen', 
    'ngatuen', 'ngato', 'sirok', 'sidong', 'sango', 
    'sarang', 'saklang', 'sanguanan', 'likud', 'ruar', 
    'uneg', 'baet', 'sango', 'umuna', 'ngudo', 'ungto', 
    'abay', 'igid'
]

# Words recognised as conjunctions by isConj(). Several entries are repeated,
# which does not affect membership tests.
# NOTE(review): multi-word entries ('nupay kasta', 'numpay kasta') cannot equal
# a single token produced by tokenize().
CONJ_SET = [
    'ken', 'ket', 'gapu', 'ta', 'agsipud', 'laeng', 
    'ngem', 'nupay kasta', 'bayat', 'uray', 'intono', 
    'no', 'ta', 'ngamin', 'kaso', 'gapuna', 'ngem', 'idi',
    'nga', 'ni',  'wenno', 'para', 'tapno', 'agraman', 'numpay kasta', 
    'ken', 'ket', 'kabayatanna', 'bayat', 'kada', 'cas'
]

# Personal pronouns; isNoun() reports these as nouns, and isVerb(), isAdj()
# and isAdv() exclude them.
PER_PRONOUN = [
    'siak', 'sika', 'isu', 'dakami', 'datayo', 'dakayo', 
    'kayo', 'da', 'caycayo', 'kaykayo', 'dinak', 'diak', 
    'kaniak', 'kadakami', 'kami','kadakayo', 'dakayo', 'kayo',
    'ida', 'da','ko', 'kukuami', 'kadatayo', 'kukuatayo', 'tayo', 
    'kata', 'mo', 'cadacuada', 'kenkuana', 'kencuana','mi','yo', 
    'nyo', 'na'
]

# NOTE(review): VOWELS is not referenced by any code visible in this notebook.
VOWELS = ['a', 'e', 'i', 'o', 'u']

#### Tokenizing

In [None]:
import re

def tokenize(text):
    """
    Split ``text`` into lowercase word tokens.

    The text is lowercased and split on runs of non-word characters; the
    empty strings that leading or trailing separators produce are discarded.
    The three-word sequence "naaramid a casta" is merged into one token,
    which is the form isVerb() matches against.

    Returns a list of token strings; it is empty for blank or
    punctuation-only text.
    """
    # Raw-string pattern; filtering here avoids removing items from a list
    # that is being iterated.
    words = [piece for piece in re.split(r'\W+', text.lower()) if piece]

    tokens = []
    position = 0
    while position < len(words):
        # Compare by position so repeated words (e.g. an earlier "a") cannot
        # be confused with the ones that belong to the phrase.
        if words[position:position + 3] == ['naaramid', 'a', 'casta']:
            tokens.append('naaramid a casta')
            position += 3
        else:
            tokens.append(words[position])
            position += 1
    return tokens



# Tokenize every raw corpus line; the result is one token list per line.
tokenized_sp_tl = [tokenize(word) for word in parsed_sp_il_raw]

# Store the token lists next to the original sentences and preview the table.
dict_sm_il['Tokenized'] = tokenized_sp_tl
dict_sm_il.head()

### VITERBI ALGORITHM

In [None]:
"""
    Determiner Checker Function
"""
def isDtmn(word):
    """
    Report whether ``word`` appears in any of the determiner word lists.

    The lists consulted are noun_dtmn_list, adv_dtmn_list, prepo_dtmn_list
    and adv_time_list. Returns True or False; nothing is modified.
    """
    for candidates in (noun_dtmn_list, adv_dtmn_list, prepo_dtmn_list, adv_time_list):
        if word in candidates:
            return True
    return False

In [None]:
"""
    Verb Affixer Checker Function
"""
def check_verb_affixes(word, prev_word, isTagged, hasVerbAffixes):
    """
    Report whether ``word`` carries one of the known verb affixes.

    word            -- token to inspect.
    prev_word       -- accepted for signature compatibility; not used.
    isTagged        -- when truthy, no affix test is applied.
    hasVerbAffixes  -- value returned unchanged when no affix test succeeds.

    Returns True if the word starts with an entry of PREFIX_SET, contains an
    entry of INFIX_SET, or ends with an entry of SUFFIX_SET; otherwise the
    incoming ``hasVerbAffixes`` value.
    """
    if isTagged:
        return hasVerbAffixes

    starts_with_prefix = word.startswith(tuple(PREFIX_SET))
    contains_infix = any(piece in word for piece in INFIX_SET)
    ends_with_suffix = word.endswith(tuple(SUFFIX_SET))

    if starts_with_prefix or contains_infix or ends_with_suffix:
        return True
    return hasVerbAffixes

In [None]:
"""
    Verb Checker Function
"""
def isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
    """
    Rule-based check for whether ``word`` should be tagged as a verb.

    word            -- token being classified.
    prev_word       -- token immediately before ``word`` (None at sentence start).
    prev2_word      -- token two positions before ``word``.
    next_word       -- token immediately after ``word``.
    next2_word      -- accepted but not used by any rule in this function.
    hasVerbAffixes  -- result of check_verb_affixes() for ``word``.

    Returns True when one of the rules below marks the word as a verb,
    otherwise False. Words in PREPO_SET are never reported as verbs.
    """
    isDone = False
    # Local result flag; inside this function it shadows the function's own name.
    isVerb = False
    
    if word not in PREPO_SET:    
        # Word-specific exceptions: these words are decided up front.
        if word == 'espiritu' and not isDone:
            isVerb = False
            isDone = True
           
        if word == 'naimbag' and not isDone:
            if next_word == 'iti':
                isVerb = False
                isDone = True
            
        if word == 'amin' and not isDone:
            isVerb = False
            isDone = True
        
        # Any word containing "adda" is treated as a verb.
        if (word.find("adda") != -1) and not isDone:
            isVerb = True
            isDone = True
            
        if word == 'nagtignay' and not isDone:
            if next_word == 'iti':
                isVerb = False
                isDone = True
    
        if word == 'ninagananna' and not isDone:
            # NOTE(review): can2Viterbi is assigned here but never read.
            can2Viterbi = True
            isVerb = True
            isDone = True
        
        # Multi-word token produced by tokenize().
        if word == 'naaramid a casta' and not isDone:
            isVerb = True
            isDone = True
    
        # General rules: only reached when no exception above decided the word
        # and the word is not a preposition, pronoun or conjunction. Every
        # rule inside this block can only set the result to True.
        if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET) and not isDone:
            if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list):
                # Affixed word followed by a noun determiner.
                if next_word in (noun_dtmn_list): 
                    if hasVerbAffixes:
                        isVerb = True
                        isDone = True
            
                # Word followed by a personal pronoun.
                if next_word in PER_PRONOUN and not isDone:
                    isVerb = True
                    isDone = True                
        
            # pa-...-en / pa-...-in shaped words.
            if word.startswith('pa') and (word.endswith('en') or word.endswith('in')):
                isVerb = True
                isDone = True                
        
            # ag- word between "nga" and "nga"/"a".
            if word.startswith('ag') and prev_word == 'nga' and next_word in ('nga', 'a'):
                isVerb = True
                isDone = True                
        
            # ag-...-kayo / ag-...-cayo shaped words.
            if word.startswith('ag') and (word.endswith('kayo') or word.endswith('cayo')):
                isVerb = True
                isDone = True

            # Word after "ti" and before a noun determiner other than a/iti/ken.
            if prev_word == 'ti' and next_word in (noun_dtmn_list) and (not next_word in ('a','iti', 'ken')) and not isDone:
                if word == 'aramid' and next_word == 'ti':
                    isVerb = True
                    isDone = True

                elif next_word != 'ti':
                    isVerb = True
                    isDone = True
            
            # Affixed word between a conjunction and a noun determiner.
            if prev_word in CONJ_SET and hasVerbAffixes and next_word in noun_dtmn_list and not isDone:
                isVerb = True
                isDone = True

            # ag- followed by a repeated three-character chunk.
            if word.startswith("ag") and word[2:5] == word[5:8] and not isDone:
                isVerb = True
                isDone = True

            # Any word between "nga" and "a".
            if prev_word == 'nga' and next_word =='a':
                isVerb = True
                isDone = True

            # The token pair "aguy" + "uyas": both halves are marked as verbs.
            if word == 'aguy' and next_word == 'uyas':
                isVerb = True
                isDone = True

            if prev_word == 'aguy' and word == 'uyas':
                isVerb = True
                isDone = True

            # Affixed word ending in "da" between "iti" and "ken".
            if prev_word == 'iti' and next_word == 'ken' and word.endswith('da') and hasVerbAffixes:
                isVerb = True
                isDone = True

            # Fallbacks when the token two positions back is "ti".
            if prev2_word == 'ti' and not isDone:
                if next_word in (noun_dtmn_list) and not next_word == 'a':
                    isVerb = True
                    isDone = True
                
                if hasVerbAffixes and not isDone:
                    isVerb = True
                    isDone = True
        
        # Affixed word with no previous token (prev_word is None).
        if hasVerbAffixes and prev_word == None and not isDone:
            isVerb = True
            isDone = True
    
    return isVerb

In [None]:
"""
    Noun Checker Function
"""
def isNoun(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
    """
    Rule-based check for whether ``word`` should be tagged as a noun.

    word            -- token being classified.
    prev_word       -- token immediately before ``word``.
    prev2_word      -- token two positions before ``word``.
    next_word       -- token immediately after ``word``; must be a string
                       because some rules call string methods on it.
    next2_word      -- token two positions after ``word``; must be a string
                       for the same reason.
    hasVerbAffixes  -- result of check_verb_affixes() for ``word``.

    Returns True when one of the rules below marks the word as a noun,
    otherwise False.
    """
    isDone = False
    # Local result flag; inside this function it shadows the function's own name.
    isNoun = False
    
    # Personal pronouns (that are not also prepositions) count as nouns.
    if word in PER_PRONOUN and word not in PREPO_SET:
        isNoun = True
        isDone = True

    # All remaining rules are skipped for an empty word or an already decided one.
    if word and not isDone:
        # Word directly after a noun determiner, and not itself a
        # preposition, conjunction or noun determiner.
        if prev_word in (noun_dtmn_list) and word not in (PREPO_SET + CONJ_SET + noun_dtmn_list) and not isDone:
            isNoun = True
            
            # Words starting with "maica" are excluded; everything else is a noun.
            if not word.startswith("maica") and not isDone:
                isNoun = True
                isDone = True
                
            elif word.startswith("maica"):
                isNoun = False
                isDone = True

            # NOTE(review): isDone is already True here (one of the two
            # branches above always runs), so this branch cannot fire.
            if next2_word.startswith("maica") and next_word == "a" and not isDone:
                isNoun = True
                isDone = True

            # Word whose first two characters repeat (chars 0-1 == chars 2-3).
            if word[:2] == word[2:4]:
                if prev_word in (noun_dtmn_list) and next_word not in ("ti", "nga", "a"):
                    isNoun = True
                    isDone = True
                else:
                    isNoun = False
                    isDone = False
            
            # Word whose first three characters repeat (chars 0-2 == chars 3-5).
            if word[:3] == word[3:6]:
                if prev_word in (noun_dtmn_list) and next_word not in (noun_dtmn_list):
                    isNoun = False

                elif prev_word in (noun_dtmn_list) and next_word == None:
                    isNoun = True

            # Whatever happened above, this determiner branch is final as far
            # as the isDone-guarded rules below are concerned.
            isDone = True

        # Unaffixed word after "idi".
        if prev_word == 'idi' and  not hasVerbAffixes and not isDone:
            isNoun = True
            isDone = True           
        
        # The rules below without a "not isDone" guard can set the result to
        # True even after an earlier rule decided otherwise.

        # ka-/ca- ... -tayo shaped words.
        if (word.startswith('ka') or word.startswith('ca')) and word.endswith('tayo'):
            isNoun = True
            isDone = True 

        # Unaffixed word followed by a conjunction.
        if next_word in CONJ_SET and not hasVerbAffixes and not isDone:
            isNoun = True
            isDone = True
        
        # Word after a noun determiner and before a word containing "adda".
        if prev_word in noun_dtmn_list and (next_word.find("adda") != -1):
            isNoun = True
            isDone = True
        
        # "nga" two positions back and "a" directly after.
        if next_word =='a' and prev2_word == 'nga':
            isNoun = True
            isDone = True

        # Previous token equals the first two characters of this word.
        if prev_word == word[:2]:
            isNoun = True
            isDone = True

        # Word between "nga" and "ti".
        if prev_word == 'nga' and next_word == 'ti':
            isNoun = True
            isDone = True

        # Noun determiner two positions back.
        if prev2_word in noun_dtmn_list and not isDone:
            isNoun = True
            isDone = True

        # Words ending in "um" or "en".
        if word.endswith('um') or word.endswith('en'):
            isNoun = True
            isDone = True

        # Reached only for pronouns that are also in PREPO_SET, because the
        # first rule of the function already handles the others.
        if word in PER_PRONOUN:
            isNoun = True
            isDone = True
    return isNoun

In [None]:
"""
    Adjective Checker Function
"""
def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    Rule-based check for whether ``word`` should be tagged as an adjective.

    word            -- token being classified.
    prev_word       -- token immediately before ``word``.
    prev2_word      -- token two positions before ``word``.
    next_word       -- token immediately after ``word``.
    hasVerbAffixes  -- result of check_verb_affixes() for ``word``.

    Returns True when one of the rules below marks the word as an adjective,
    otherwise False. Determiners, prepositions, pronouns and conjunctions are
    never reported as adjectives.
    """
    isDone = False
    # Local result flag; inside this function it shadows the function's own name.
    isAdj = False
        
    if word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + PREPO_SET + PER_PRONOUN + CONJ_SET):
            
        # na- word next to a determiner/"a"/"ti", without verb affixes.
        if word.startswith("na") and (next_word in noun_dtmn_list or next_word == 'a' or prev_word == 'ti') and  not hasVerbAffixes and not isDone:
            isAdj = True
            isDone = True

        # na- (but not nag-) word with a determiner two positions back and a
        # determiner or "ket" directly after.
        if word.startswith("na") and word[:3] != 'nag' and prev2_word in noun_dtmn_list and (next_word in noun_dtmn_list or next_word == 'ket') and not isDone:
            isAdj = True
            isDone = True

        # na- (but not nag-) word after "ti"/"nga"/"a", excluding words that
        # contain "biag" or end in "sua".
        if word.startswith("na") and not word.startswith("nag") and (prev_word in ("ti", "nga", "a")) and (word.find("biag") == -1) and not word.endswith('sua') and not isDone:
            isAdj = True
            isDone = True 

        # ka-...-an shaped words.
        if word.startswith("ka") and word.endswith("an") and not isDone:
            isAdj = True
            isDone = True 
    
        # Word containing "una" before "a"/"nga", without verb affixes.
        if (word.find("una") != -1) and (next_word == 'a' or next_word == 'nga') and  not hasVerbAffixes and not isDone:
            isAdj = True
            isDone = True

        # "awan" next to a noun determiner on either side.
        if word == 'awan' and next_word in noun_dtmn_list and not isDone:
            isAdj = True
            isDone = True
        
        if word == 'awan' and prev_word in noun_dtmn_list and not isDone:
            isAdj = True
            isDone = True

        # "amin" directly after a personal pronoun (no isDone guard).
        if word == 'amin' and prev_word in PER_PRONOUN:
            isAdj = True
            isDone = True
    
        # "maysa" before "a"/"nga", without verb affixes.
        if word == 'maysa' and (next_word == 'a' or next_word == 'nga') and  not hasVerbAffixes and not isDone:
            isAdj = True
            isDone = True 

        # maika-/maica- words (no isDone guard).
        if word.startswith("maika") or word.startswith("maica"):
            isAdj = True
            isDone = True 

        # First three characters repeat (chars 0-2 == chars 3-5), word does
        # not end in "aw".
        if word[:3] == word[3:6] and not word.endswith('aw') and (next_word in noun_dtmn_list or prev_word == 'a') and  not hasVerbAffixes and not isDone:
            isAdj = True
            isDone = True 

        # First two characters repeat (chars 0-1 == chars 2-3).
        if word[:2] == word[2:4] and (next_word in noun_dtmn_list or prev_word in ('a', 'dagiti')) and not hasVerbAffixes and not isDone:
            if word =='lalaki' or word == 'babai':
                isAdj = False
                # NOTE(review): this creates a local named isNoun that is
                # never read; it has no effect on the result.
                isNoun = True 
                isDone= False
            else:
                isAdj = True
                isDone = True

        # na- followed by a repeated three-character chunk.
        if word.startswith("na") and word[2:5] == word[5:8] and not isDone:
            isAdj = True
            isDone = True
        
        # na- followed by a repeated four-character chunk.
        if word.startswith("na") and word[2:6] == word[6:10] and not isDone:
            isAdj = True
            isDone = True

    return isAdj

In [None]:
"""
    Adverb Checker Function
"""
def isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes, is_verb=False, is_noun=False):
    """
    Rule-based check for whether ``word`` should be tagged as an adverb.

    word            -- token being classified.
    prev_word       -- token immediately before ``word``.
    next_word       -- token immediately after ``word``.
    next2_word      -- accepted for signature compatibility; not used.
    hasVerbAffixes  -- accepted for signature compatibility; not used.
    is_verb         -- optional: True if the caller already classified the
                       word as a verb. Defaults to False.
    is_noun         -- optional: True if the caller already classified the
                       word as a noun. Defaults to False.

    Returns True when one of the rules below applies, otherwise False.
    Personal pronouns and prepositions are never reported as adverbs.

    The previous implementation tested the *function objects* ``isVerb`` and
    ``isNoun`` as if they were booleans. A function object is always truthy,
    so the determiner rule could never fire and the final rule fired for
    every word, making this checker return True for almost any input. The
    flags are now real, optional booleans.
    """
    if word in PER_PRONOUN or word in PREPO_SET:
        return False

    # "idi..." words always qualify; "di..." words qualify unless they follow
    # "nga". Parenthesised to make the (unchanged) precedence explicit.
    if word.startswith('idi') or (word.startswith('di') and prev_word != 'nga'):
        return True

    # Closed word lists: time, manner, frequency and place adverbs.
    if (word in adv_time_list or word in adv_manner_list
            or word in adv_freq_list or word in adv_place_list):
        return True

    # Word introduced by an adverb determiner, provided it has not been
    # classified as a verb or noun.
    if prev_word in adv_dtmn_list and not is_verb and not is_noun:
        return True

    if word.startswith('na'):
        # na- word linked by "nga"/"a". The original lacked parentheses, so
        # *any* word followed by "nga" matched regardless of its prefix.
        if next_word in ('nga', 'a'):
            return True
        # na- word that is not followed by a noun determiner.
        if next_word not in noun_dtmn_list:
            return True

    # "awan" that is not followed by a noun determiner. The original rule
    # also had an "or isNoun" term that referenced the function object and
    # matched everything; that term is dropped.
    if word == "awan" and next_word not in noun_dtmn_list:
        return True

    return False

In [None]:
"""
    Preposition Checker Function
"""
def isPrepo(word, prev_word):
    """
    Report whether ``word`` should be tagged as a preposition.

    word       -- token being classified.
    prev_word  -- accepted for signature compatibility; not used, because
                  membership in PREPO_SET alone already decides the result.

    Returns True when the word is listed in PREPO_SET or contains "ruar",
    otherwise False.

    The previous implementation set the result to True in its
    ``word not in PREPO_SET`` branch as well, so it returned True for every
    input and made the "unknown" fallback of the tagger unreachable. It also
    overwrote ``prev_word`` with an empty string before using it.
    """
    if word in PREPO_SET:
        return True

    # Any word containing "ruar" is also treated as a preposition.
    return "ruar" in word

In [None]:
"""
    Conjunction Checker Function
"""

def isConj(word):
    """Report whether ``word`` is listed in CONJ_SET."""
    return word in CONJ_SET

### TAGGER

In [None]:
def tag(sentence_list):
    """
    Tag every token of every sentence and store the results in ``dict_sm_il``.

    sentence_list -- iterable of token lists, one list per sentence; it must
                     have one entry per row of ``dict_sm_il``.

    Each token receives exactly one tag, decided in this order: 'SW' (the
    sentence has a single token), 'DT', 'CC', 'VB', 'JJ', 'NN', 'RB', 'PR',
    and finally 'UNK' when no checker accepts it.

    Side effects: adds/overwrites the columns 'Single Word', 'Determiner',
    'Conjunction', 'Verb', 'Noun', 'Adjective', 'Adverb', 'Preposition',
    'Unknown' (tokens grouped by tag, per sentence) and 'POS' (the tag
    sequence, per sentence) on the module-level ``dict_sm_il``. Returns None.

    Neighbouring tokens are taken by position. The previous implementation
    looked them up with ``sentence.index(word)``, which returns the first
    occurrence and therefore produced wrong neighbours for repeated words,
    and it read ``prev2_word`` from index ``-1`` (the last word of the
    sentence) for the second token.
    """
    # (column name, tag) pairs in the order the columns are written.
    categories = [
        ('Single Word', 'SW'),
        ('Determiner', 'DT'),
        ('Conjunction', 'CC'),
        ('Verb', 'VB'),
        ('Noun', 'NN'),
        ('Adjective', 'JJ'),
        ('Adverb', 'RB'),
        ('Preposition', 'PR'),
        ('Unknown', 'UNK'),
    ]
    words_per_sentence = {code: [] for _, code in categories}
    pos_sen_list = []

    for sentence in sentence_list:
        words_by_tag = {code: [] for _, code in categories}
        pos_list = []
        sen_len = len(sentence)

        for position, word in enumerate(sentence):
            # Missing previous tokens are None and missing next tokens are ""
            # (the checkers call string methods on the next tokens).
            prev_word = sentence[position - 1] if position >= 1 else None
            prev2_word = sentence[position - 2] if position >= 2 else None
            next_word = sentence[position + 1] if position + 1 < sen_len else ""
            next2_word = sentence[position + 2] if position + 2 < sen_len else ""

            hasVerbAffixes = check_verb_affixes(word, prev_word, False, False)

            if sen_len == 1:
                code = 'SW'
            elif isDtmn(word):
                code = 'DT'
            elif isConj(word):
                code = 'CC'
            elif isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
                code = 'VB'
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
                code = 'JJ'
            elif isNoun(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
                code = 'NN'
            elif isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes):
                code = 'RB'
            elif isPrepo(word, prev_word):
                code = 'PR'
            else:
                code = 'UNK'

            words_by_tag[code].append(word)
            pos_list.append(code)

        for code, collected in words_by_tag.items():
            words_per_sentence[code].append(collected)
        pos_sen_list.append(pos_list)

    for column, code in categories:
        dict_sm_il[column] = words_per_sentence[code]
    dict_sm_il['POS'] = pos_sen_list

# Run the tagger over the tokenized sentences; tag() writes its result
# columns directly into dict_sm_il.
tag(dict_sm_il['Tokenized'])

# Preview the first 30 tagged rows.
dict_sm_il.head(30)

### SAVING FILES

In [None]:
import json

# One dict per table row, keyed by column name.
dictionary = dict_sm_il.to_dict('records')

# Saving stays best-effort (a failure is reported, not raised), but only the
# errors that writing or serialising can produce are caught, and the cause is
# printed instead of being hidden by a bare except.
try:
    with open("../../src/json data/Ilokano to Tagalog/il_pos.json", "w") as outfile:
        json.dump(dictionary, outfile)
except (OSError, TypeError, ValueError) as err:
    print("Error in saving the json file:", err)
else:
    print("successfully saved the json file")