In [26]:
# setting up the environment
import os
import re
from os.path import join, isfile, splitext
from os import listdir
import pandas as pd
from IPython.core.display import display, HTML
import sys
import yaml

from nomorwhat import *

# Widen pandas' display limits so large frames are shown without truncation.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)
# None is the supported way to disable column-width truncation; the former
# -1 spelling was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

%load_ext autoreload
%autoreload 2

# YAML file holding the macro definitions used by check_macro.
macro_config_file = 'macros.yml'
# Input text: read via readtext() and, line by line, by the file demo cell.
input_file = 'test_input.txt'
# Annotation file produced by write_to_annot_file.
output_file = 'test_output.txt'
# Existing annotation file that merge_annot_with_outp reads as its base.
orig_annot_file = 'test_full_annot.txt'
# Destination written by merge_annot_with_outp.
merged_output_file = 'test_merged_output.txt'



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  app.launch_new_instance()


In [3]:
# reading macros
with open(macro_config_file, 'r') as fin:
    # safe_load builds only plain Python objects (dicts, lists, strings, ...).
    # Calling yaml.load without an explicit Loader is deprecated and can
    # construct arbitrary objects from the file.
    macros = yaml.safe_load(fin)
my_sentences = readtext(input_file)

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:

def check_macro(macro_name, anal):
    """Evaluate the macro called `macro_name` against one token analysis.

    The macro definition is looked up in the global `macros['macros']`
    mapping and dispatched on its 'type' field:

    * 'list'    -- True when `anal[0]` is a member of the macro's 'value'.
    * 'regex'   -- for 'regexp_type' == 'search', the result of `re.search`
                   of 'value' on `anal[2]` (a match object or None); any
                   other 'regexp_type' yields False.
    * 'ends'    -- whether `anal[2]` ends with 'value'.
    * 'complex' -- combines 'sub_macros' with 'compl_type' 'and' / 'or';
                   any other 'compl_type' yields False.
    * 'neg'     -- logical negation of the macro named by 'sub_macro'.

    Any other type yields False. Callers in this notebook pass
    (word, lemma, annotation) triples as `anal`.
    """
    spec = macros['macros'][macro_name]
    kind = spec['type']

    if kind == 'list':
        return anal[0] in spec['value']

    if kind == 'regex':
        if spec['regexp_type'] == 'search':
            return re.search(spec['value'], anal[2])
        # Only the 'search' flavour is implemented.
        return False

    if kind == 'ends':
        return anal[2].endswith(spec['value'])

    if kind == 'complex':
        combinator = spec['compl_type']
        if combinator == 'and':
            # all() stops at the first failing sub-macro.
            return all(check_macro(sub, anal) for sub in spec['sub_macros'])
        if combinator == 'or':
            # any() stops at the first succeeding sub-macro.
            return any(check_macro(sub, anal) for sub in spec['sub_macros'])
        return False

    if kind == 'neg':
        return not check_macro(spec['sub_macro'], anal)

    return False


In [5]:
#macro_name = 'nu_mn'
#macros['macros'][macro_name]

# Sanity checks for the loaded macros. Each row is
# (macro name, token triple, expected truthiness, message printed on success).
_macro_checks = [
    ('not_gen', ['az', 'az', 'DET'], True, "ugyesvagy"),
    ('NPMod', ['szép', 'szép', 'MN.NOM'], True, "ugyesvagy"),
    ('NOM', ["almák", 'alma', 'FN.PL.NOM'], True, "ugyesvagy"),
    ('not_kop_v', ['teszek', 'tesz', 'IGE.SG1'], True, "ugyesvagy"),
    ('def_art', ['a', 'a', 'DET'], True, 'ugyesvagy'),
    ('def_art', ['az', 'az', 'FN_NM.NOM'], False, 'ugyesvagy'),
    ('Adj_tree', ['szépek', 'szép', 'MN.PL.NOM'], False, 'ugyesvagy'),
    ('NPMod', ['1848_49-es', '1848_49-es', 'MN.NOM'], True, "na"),
]
for _name, _token, _expected, _msg in _macro_checks:
    # check_macro may return a match object, so compare truthiness only.
    if bool(check_macro(_name, _token)) is _expected:
        print(_msg)

ugyesvagy
ugyesvagy
ugyesvagy
ugyesvagy
ugyesvagy
ugyesvagy
ugyesvagy
na


In [6]:
def NUM_rules(window, curr_POS):
    """Re-tag the 'NOM' label of a numeral from the token that follows it.

    `window` holds the next two tokens as (word, lemma, annotation) triples;
    only the first one drives the decision, but both are unpacked, so a
    shorter window raises. Returns `curr_POS` with 'NOM' replaced by:

    * 'semmi'    -- next annotation matches FN|SZN|MN|NU (nominal or postposition);
    * 'nom'      -- next token satisfies 'not_kop_v' (non-copular verb) or
                    'def_art', or its annotation contains 'HA' (adverb);
    * 'defsemmi' -- otherwise.
    """
    next_token = window[0]
    _word1, _lemma1, next_annot = next_token
    _word2, _lemma2, _annot2 = window[1]  # unused, but keeps the window-shape requirement

    if re.search("FN|SZN|MN|NU", next_annot):
        case_label = 'semmi'
    elif (check_macro('not_kop_v', next_token)
          or check_macro('def_art', next_token)
          or re.search('HA', next_annot)):
        case_label = 'nom'
    else:
        case_label = 'defsemmi'

    return curr_POS.replace('NOM', case_label)

    

In [7]:
def ADJ_rules(window, curr_POS):
    """Re-tag the 'NOM' label of an adjective-like token from its right context.

    `window` holds the next two tokens as (word, lemma, annotation) triples;
    both are unpacked, so a shorter window raises. The first matching rule
    decides what replaces 'NOM' in `curr_POS`:

    1. next token is postposition-like ('NUs')        -> 'semmi'
    2. next token is a non-copular verb ('not_kop_v') -> 'nom'
    3. next token is a numeral ('szn')                -> 'gen' if the second
       token carries a possessive suffix ('PSE'), else 'nom'
    4. next token is a 'full_stop'                    -> 'nom'
    5. otherwise                                      -> 'semmi' if the second
       token is punctuation ('PUNCT'), else 'defsemmi'
    """
    nxt = window[0]
    after_nxt = window[1]
    _w1, _l1, _a1 = nxt        # unused; keeps the triple-shape requirement
    _w2, _l2, _a2 = after_nxt  # unused; keeps the triple-shape requirement

    if check_macro('NUs', nxt):
        return curr_POS.replace('NOM', 'semmi')

    if check_macro('not_kop_v', nxt):
        return curr_POS.replace('NOM', 'nom')

    if check_macro('szn', nxt):
        # Goes through the intermediate 'nulla' placeholder, then resolves it.
        resolved = 'gen' if check_macro('PSE', after_nxt) else 'nom'
        return curr_POS.replace('NOM', 'nulla').replace('nulla', resolved)

    if check_macro('full_stop', nxt):
        return curr_POS.replace('NOM', 'nom')

    retagged = curr_POS.replace('NOM', 'defsemmi')
    if check_macro('PUNCT', after_nxt):
        retagged = retagged.replace('defsemmi', 'semmi')
    return retagged

In [20]:
def NOUN_rules(window, curr_POS, curr_word):
    """Re-tag the 'NOM' label of a noun-like token from its right context.

    Parameters
    ----------
    window : sequence
        The next two tokens as (word, lemma, annotation) triples; both are
        unpacked, so a shorter window raises.
    curr_POS : str
        Annotation of the current token; its 'NOM' part is replaced.
    curr_word :
        Passed unchanged to check_macro('not_gen', ...).
        NOTE(review): check_macro indexes [0] and [2] of this value, so a
        (word, lemma, annotation) triple looks intended -- confirm with the caller.

    Returns
    -------
    str
        `curr_POS` with 'NOM' replaced by 'semmi', 'nom', 'gen' or 'nulla'
        according to the first matching rule below.
    """
    first_right_word, first_right_lemma, first_right_annot = window[0]
    second_right_word, second_right_lemma, second_right_annot = window[1]

    if check_macro('NUs', window[0]):  # next token is postposition-like: the word gets 'semmi' (none)
        curr_POS = curr_POS.replace('NOM', 'semmi')

    elif check_macro('not_gen', curr_word):
        curr_POS = curr_POS.replace('NOM', 'nom')

    elif check_macro('PSE', window[0]):  # next token matches 'PSE': the word is tagged 'gen'
        curr_POS = curr_POS.replace('NOM', 'gen')

    elif check_macro('not_kop_v', window[0]) or check_macro('pl', window[0]) or first_right_annot == 'IK':
        curr_POS = curr_POS.replace('NOM', 'nom')

    elif check_macro('TULN', window[0]):
        curr_POS = curr_POS.replace('NOM', 'nulla')

        if not check_macro('cimunevu', window[1]):
            curr_POS = curr_POS.replace('nulla', 'nom')

    elif check_macro('NPMod', window[0]):
        curr_POS = curr_POS.replace('NOM', 'nulla')

        if check_macro('PSE', window[1]) and not re.search('PSe3', curr_POS):
            curr_POS = curr_POS.replace('nulla', 'gen')
        elif check_macro('V', window[1]) or check_macro('PUNCT', window[1]) or check_macro('vonatk', window[1]) or re.search('PSe3', curr_POS):
            # Fixed: was `curr.POS.replace`, a NameError (`curr` is undefined).
            curr_POS = curr_POS.replace('nulla', 'nom')

    elif check_macro('full_stop', window[0]):
        curr_POS = curr_POS.replace('NOM', 'nom')

    else:
        curr_POS = curr_POS.replace('NOM', 'nulla')

        if check_macro('V', window[1]) or check_macro('PUNCT', window[1]) or check_macro('vonatk', window[1]) or re.search('PSe3', curr_POS):
            # Fixed: was `curr.POS.replace`, a NameError (`curr` is undefined).
            curr_POS = curr_POS.replace('nulla', 'nom')

    return curr_POS

In [21]:
def nom_or_what(s):
    """Re-tag every suffixless nominal ('NOM') token of one sentence.

    `s` is handed to `format_sents`, whose result is iterated as a sequence
    of (word, lemma, annotation) tokens. For each token that is not the '#'
    marker and satisfies the 'NOM' macro, the annotation is rewritten by
    NOUN_rules, ADJ_rules or NUM_rules, chosen by the 'Noun_tree',
    'Adj_tree' and 'Num_tree' macros, using the next two tokens as context.

    Returns
    -------
    tuple
        (new_sent, to_write_later): `new_sent` is the list of
        'word/lemma/annotation' strings carrying the new tags;
        `to_write_later` is a list of (window, 'word lemma annotation')
        pairs, one per 'NOM' token, used to build the annotation file.
    """
    sent = format_sents(s)
    new_sent = []        # the sentence with the new tags
    to_write_later = []  # (window, new annotation) for every NOM token

    for i, token in enumerate(sent):
        curr_word = token[0]
        curr_lemma = token[1]
        curr_POS = token[2]

        if curr_word != "#":  # '#' is not a real token to classify
            window = sent[i + 1:i + 3]  # the next two tokens

            if check_macro('NOM', token):  # the token is a suffixless nominal

                if check_macro('Noun_tree', token):  # noun, or plural adjective / participle
                    # Pass the whole token: NOUN_rules forwards this value to
                    # check_macro, which indexes it as (word, lemma, annotation).
                    # Passing only the surface string made it compare single
                    # characters of the word instead.
                    curr_POS = NOUN_rules(window, curr_POS, token)

                elif check_macro('Adj_tree', token):  # singular adjective or participle
                    curr_POS = ADJ_rules(window, curr_POS)

                elif check_macro('Num_tree', token):  # numeral
                    curr_POS = NUM_rules(window, curr_POS)

                new_token = curr_word + ' ' + curr_lemma + ' ' + curr_POS
                to_write_later.append((window, new_token))

        new_sent.append(curr_word + '/' + curr_lemma + '/' + curr_POS)

    return (new_sent, to_write_later)


    # print(' '.join(new_sent))
    
    

In [37]:
def write_to_annot_file(new_sent, window, outp, i):
    """Write one sentence and its NOM tokens to the annotation file.

    Parameters
    ----------
    new_sent : list of str
        'word/lemma/annotation' strings; only the word part is written.
    window : iterable
        (context, annotation) pairs as returned by `nom_or_what`: `context`
        holds the two following tokens, `annotation` is the
        'word lemma annotation' string of the NOM token.
    outp : writable text file object
    i : int
        Sentence number printed before the sentence.

    Output layout: '<i>. <words>', then for each pair a
    '-<word> <next word> <second next word>' line followed by the
    annotation repeated on three lines, then two blank lines.
    """
    outp.write(str(i) + '. ' + ' '.join(token.split('/')[0] for token in new_sent) + '\n')
    # Iterate over the argument; the previous version read the notebook-global
    # `to_write` and only worked when that global happened to hold the same data.
    for context, nom in window:
        outp.write('-' + nom.split()[0] + ' ' + context[0][0] + ' ' + context[1][0] + '\n')
        outp.write((nom + '\n') * 3)
    outp.write('\n\n')
    

In [23]:
# 1 sent, demo
# Tag one hard-coded sentence; `new_sent` is the re-tagged sentence and
# `to_write` the (window, annotation) pairs for its NOM tokens.
new_sent, to_write = nom_or_what('Több/sok/SZN._FOK.NOM levelet/levél/FN.ACC írt/ír/IGE.Me3 a/a/DET másik/másik/MN_NM.NOM honti/honti/MN.NOM jóbarátnak/jóbarát/FN.DAT és/és/KOT munkatársnak/munkatárs/FN.DAT ,/,/WPUNCT Pajor_István/Pajor_István/TULN.NOM 1848_49-es/1848_49-es/MN.NOM nemzetőrnek/nemzetőr/FN.DAT ,/,/WPUNCT megyei/megyei/MN.NOM tisztviselőnek/tisztviselő/FN.DAT ,/,/WPUNCT írónak/író/FN.DAT és/és/KOT költőnek/költő/FN.DAT is/is/HA ././SPUNCT')
# Opening in 'w' mode overwrites output_file with this single sentence.
with open (output_file, 'w') as outp:
    write_to_annot_file(new_sent,to_write, outp, 1)

    

        

In [38]:
# 10 sents, demo, file
# Tag every line of input_file and write the numbered annotation blocks to
# output_file (opened in 'w' mode, so earlier content is replaced).
with open(input_file, 'r') as inp, open(output_file, 'w') as outp:
    # Sentence numbers in the output start at 1.
    for i, line in enumerate(inp, start = 1):
        new_sent, to_write = nom_or_what(line)
        write_to_annot_file(new_sent, to_write, outp, i)


In [45]:
def merge_annot_with_outp():
    """Merge the generated annotation into the original annotation file.

    Reads `orig_annot_file` and `output_file` line by line and writes
    `merged_output_file`. Lines are copied from the original annotation,
    except that the third line after each line starting with '-' is taken
    from the generated output at the same line index. The number of lines
    in the original annotation is printed.

    Raises IndexError if the generated output has fewer lines than the
    index being substituted.
    """
    # Context managers close all three files even if the merge fails midway;
    # the previous version never closed the two input files.
    with open(orig_annot_file, 'r') as annot_in:
        annot = annot_in.readlines()
    with open(output_file, 'r') as outp_in:
        outp = outp_in.readlines()

    print(len(annot))

    with open(merged_output_file, 'w') as merged_outp:
        j = -1  # index of the next line to take from the generated output
        for i in range(len(annot)):
            if i == j:
                merged_outp.write(outp[i])
            else:
                merged_outp.write(annot[i])
            if annot[i].startswith('-'):
                j = i + 3
    
        

merge_annot_with_outp()

In [46]:
merge_annot_with_outp()

132
