# Error Modelling Function

In [13]:
from math import ceil, floor
from shingle import *

port_ro_cog = dict()
port_ro_noncog = dict()

In [14]:
def graph_model(first, second):
    ''' Constructs the graphical structure between two shingle sets. '''
    
    # Step 1: Initialization
    # If the given sets first and second are empty, we initialize 
    # them by inserting an empty token, (nun), into those sets.
    
    if len(first) == 0:
        first.append("nun") #insert empty token if found empty
    if len(second) == 0:
        second.append("nun") #insert empty token if found empty
        
    # Step 2: Equalization of the set cardinalities
    # The cardinalities of the sets first and second made
    # equal by inserting empty tokens (nun) into the
    # middle of the sets.
    
    # While loops to equalize the sizes
    while(len(first) < len(second)):
        pos = ceil(len(first) / 2)
        first.insert(pos, "nun")
    
    # While loops to equalize the sizes
    while(len(first) > len(second)):
        pos = floor(len(second) / 2)
        second.insert(pos, "nun")
        
    # Step 3: Inserting the mappings of the set members into the graph
    # The empty graph is initialized as graph = {}.
    # The directed edges are generated, originating from every set member
    # of first to every set member of second. This results in a complete 
    # directed bipartite graph between first and second sets.
    
    # Pairs in tuples
    graph = set() #Graph in sets to avoid duplicates
    
    for i in range(len(first)):
        pair = (first[i], second[i]) # One to one mapping with same index
        graph.add(pair)
    for i in range(len(first) - 1):
        pair = (first[i], second[i + 1]) # One to one mapping with an index ahead
        graph.add(pair)
    if len(first) > 1:
        for i in range(1, len(first)):
            pair = (first[i], second[i - 1]) # One to one mapping with an index before
            graph.add(pair)
    return graph

In [15]:
def common_elements(list1, list2):
    return [element for element in list1 if element in list2]

def uncommon_elements(list1, list2):
    return [element for element in list1 if element not in list2]

In [16]:
def init_dict():
    cogs = open("data/portuguese/graph_cognates_freq.txt", "r+")
    for line in cogs:
        splits = line.rstrip().split()
        port_ro_cog[(splits[1], splits[2])] = int(splits[0])
    cogs.close()
    noncogs = open("data/portuguese/graph_noncognates_freq.txt", "r+")
    for line in noncogs:
        splits = line.rstrip().split()
        port_ro_noncog[(splits[1], splits[2])] = int(splits[0])
    noncogs.close()

In [17]:
def pi(source, target, k = 1):
    query = two_ends(source, 2) #Your query
    document = two_ends(target, 2) #Your document
    qd = common_elements(query, document) # q cap d
    first = uncommon_elements(query, qd) # q - (q cap d)
    second = uncommon_elements(document, qd) # d - (q cap d)
    graph = graph_model(first,second)
    res = sum([port_ro_cog[i]**k for i in graph]) # sum the frequencies in the dictionary
    return res / len(graph)

In [20]:
init_dict()
pi(aspirat____aspirar)

KeyError: ('ar2', 't1')

In [21]:
print(port_ro_cog)

{('il2', '5vi'): 1, ('to3', 'to4'): 1, ('că2', 'co2'): 1, ('va2', 'r1'): 1, ('oz4', '4ló'): 1, ('ta5', 'nc4'): 1, ('4ol', 'li4'): 1, ('4ch', '4ca'): 1, ('ta5', 'tâ6'): 2, ('it3', 'os2'): 1, ('ar3', 'do2'): 1, ('ie2', 'çã3'): 20, ('si5', 'ss4'): 2, ('4ec', '5ec'): 1, ('nun', 'gi4'): 1, ('6te', '6it'): 2, ('iu2', 'io2'): 1, ('4po', '4pu'): 1, ('e1', 'ir2'): 1, ('că2', 'ca2'): 1, ('al3', 'lt3'): 1, ('oo5', '6oó'): 1, ('1k', '1c'): 1, ('rh2', 'ch3'): 1, ('rf5', '7or'): 1, ('it2', 'mi4'): 1, ('6uc', 'tâ6'): 1, ('ot4', 'nun'): 1, ('ic3', '5ed'): 1, ('un3', 'õe3'): 1, ('5ut', '5da'): 1, ('lu3', 'lo2'): 1, ('ad4', 'ád5'): 1, ('ba2', 'r1'): 1, ('2ră', '3es'): 2, ('nz4', 'nd4'): 1, ('in3', 'ud3'): 1, ('m1', 'a1'): 1, ('ra4', '6cr'): 1, ('nun', 'lo4'): 2, ('5no', '4nh'): 1, ('lu3', '4lu'): 2, ('ic2', 'ic3'): 13, ('at4', 'ad4'): 1, ('un3', 'es2'): 1, ('5ri', 'or2'): 1, ('ca4', '5ca'): 1, ('nun', 'ng5'): 1, ('aț6', 'aç4'): 2, ('2el', '3el'): 1, ('6pu', '5po'): 1, ('nun', 'rc4'): 1, ('1e', '2he'): 3