In [None]:
# Input lexicon; read_to_dict() below expects three whitespace-separated
# fields per line (word1, word2, numeric value).
file_name = 'lexicon.eflomal_clean.txt'

In [None]:
def read_to_dict(filename):
    """Load a whitespace-separated lexicon file into a dict of candidate lists.

    Every non-blank line must hold exactly three fields, ``word1 word2 value``,
    where ``value`` is parseable by ``float``.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A dict mapping each ``word1`` to a list of ``(word2, value)`` tuples,
        in the order the lines appear in the file.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            (the message names the file and line number), or if ``value`` is
            not a valid float.
    """
    result = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # entry; skip them instead of failing on the unpack below.
                continue
            if len(parts) != 3:
                raise ValueError(
                    f"{filename}:{line_no}: expected 3 fields, "
                    f"got {len(parts)}: {line!r}"
                )
            word1, word2, value = parts
            result.setdefault(word1, []).append((word2, float(value)))
    return result

In [None]:
# Maps each first-column word to its list of (second-column word, value) pairs.
words = read_to_dict(file_name)

In [None]:
# Dump the full mapping for a quick visual check (output grows with the lexicon).
print(words)

{'jesus': [('yesungo', 0.64804469), ('yesu', 0.19553073), ('yesuaru', 0.05027933), ('ifi', 0.02793296), ('yesuac', 0.02234637), ('kensikema', 0.01675978), ('yofi', 0.01117318), ('ammigec', 0.00558659), ('anec', 0.00558659), ('anoc', 0.00558659), ('edeyunuec', 0.00558659), ('ioha', 0.00558659)], 'jesusʼ': [('yesuac', 1.0)], 'jews': [('ngictau', 0.4), ('yudangic', 0.4), ('ngeneac', 0.2)], 'john': [('yohane', 0.73076923), ('yohanengo', 0.15384615), ('garenggeecac', 0.03846154), ('negentegecma', 0.03846154), ('ubia', 0.03846154)], 'bethany': [('betania', 0.75), ('betani', 0.25)]}


In [None]:
def min_edit_distance(s1, s2):
    """Return a position-weighted edit distance from ``s1`` to ``s2``.

    Deleting a character of ``s1`` or inserting a character of ``s2`` costs 1
    when that character sits at the first or last index of its own string and
    3 otherwise. Replacing a character is priced by the position of the
    character in ``s1`` only, so the result is not symmetric in general
    (e.g. ``('ax', 'abc')`` gives 2 while ``('abc', 'ax')`` gives 4).

    Args:
        s1: Source sequence.
        s2: Target sequence.

    Returns:
        The minimum total cost as an int.
    """
    rows, cols = len(s1), len(s2)

    def edge_weight(index, length):
        # Edits at either end of a string are cheap; interior edits are not.
        if index in (0, length - 1):
            return 1
        return 3

    # Per-position edit prices for each string, computed once up front.
    weights_a = [edge_weight(k, rows) for k in range(rows)]
    weights_b = [edge_weight(k, cols) for k in range(cols)]

    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    # First column: delete every character of the s1 prefix.
    for r in range(1, rows + 1):
        table[r][0] = table[r - 1][0] + weights_a[r - 1]
    # First row: insert every character of the s2 prefix.
    for c in range(1, cols + 1):
        table[0][c] = table[0][c - 1] + weights_b[c - 1]

    for r, ch_a in enumerate(s1, start=1):
        w_a = weights_a[r - 1]
        for c, ch_b in enumerate(s2, start=1):
            replace = table[r - 1][c - 1] + (0 if ch_a == ch_b else w_a)
            delete = table[r - 1][c] + w_a
            insert = table[r][c - 1] + weights_b[c - 1]
            table[r][c] = min(delete, insert, replace)

    return table[rows][cols]

In [None]:
# Sanity check: the strings differ only in their first character, and an edit at
# the first position costs 1.
print(min_edit_distance('nga', 'mga'))

1


In [None]:
def special_med(shorter, longer):
    """Decide whether ``shorter`` approximately occurs inside ``longer``.

    The checks run in this order:

    1. A length gap above 5 is rejected outright.
    2. An exact substring match is accepted.
    3. A ``shorter`` of fewer than 3 characters is rejected (too little
       material for a fuzzy match).
    4. Otherwise every window of ``longer`` with the same length as
       ``shorter`` is compared using ``min_edit_distance``; the pair is
       accepted if some window is within distance 1, or within distance 2
       when the window has at least 5 characters.

    Args:
        shorter: The string expected to be the shorter of the two.
        longer: The string expected to be at least as long as ``shorter``.

    Returns:
        True if the pair passes the checks above, False otherwise.
    """
    len_s, len_l = len(shorter), len(longer)

    if len_l - len_s > 5:
        return False

    if shorter in longer:
        return True

    if len_s < 3:
        return False

    # NOTE(review): an extra guard `len(longer) < 5 and gap > 2` used to sit
    # here, but it could never fire: with len(shorter) >= 3 a gap above 2
    # forces len(longer) >= 6. It is omitted; confirm no other condition was
    # intended.

    # Every window has exactly len_s characters, so the allowed distance is the
    # same for all of them.
    max_distance = 2 if len_s >= 5 else 1

    for start in range(len_l - len_s + 1):
        window = longer[start:start + len_s]
        # One distance computation per window is enough for the comparison.
        if min_edit_distance(shorter, window) <= max_distance:
            return True
    return False

In [None]:
# 'nga' is not a substring of 'negemma', and no 3-character window of it is
# within distance 1, so this is rejected.
print(special_med('nga', 'negemma'))

False


In [None]:
from string import punctuation

def group_strings(pairs):
    """Build one similarity group around each punctuation-free string.

    The pairs are ordered by value (highest first) and, among equal values, by
    string length (longest first). Strings containing any character from
    ``string.punctuation`` are ignored entirely. Each remaining string becomes
    the head of a group that collects every other remaining string for which
    ``special_med(shorter_of_the_two, longer_of_the_two)`` holds. Groups may
    overlap: the same string can be a member of several groups.

    Args:
        pairs: Iterable of ``(string, value)`` tuples.

    Returns:
        A list of ``(head, members, total)`` tuples in the order described
        above, where ``members`` is the list of matched strings and ``total``
        is the head's value plus the values of all members.
    """
    def has_punctuation(text):
        return any(ch in text for ch in punctuation)

    ranked = sorted(pairs, key=lambda item: (-item[1], -len(item[0])))
    # Punctuated entries never head a group nor join one, so drop them once.
    candidates = [entry for entry in ranked if not has_punctuation(entry[0])]

    grouped = []
    for head_idx, (head, head_value) in enumerate(candidates):
        members = []
        total = head_value
        for other_idx, (other, other_value) in enumerate(candidates):
            if other_idx == head_idx:
                continue
            # special_med expects the shorter string first; on equal length the
            # head goes first.
            if len(other) < len(head):
                short, long_ = other, head
            else:
                short, long_ = head, other
            if special_med(short, long_):
                members.append(other)
                total += other_value
        grouped.append((head, members, total))
    return grouped

In [None]:
# Small sample (the top five candidates listed for 'jesus' in the output above)
# used to demonstrate the grouping.
data = [('yesungo', 0.64804469), ('yesu', 0.19553073), ('yesuaru', 0.05027933), ('ifi', 0.02793296), ('yesuac', 0.02234637)]
print(group_strings(data))

[('yesungo', ['yesu'], 0.84357542), ('yesu', ['yesungo', 'yesuaru', 'yesuac'], 0.9162011199999999), ('yesuaru', ['yesu', 'yesuac'], 0.26815643), ('ifi', [], 0.02793296), ('yesuac', ['yesu', 'yesuaru'], 0.26815643)]


In [None]:
# Group the candidate list of every lexicon entry; keys keep the order of `words`.
result_dict = {
    source_word: group_strings(candidates)
    for source_word, candidates in words.items()
}

print(result_dict)

{'abiud': [('abihud', [], 1.0)], 'abraham': [('abraham', [], 1.0)], 'abías': [('abías', [], 1.0)], 'acaz': [('acaz', [], 1.0)], 'ak': [('cerdos', [], 0.13333333), ('beelzebú', [], 0.06666667), ('injustos', [], 0.06666667), ('guardia', [], 0.06666667), ('buenos', [], 0.06666667), ('malos', [], 0.06666667)], 'akbaꞌ': [('dispersarán', [], 0.25), ('comiendo', [], 0.25), ('graneros', [], 0.25)], 'akbaꞌn': [('pendiente', [], 0.25), ('comieron', [], 0.25), ('pisoteen', [], 0.25), ('correr', [], 0.25)], 'akbeꞌ': [('compañeros', [], 0.33333333), ('templo', [], 0.33333333)], 'akbeꞌn': [('muertos', [], 1.0)], 'ake': [('encuentran', [], 0.25), ('partieron', [], 0.25), ('mancos', [], 0.25), ('bien', [], 0.25)], 'akebaꞌ': [('escoger', [], 0.33333333), ('bueno', [], 0.33333333), ('malo', [], 0.33333333)], 'akebeꞌ': [('homenaje', [], 0.25), ('sanarlo', [], 0.25), ('nombre', [], 0.25), ('duda', [], 0.25)], 'aken': [('destruido', [], 0.11111111), ('demonios', [], 0.11111111), ('cosecha', [], 0.11111111)

In [None]:
# Write every entry with all of its groups, one blank line after each group.
with open('word_pairs_all.txt', 'w', encoding='utf-8') as out_file:
    for source_word, source_groups in result_dict.items():
        out_file.write(f"{source_word}:\n")
        # Highest total first; among equal totals the shorter head word leads.
        ranked = sorted(source_groups, key=lambda entry: (-entry[2], len(entry[0])))
        for head, members, total in ranked:
            out_file.write(f"  {head}: {total}, from words: {members}\n")
            out_file.write("\n")

In [None]:
# Write a single best group per entry to word_pairs.txt.
#
# Only groups whose summed value reaches MIN_GROUP_TOTAL are considered. Among
# those, prefer the first one whose head word on its own already had a value
# above MIN_ORIGINAL_VALUE in the original file; otherwise take the group with
# the highest summed value. Entries with no qualifying group are not written.
MIN_GROUP_TOTAL = 0.65
MIN_ORIGINAL_VALUE = 0.5

with open('word_pairs.txt', 'w', encoding='utf-8') as f:
    for word1, groups in result_dict.items():
        # Highest total first; among equal totals the shorter head word leads.
        groups = sorted(groups, key=lambda x: (-x[2], len(x[0])))
        groups_to_write = [group for group in groups if group[2] >= MIN_GROUP_TOTAL]
        if not groups_to_write:
            continue

        # Head words that were already confident in the original file.
        strong_heads = {trg[0] for trg in words[word1] if trg[1] > MIN_ORIGINAL_VALUE}

        # Fall back to groups_to_write[0] (the highest total). Relying on the
        # loop variable after an exhausted loop would pick the last, i.e. the
        # lowest-scoring, candidate instead.
        chosen = next(
            (group for group in groups_to_write if group[0] in strong_heads),
            groups_to_write[0],
        )

        f.write(f"{word1}:\n")
        f.write(f"  {chosen[0]}: {chosen[2]}, from words: {chosen[1]}\n")
        f.write("\n")