In [101]:
import os
import json
import random
import sys

In [102]:
def read_protocols_tagged_by_hand(file = "k2sitsi_morfiga_protokollid.txt"):
    protocols_tagged_by_hand = []
    with open(file, 'r', encoding="UTF-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("\n", "")
        if line != "" and (line != "k2sitsi_morfiga_protokollid2:" and line != "k2sitsi_morfiga_protokollid1:"):
            protocols_tagged_by_hand.append(line)
    
    return protocols_tagged_by_hand

In [103]:
def calculate_whole_corpus_distribution(filenames = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in filenames:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

In [104]:
def find_document_ne_statistics(filenames = os.listdir("vallakohtufailid_json/")):
    statistics = dict()
    for file in filenames:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

In [105]:
statistics = find_document_ne_statistics()
ideal_distribution = calculate_whole_corpus_distribution()
protocols_tagged_by_hand = read_protocols_tagged_by_hand()

In [108]:
statistics

{'Harju_Jyri_Rae_id277_1874a.json': {'LOC_ORG': 1, 'LOC': 3, 'PER': 12},
 'Tartu_Kodavere_Alatskivi_id6838_1879a.json': {'LOC': 3, 'PER': 19},
 'J2rva_Tyri_S2revere_id14656_1887a.json': {'LOC_ORG': 2,
  'LOC': 1,
  'ORG': 1,
  'PER': 10},
 'Harju_Juuru_Kaiu_id1203_1895a.json': {'PER': 11, 'ORG': 1, 'LOC_ORG': 1},
 'L22ne_Reigi_K6rgessaare_id23306_1895a.json': {'LOC_ORG': 3, 'PER': 19},
 'Harju_Hageri_Kohila_id7391_1868a.json': {'LOC_ORG': 3, 'PER': 8},
 'L22ne_Kullamaa_Kuij6e_id15473_1876a.json': {'LOC_ORG': 1,
  'LOC': 2,
  'PER': 18},
 'J2rva_Tyri_Kirna_id23402_1872a.json': {'PER': 17},
 'Harju_Hageri_Kohila_id11175_1875a.json': {'LOC': 5,
  'PER': 45,
  'LOC_ORG': 15},
 'V6ru_Vastseliina_Misso_id14456_1881a.json': {'PER': 11},
 'J2rva_Tyri_V22tsa_id22177_1911a.json': {},
 'Viljandi_P6ltsamaa_Adavere_id20850_1896a.json': {'LOC_ORG': 3, 'PER': 12},
 'Tartu_Kodavere_Ranna_id19679_1865a.json': {'PER': 30, 'LOC_ORG': 1},
 'Tartu_V6nnu_Ahja_id23394_1893a.json': {'LOC_ORG': 2, 'PER': 18},


In [136]:
def calculate_proportions(filenames):
    statistics_for_proportion = {}
    
    for file in filenames:
        statistics_for_proportion[file] = statistics[file]

    all_annotations = list()
    for item in statistics_for_proportion.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
                
    proportions = dict()
    for file in statistics_for_proportion:
        for key in statistics_for_proportion[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def calculate_score(filenames):
    proportions = calculate_proportions(filenames)
    score = 0
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

def n_even_chunks(filenames, n):
    files = []
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(filenames) / n)))
        files.append(filenames[last:current])
        last = current
    return files

def generate_random_division(filenames, n):
    random_distributions = []
    if n != 0:
        filenames = random.sample(filenames, len(filenames))
        random_distributions.append(filenames)
        n -= 1
    
    return random_distributions

def swap_items_in_lists(A, B, i, j):
    temp_b = B[j]
    B[j] = A[i]
    A[i] = temp_b
    return A,B


seed = 0
random.seed(seed)
print("Seed:", seed)

random_distributions = generate_random_division(os.listdir("vallakohtufailid_json/"), 10)
print(random_distributions)
for distribution in random_distributions:
    print(distribution)
#subdistributions = []
#for distribution in random_distributions:
#    distribution = n_even_chunks(distribution, 6)
#    subdistributions.append(distribution)

scores = dict()

#for files in subdistributions:
 #   score = calculate_score(files)
  #  print(score)

Seed: 0
[['Viljandi_Viljandi_Karula_id19401_1868a.json', 'Harju_Kose_Triigi_id11473_1871a.json', 'P2rnu_P2rnu-Elisabethi_Sauga_id17814_1868a.json', 'Tartu_V6nnu_Ahja_id9067_1871a.json', 'V6ru_R2pina_Kahkva_id5958_1887a.json', 'Harju_Kose_Kose-Uuem6isa_id2174_1867a.json', 'Tartu_Kodavere_Ranna_id14286_1858a.json', 'J2rva_Tyri_V22tsa_id16836_1886a.json', 'Harju_Hageri_Kohila_id22513_1867a.json', 'Harju_Hageri_Kohila_id21266_1885a.json', 'L22ne_Kullamaa_Piirsalu_id16751_1889a.json', 'Saare_P8ide_Laimjala_id5898_1914a.json', 'Tartu_V6nnu_Ahja_id21074_1889a.json', 'Harju_Hageri_Kohila_id10758_1873a.json', 'Tartu_Kodavere_Pala_id22605_1871a.json', 'Tartu_Kodavere_Alatskivi_id15347_1876a.json', 'Viljandi_Tarvastu_Tarvastu_id4805_1875a.json', 'L22ne_Emmaste_Emmaste_id15415_1896a.json', 'Tartu_Kodavere_Pala_id18128_1862a.json', 'Tartu_V6nnu_Ahja_id21240_1889a.json', 'J2rva_Ambla_Ambla_id7441_1887a.json', 'V6ru_Vastseliina_Misso_id24907_1886a.json', 'Tartu_V6nnu_Ahja_id18630_1886a.json', 'P2rnu_

In [120]:
for i, x in enumerate(largest):
    for j, y in enumerate(second_largest):   
        score_largest_old = calculate_score(largest)
        largest, second_largest = swap_items_in_lists(largest, second_largest, i, j)
        score_largest_new = calculate_score(largest)

        if score_largest_old < score_largest_new:
            print("Skoor halvenes, algne:", score_largest_old, "hiljem:", score_largest_new)
            swap_items_in_lists(largest, second_largest, i, j)
        elif score_largest_old == score_largest_new:
            print("Skoor jäi samaks, algne:", score_largest_old, "hiljem:", score_largest_new)
            swap_items_in_lists(largest, second_largest, i, j)
        else:
            print("Skoor paranes, algne:", score_largest_old, "hiljem:", score_largest_new)
            continue

scores_list = sorted(scores.keys())
largest = scores[scores_list[-1]]
second_largest = scores[scores_list[-2]]

Skoor halvenes, algne: 11.261351527854105 hiljem: 11.407977625924895
Skoor halvenes, algne: 11.261351527854105 hiljem: 11.303349340511852
Skoor halvenes, algne: 11.261351527854105 hiljem: 11.269744580358507
Skoor halvenes, algne: 11.261351527854105 hiljem: 11.269744580358507
Skoor halvenes, algne: 11.261351527854105 hiljem: 11.343955829847758
Skoor halvenes, algne: 11.261351527854105 hiljem: 11.303880911724058


KeyboardInterrupt: 