In [23]:
import os
import json
import random
import sys

In [2]:
def read_protocols_tagged_by_hand(file = "k2sitsi_morfiga_protokollid.txt"):
    protocols_tagged_by_hand = []
    with open(file, 'r', encoding="UTF-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("\n", "")
        if line != "" and (line != "k2sitsi_morfiga_protokollid2:" and line != "k2sitsi_morfiga_protokollid1:"):
            protocols_tagged_by_hand.append(line)
    
    return protocols_tagged_by_hand

In [3]:
def calculate_whole_corpus_distribution(filenames = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in filenames:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

In [24]:
def find_document_ne_statistics(filenames = os.listdir("vallakohtufailid_json/")):
    statistics = dict()
    for file in filenames:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

Statistics, the ideal distribution and the protocols tagged by hand:

In [25]:
statistics = find_document_ne_statistics()
ideal_distribution = calculate_whole_corpus_distribution()
protocols_tagged_by_hand = read_protocols_tagged_by_hand()

In [71]:
def improve_scores(largest, second_largest):
    for i, x in enumerate(largest):
        for j, y in enumerate(second_largest):
            score_largest_old = calculate_score(largest)
            score_second_largest_old = calculate_score(second_largest)
            largest, second_largest = swap_items_in_lists(largest, second_largest, i, j)
            score_largest_new = calculate_score(largest)
            score_second_largest_new = calculate_score(second_largest)

            if score_largest_old > score_largest_new and score_second_largest_old > score_second_largest_new:
                print(f"{i, j} Skoor paranes mõlema alamkorpuse lõikes {score_largest_old-score_largest_new+score_second_largest_old-score_second_largest_new} protsendipunkti võrra.")
            else:
                print(f"{i, j}")
                swap_items_in_lists(largest, second_largest, i, j)
    return largest, second_largest

In [32]:
def calculate_proportions(filenames):
    statistics_for_proportion = {}
    
    for file in filenames:
        statistics_for_proportion[file] = statistics[file]

    all_annotations = list()
    for item in statistics_for_proportion.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
                
    proportions = dict()
    for file in statistics_for_proportion:
        for key in statistics_for_proportion[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def calculate_score(filenames):
    proportions = calculate_proportions(filenames)
    score = 0
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

def n_even_chunks(filenames, n):
    files = []
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(filenames) / n)))
        files.append(filenames[last:current])
        last = current
    return files

def generate_random_division(filenames, n):
    random_distributions = []
    for i in range(n):
        filenames = random.sample(filenames, len(filenames))
        random_distributions.append(filenames)
    
    return random_distributions

def swap_items_in_lists(A, B, i, j):
    temp_b = B[j]
    B[j] = A[i]
    A[i] = temp_b
    return A,B

In [66]:
seed = 0
random.seed(seed)
print("Seed:", seed)

subdistributions = list()
random_distributions = generate_random_division(os.listdir("vallakohtufailid_json/"), 10)
for distribution in random_distributions:
    distribution = n_even_chunks(distribution, 6)
    subdistributions.append(distribution)
    
sums_of_distributions = list()
list_of_scores = list()

for distribution in subdistributions:
    scores = dict()
    for files in distribution:
        score = calculate_score(files)
        scores[score] = files
        print(score)
    list_of_scores.append(scores)
    print(" ")

for listofscores in list_of_scores:    
    sums_of_distributions.append(sum(listofscores.keys()))

Seed: 0
7.217010759280128
6.97244658028972
9.121278140885995
10.923169877640092
7.724447992371964
10.22348956932398
 
7.754266552310438
9.719299234720939
12.424427655539487
6.278466852474673
6.4430444329659835
8.783316869420872
 
8.860460264550053
6.564965579245409
5.286362879817535
14.602432044028337
10.346528655721295
5.901320113343106
 
9.883690587424145
10.745335954235902
8.622106233493378
9.501132669782503
8.248440301611476
5.319375738207539
 
4.641409067001818
6.9809521309639555
7.872721048846141
9.260491467612594
13.923130427722004
8.729411549689921
 
11.733689752128921
7.088538439500839
10.889844468155209
12.294227222778984
8.682787425926874
1.827852103441948
 
4.299134229558022
11.5355879812799
8.254008155968936
11.15988071360479
9.773286517051986
7.036654817872568
 
10.79300133005725
6.912815808584654
7.334544029765814
10.0250923064573
4.670318109936958
11.958957617815475
 
4.009755021609629
8.60835497927766
13.571090920207169
10.293562001049272
6.650017869424108
8.7837017375

In [72]:
smallest_score = list_of_scores[sums_of_distributions.index(min(sums_of_distributions))]
while sum(smallest_score.keys()) >= 5:
    keys_list = sorted(smallest_score.keys())
    largest_key = keys_list[-1]
    second_largest_key = keys_list[-2]
    largest = smallest_score[largest_key]
    second_largest = smallest_score[second_largest_key]
    
    print(f"Algsed skoorid on {keys_list[-1]} ning {keys_list[-2]}.")
    largest, second_largest = improve_scores(largest, second_largest)
    score_after_improving_largest = calculate_score(largest)
    score_after_improving_second_largest = calculate_score(second_largest)
    print(f"Skoorid pärast parandamist on {score_after_improving_largest} ning {score_after_improving_second_largest}.")
    del smallest_score[largest_key]
    del smallest_score[second_largest_key]
    smallest_score[score_after_improving_largest] = largest
    smallest_score[score_after_improving_second_largest] = second_largest

Algsed skoorid on 12.424427655539487 ning 9.719299234720939.
(0, 0)
(0, 1)
(0, 2)
(0, 3)
(0, 4)
(0, 5)
(0, 6)
(0, 7)
(0, 8)
(0, 9)
(0, 10)
(0, 11)
(0, 12)
(0, 13)
(0, 14)
(0, 15)
(0, 16)
(0, 17)
(0, 18)
(0, 19)
(0, 20)
(0, 21)
(0, 22)
(0, 23)
(0, 24)
(0, 25)
(0, 26)


KeyboardInterrupt: 