In [2]:
import os
import json
import random
import sys

In [3]:
def read_protocols_tagged_by_hand(file = "k2sitsi_morfiga_protokollid.txt"):
    protocols_tagged_by_hand = []
    with open(file, 'r', encoding="UTF-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("\n", "")
        if line != "" and (line != "k2sitsi_morfiga_protokollid2:" and line != "k2sitsi_morfiga_protokollid1:"):
            protocols_tagged_by_hand.append(line)
    
    return protocols_tagged_by_hand

In [4]:
def calculate_whole_corpus_distribution(filenames = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in filenames:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

In [5]:
def find_document_ne_statistics(filenames = os.listdir("vallakohtufailid_json/")):
    statistics = dict()
    for file in filenames:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

In [6]:
statistics = find_document_ne_statistics()
ideal_distribution = calculate_whole_corpus_distribution()
protocols_tagged_by_hand = read_protocols_tagged_by_hand()

In [22]:
def improve_scores(largest, second_largest):
    for i, x in enumerate(largest):
        for j, y in enumerate(second_largest):   
            score_largest_old = calculate_score(largest)
            largest, second_largest = swap_items_in_lists(largest, second_largest, i, j)
            score_largest_new = calculate_score(largest)

            if score_largest_old < score_largest_new:
                print("Skoor halvenes, algne:", score_largest_old, "hiljem:", score_largest_new)
                swap_items_in_lists(largest, second_largest, i, j)
            elif score_largest_old == score_largest_new:
                print("Skoor jäi samaks, algne:", score_largest_old, "hiljem:", score_largest_new)
                swap_items_in_lists(largest, second_largest, i, j)
            else:
                print("Skoor paranes, algne:", score_largest_old, "hiljem:", score_largest_new)
                continue
    return largest, second_largest

In [30]:
def calculate_proportions(filenames):
    statistics_for_proportion = {}
    
    for file in filenames:
        statistics_for_proportion[file] = statistics[file]

    all_annotations = list()
    for item in statistics_for_proportion.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
                
    proportions = dict()
    for file in statistics_for_proportion:
        for key in statistics_for_proportion[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def calculate_score(filenames):
    proportions = calculate_proportions(filenames)
    score = 0
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

def n_even_chunks(filenames, n):
    files = []
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(filenames) / n)))
        files.append(filenames[last:current])
        last = current
    return files

def generate_random_division(filenames, n):
    random_distributions = []
    for i in range(n):
        filenames = random.sample(filenames, len(filenames))
        random_distributions.append(filenames)
    
    return random_distributions

def swap_items_in_lists(A, B, i, j):
    temp_b = B[j]
    B[j] = A[i]
    A[i] = temp_b
    return A,B


seed = 0
random.seed(seed)
print("Seed:", seed)

subdistributions = list()
random_distributions = generate_random_division(os.listdir("vallakohtufailid_json/"), 10)
for random_distribution in random_distributions:
    random_distribution = n_even_chunks(random_distribution, 6)
    subdistributions.append(random_distribution)
scores = dict()

counter = 1
for distribution in subdistributions:
    print(f"Subdistribution number {counter}:")
    y = 1
    for files in distribution:
        print(f"Subdistribution {y}:")
        score = calculate_score(files)
        scores[score] = files
        print(score)
        print(" ")
        y += 1

keys_list = sorted(scores.keys())
largest = scores[keys_list[-1]]
second_largest = scores[keys_list[-2]]
print(f"Algsed skoorid on {keys_list[-1]} ning {keys_list[-2]}.")
largest, second_largest = improve_scores(largest, second_largest)
print(f"Skoorid pärast parandamist on {calculate_score(largest)} ning {calculate_score(second_largest)}.")

Seed: 0
Subdistribution 1:
7.217010759280128
 
Subdistribution 2:
6.97244658028972
 
Subdistribution 3:
9.121278140885995
 
Subdistribution 4:
10.923169877640092
 
Subdistribution 5:
7.724447992371964
 
Subdistribution 6:
10.223489569323979
 
Subdistribution 1:
7.754266552310438
 
Subdistribution 2:
9.719299234720939
 
Subdistribution 3:
12.424427655539489
 
Subdistribution 4:
6.278466852474673
 
Subdistribution 5:
6.4430444329659835
 
Subdistribution 6:
8.783316869420872
 
Subdistribution 1:
8.860460264550053
 
Subdistribution 2:
6.564965579245409
 
Subdistribution 3:
5.286362879817535
 
Subdistribution 4:
14.602432044028337
 
Subdistribution 5:
10.346528655721295
 
Subdistribution 6:
5.901320113343106
 
Subdistribution 1:
9.883690587424145
 
Subdistribution 2:
10.745335954235902
 
Subdistribution 3:
8.622106233493378
 
Subdistribution 4:
9.501132669782503
 
Subdistribution 5:
8.248440301611476
 
Subdistribution 6:
5.319375738207539
 
Subdistribution 1:
4.641409067001818
 
Subdistribu

KeyboardInterrupt: 