In [2]:
import os
import json
import random
import sys

In [3]:
def read_protocols_tagged_by_hand(file = "k2sitsi_morfiga_protokollid.txt"):
    protocols_tagged_by_hand = []
    with open(file, 'r', encoding="UTF-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("\n", "")
        if line != "" and (line != "k2sitsi_morfiga_protokollid2:" and line != "k2sitsi_morfiga_protokollid1:"):
            protocols_tagged_by_hand.append(line)
    
    return protocols_tagged_by_hand

In [4]:
def calculate_whole_corpus_distribution(filenames = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in filenames:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

In [5]:
def find_document_ne_statistics(filenames = os.listdir("vallakohtufailid_json/")):
    statistics = dict()
    for file in filenames:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

Statistics, the ideal distribution and the protocols tagged by hand:

In [6]:
statistics = find_document_ne_statistics()
ideal_distribution = calculate_whole_corpus_distribution()
protocols_tagged_by_hand = read_protocols_tagged_by_hand()

In [57]:
def improve_scores(largest, second_largest):
    for i in range(len(largest)):
        for j in range(len(second_largest)):
            score_largest_old = calculate_score(largest)
            score_second_largest_old = calculate_score(second_largest)
            largest, second_largest = swap_items_in_lists(largest, second_largest, i, j)
            score_largest_new = calculate_score(largest)
            score_second_largest_new = calculate_score(second_largest)

            if score_largest_old > score_largest_new and score_second_largest_old > score_second_largest_new:
                print(f"{i, j} Skoor paranes mõlema alamkorpuse lõikes {score_largest_old-score_largest_new+score_second_largest_old-score_second_largest_new} protsendipunkti võrra.")
            
            else:
                print(f"{i, j}")
                swap_items_in_lists(largest, second_largest, i, j)
    return largest, second_largest

In [67]:
def calculate_score(filenames):
    proportions = calculate_proportions(filenames)
    score = 0
    penalty = 0
    for file in protocols_tagged_by_hand:
        if (file + ".json") in filenames:
            penalty += 1
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    return score+penalty
        

In [53]:
def calculate_proportions(filenames):
    statistics_for_proportion = {}
    
    for file in filenames:
        statistics_for_proportion[file] = statistics[file]

    all_annotations = list()
    for item in statistics_for_proportion.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
                
    proportions = dict()
    for file in statistics_for_proportion:
        for key in statistics_for_proportion[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def n_even_chunks(filenames, n):
    files = []
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(filenames) / n)))
        files.append(filenames[last:current])
        last = current
    return files

def generate_random_division(filenames, n):
    random_distributions = []
    for i in range(n):
        filenames = random.sample(filenames, len(filenames))
        random_distributions.append(filenames)
    
    return random_distributions

def swap_items_in_lists(A, B, i, j):
    temp_b = B[j]
    B[j] = A[i]
    A[i] = temp_b
    return A, B

In [None]:
def calculate_score(filenames):
    proportions = calculate_proportions(filenames)
    score = 0
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

In [68]:
seed = 0
random.seed(seed)
print("Seed:", seed)

subdistributions = list()
random_distributions = generate_random_division(os.listdir("vallakohtufailid_json/"), 10)
for distribution in random_distributions:
    distribution = n_even_chunks(distribution, 6)
    subdistributions.append(distribution)
    
sums_of_distributions = list()
list_of_scores = list()

for distribution in subdistributions:
    scores = dict()
    for files in distribution:
        score = calculate_score(files)
        scores[score] = files
        print(score)
    list_of_scores.append(scores)
    print(f"Score of sub-corpus: {sum(scores.keys()) / len(scores.keys())}")
    print(" ")

for listofscores in list_of_scores:    
    sums_of_distributions.append(sum(listofscores.keys()) / len(listofscores.keys()))


Seed: 0
15.217010759280129
12.97244658028972
22.121278140885995
22.923169877640092
16.724447992371964
18.22348956932398
Score of sub-corpus: 18.030307153298647
 
24.754266552310437
17.71929923472094
17.42442765553949
13.278466852474672
15.443044432965984
18.783316869420872
Score of sub-corpus: 17.90047026623873
 
19.860460264550053
14.56496557924541
12.286362879817535
27.602432044028337
19.346528655721293
13.901320113343106
Score of sub-corpus: 17.927011589450956
 
15.883690587424145
20.7453359542359
21.622106233493376
21.501132669782503
17.248440301611474
11.319375738207539
Score of sub-corpus: 18.05334691412582
 
12.641409067001817
13.980952130963956
17.87272104884614
20.260491467612596
23.923130427722004
18.72941154968992
Score of sub-corpus: 17.90135261530607
 
18.733689752128917
14.088538439500839
19.889844468155207
26.294227222778986
18.682787425926875
10.827852103441948
Score of sub-corpus: 18.086156568655465
 
11.299134229558021
20.535587981279896
15.254008155968936
28.15988071

In [69]:
smallest_score = list_of_scores[sums_of_distributions.index(min(sums_of_distributions))]
while (sum(smallest_score.keys()) / len(smallest_score.keys())) >= 5:
    keys_list = sorted(smallest_score.keys())
    largest_key = keys_list[-1]
    second_largest_key = keys_list[-2]
    largest = smallest_score[largest_key]
    second_largest = smallest_score[second_largest_key]
    
    print(f"Algsed skoorid on {keys_list[-1]} ning {keys_list[-2]}.")
    largest, second_largest = improve_scores(largest, second_largest)
    score_after_improving_largest = calculate_score(largest)
    score_after_improving_second_largest = calculate_score(second_largest)
    print(f"Skoorid pärast parandamist on {score_after_improving_largest} ning {score_after_improving_second_largest}.")
    del smallest_score[largest_key]
    del smallest_score[second_largest_key]
    smallest_score[score_after_improving_largest] = largest
    smallest_score[score_after_improving_second_largest] = second_largest

Algsed skoorid on 24.754266552310437 ning 18.783316869420872.
(0, 0)
(0, 1)
(0, 2)
(0, 3)
(0, 4)
(0, 5)
(0, 6)
(0, 7)
(0, 8)
(0, 9)
(0, 10)
(0, 11)
(0, 12)
(0, 13)
(0, 14)
(0, 15)
(0, 16)
(0, 17)
(0, 18)
(0, 19)
(0, 20)
(0, 21)
(0, 22)
(0, 23)
(0, 24)
(0, 25)
(0, 26)
(0, 27)
(0, 28)
(0, 29)
(0, 30)
(0, 31)
(0, 32)
(0, 33)
(0, 34)
(0, 35)
(0, 36)
(0, 37)
(0, 38)
(0, 39)
(0, 40)
(0, 41)
(0, 42)
(0, 43)
(0, 44)
(0, 45)
(0, 46)
(0, 47)
(0, 48)
(0, 49)
(0, 50)
(0, 51)
(0, 52)
(0, 53)
(0, 54)
(0, 55)
(0, 56)
(0, 57)
(0, 58)
(0, 59)
(0, 60)
(0, 61)
(0, 62)
(0, 63)
(0, 64)
(0, 65)
(0, 66)
(0, 67)
(0, 68)
(0, 69)
(0, 70)
(0, 71)
(0, 72)
(0, 73)
(0, 74)
(0, 75)
(0, 76)
(0, 77)
(0, 78)
(0, 79)
(0, 80)
(0, 81)
(0, 82)
(0, 83)
(0, 84)
(0, 85)
(0, 86)
(0, 87)
(0, 88)
(0, 89)
(0, 90)
(0, 91)
(0, 92)
(0, 93)
(0, 94)
(0, 95)
(0, 96)
(0, 97)
(0, 98)
(0, 99)
(0, 100)
(0, 101)
(0, 102)
(0, 103)
(0, 104)
(0, 105)
(0, 106)
(0, 107)
(0, 108)
(0, 109)
(0, 110)
(0, 111)
(0, 112)
(0, 113)
(0, 114)
(0, 115)
(0, 

KeyboardInterrupt: 