In [1]:
import os
import json
import random
import sys

In [8]:
def calculate_whole_corpus_distribution(file_names = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in file_names:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

def find_document_ne_statistics(file_names):
    statistics = dict()
    for file in file_names:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

def calculate_proportions(file_names):
    all_annotations = list()
    for item in statistics.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
    proportions = dict()
    for file in statistics:
        for key in statistics[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def calculate_score(file_names):
    score = 0
    proportions = calculate_proportions(file_names)
    
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

def n_even_chunks(file_names, n):
    files = []
    
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(file_names) / n)))
        files.append(file_names[last:current])
        last = current
        
    return files

def generate_random_division(file_names, n):
    random_files = []
    file_names = random.sample(file_names, len(file_names))
    files = n_even_chunks(file_names, n)
    return files

def swap_items_in_lists(A, B, i, j):
    temp_b = B[j]
    B[j] = A[i]
    A[i] = temp_b
    return A,B

ideal_distribution = calculate_whole_corpus_distribution()

scores = dict()
seed = 0
print("Seed:", seed)

random_files = generate_random_division(os.listdir("vallakohtufailid_json/"), 6)

for files in random_files:
    statistics = find_document_ne_statistics(files)
    score = calculate_score(files)
    print(score)
    scores[score] = files

print("Sum of scores:", sum(scores.keys()))

scores_list = sorted(scores.keys())
largest = scores[scores_list[-1]]
second_largest = scores[scores_list[-2]]

for i in range(len(largest)):
    for j in range(len(second_largest)):        
        score_largest_old = calculate_score(largest)
        score_second_largest_old = calculate_score(second_largest)
        
        largest, second_largest = swap_items_in_lists(largest, second_largest, i, j)
        calculated_new = calculate_score(second_largest)
        if calculated_new < score_second_largest_old:
            print("skoor paranes")
            largest = new_largest
            second_largest = new_second_largest
            continue
        else:
            print("skoor halvenes")

Seed: 0
10.885864874911181
4.530969432546259
8.109707434865065
6.948783279790259
13.327601554229227
8.30538940063266
Sum of scores: 52.10831597697465
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halv

skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halvenes
skoor halv

KeyboardInterrupt: 