In [8]:
import os
import json
import random
import sys

In [37]:
my_randoms = random.sample(range(0, 1500), 250)

def calculate_ideal_distribution(file_names = os.listdir("vallakohtufailid_json/")):
    all_annotations = []
    for file in file_names:
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                all_annotations.append(dictionary.get('annotations')[0].get('nertag'))
    per = all_annotations.count("PER") / len(all_annotations) * 100
    loc_org = all_annotations.count("LOC_ORG") / len(all_annotations) * 100
    loc = all_annotations.count("LOC") / len(all_annotations) * 100
    org = all_annotations.count("ORG") / len(all_annotations) * 100
    misc = all_annotations.count("MISC") / len(all_annotations) * 100

    ideal_distribution = {'PER': per, 'LOC_ORG' : loc_org, 'LOC' : loc, 'ORG' : org, 'MISC' : misc}
    
    return ideal_distribution

def corpus_divider(file_names):
    statistics = dict()
    for file in file_names:
        ner_annotations = []
        with open("vallakohtufailid_json/" + file, 'r', encoding="UTF-8)") as f:
            data = json.load(f)
            for dictionary in data.get('layers')[0].get('spans'):
                ner_annotations.append(dictionary.get('annotations')[0].get('nertag'))

        statistics_for_file = dict()
        for annotation in set(ner_annotations):
            statistics_for_file[str(annotation)] = ner_annotations.count(annotation)
        statistics[file] = statistics_for_file

    return statistics

def calculate_proportions(file_names):
    statistics = corpus_divider(file_names)
    all_annotations = list()
    for item in statistics.values():
        for key in item:
            appendable = key
            for i in range(0, item[key] + 1):
                all_annotations.append(appendable)
    proportions = dict()
    for file in statistics:
        for key in statistics[file].keys():
            proportion = all_annotations.count(key) / len(all_annotations) * 100
            proportions[key] = proportion
        
    return proportions

def calculate_score(file_names):
    score = 0
    proportions = calculate_proportions(file_names)
    ideal_distribution = calculate_ideal_distribution()
    
    for proportion in proportions:
        ideal_distribution_proportion = ideal_distribution[proportion]
        current_proportion = proportions[proportion]
        
        if current_proportion == ideal_distribution_proportion:
            score += 0
        else:
            score += abs(ideal_distribution_proportion - current_proportion)
    
    return score

def n_even_chunks(file_names, n):
    files = []
    
    last = 0
    for i in range(1, n+1):
        current = int(round(i* (len(file_names) / n)))
        files.append(file_names[last:current])
        last = current
        
    return files

def generate_random_division(file_names, n):
    random_files = []
    
    seed = random.randrange(sys.maxsize)
    rng = random.Random(seed)
    print("Seed:", seed)
    file_names = random.sample(file_names, len(file_names))
    
    files = n_even_chunks(file_names, n)

    return files

'''
while True:
    random_files = generate_random_division(os.listdir("vallakohtufailid_json/"), 6)
    scores = []
    for files in random_files:
        score = calculate_score(files)
        print(score)
        scores.append(score)
    if sum(scores) <= 5:
        print("Corpus was successful.")
        break
    else:
        print("Corpus overall score was", sum(scores))
        continue
'''

Seed: 6594043555020155273
11.801058897604428
10.28696216819561
6.119258395047798
10.945193206333673
6.929545442498068
5.900748325492877
Corpus overall score was 51.98276643517245
Seed: 8159237352668887331
9.550234347450013
7.225662042180833


KeyboardInterrupt: 