In [1]:
import os
import json
import random

from modules.corpus_division import get_filenames,\
                                    get_hand_tagged, \
                                    get_labels_distribution, \
                                    get_documents_ne_statistics, \
                                    swap_items_in_lists, \
                                    calculate_proportions, \
                                    improve_scores, \
                                    remove_hand_tagged, \
                                    calculate_score, \
                                    n_even_chunks, \
                                    generate_random_division

In [2]:
# Relative path of the folder that holds the protocols
vallakohtufailid_location = os.path.join(
    '..',
    'data',
    'vallakohtufailid',
)

# Names of the sub-directories in which the protocol files are stored
directories = [
    "vallakohus_esimene",
    "vallakohus_teine",
    "vallakohus_kolmas",
    "vallakohus_neljas",
]

In [3]:
# Path of the text file that lists the hand-tagged protocols (contains id-s).
# It is read further down by get_hand_tagged() to find which corpus files were tagged by hand.
hand_tagged_protocols_file = os.path.join('..', 'data', 'k2sitsi_morfiga_protokollid.txt')

In [4]:
# Directory with the files that carry the gold-standard tags (collected with the first tool).
# The statistics helpers read from it, and its (sorted) listing is what gets split into
# random subdistributions later in the notebook.
gold_standard_files = os.path.join('..', 'data', 'vallakohtufailid-json-flattened')

In [24]:
# Output path of the divided corpus file: the notebook writes one
# `<file name>:<subdistribution number>` line per file of `final_files` into it.
divided_corpus = os.path.join('..', 'data', 'divided_corpus.txt')

In [25]:
# Output path for the one subdistribution that has had its hand-tagged protocols swapped out.
# NOTE: despite the variable name, this is NOT a list of the hand-tagged files; the notebook
# writes `least_hand_tagged` (the subdistribution without hand-tagged files) into it.
hand_tagged = os.path.join('..', 'data', 'corpus_subdistribution_without_hand_tagged.txt')

In [5]:
# Get all names of files in the corpus
# (collected from the sub-directories listed in `directories` under `vallakohtufailid_location`)
filenames = get_filenames(vallakohtufailid_location, directories)

# Get labels and the number of that label from all documents
# (passed to calculate_score() for every subdistribution below)
statistics = get_documents_ne_statistics(filenames, gold_standard_files)

# Get the distribution of labels over the whole corpus
# (passed to calculate_score() together with `statistics`)
ideal_distribution = get_labels_distribution(filenames, gold_standard_files)

# Get protocols that are tagged by hand
# (used below to count hand-tagged files per subdistribution and to swap them out of one of them)
protocols_tagged_by_hand = get_hand_tagged(hand_tagged_protocols_file, filenames)

### Calculate the scores of subdistributions generated by the script

In [22]:
# Seed Python's `random` module so that repeated runs produce the same output
# (presumably this is what generate_random_division draws from - confirm in modules.corpus_division)
seed = 1
random.seed(seed)
print(f'Seed: {seed}')

# Give a number of random divisions to be generated from the files
number_of_random_divisions = 25

# Give a number of how many even chunks to create from the files
number_of_even_chunks = 6

# Generate random distributions and divide them into subdistributions.
# os.listdir() returns entries in arbitrary order, so the listing is sorted first
# to give the division step the same input order on every machine.
random_distributions = generate_random_division(sorted(os.listdir(gold_standard_files)), number_of_random_divisions)
subdistributions = [n_even_chunks(distribution, number_of_even_chunks) for distribution in random_distributions]

# Both lists are indexed by random division:
#   list_of_scores[i]        - dict mapping (chunk score, chunk position) -> files of that chunk
#   sums_of_distributions[i] - mean chunk score of division i
list_of_scores = list()
sums_of_distributions = list()

for distribution in subdistributions:
    scores = dict()
    chunk_scores = list()
    for position, files in enumerate(distribution):
        score = calculate_score(files, statistics, ideal_distribution)
        # Key on (score, position) instead of the bare score: two chunks that happen to get
        # the same score would otherwise overwrite each other, silently dropping one chunk's
        # files and skewing the mean below.
        scores[(score, position)] = files
        chunk_scores.append(score)
        print(score)
    list_of_scores.append(scores)
    # Mean over every chunk of this division (an empty division raises ZeroDivisionError)
    sub_corpus_score = sum(chunk_scores) / len(chunk_scores)
    print(f'Sub-corpus score: {sub_corpus_score}\n')
    sums_of_distributions.append(sub_corpus_score)

Seed: 1
6.207355815198964
11.874719943138807
11.39658420239689
9.17197059987578
7.268100311057957
5.7046876800153585
Sub-corpus score: 8.603903091947293

9.9732211280256
9.456529460492243
6.053995218755706
9.769595602711254
8.059186637618012
9.10155524753753
Sub-corpus score: 8.735680549190057

8.91464336784733
11.537179912708115
10.662136280342123
8.995510057232245
7.9614464877588595
4.567379134442525
Sub-corpus score: 8.773049206721867

6.610682850010091
4.410398036932767
14.603704211547347
9.359958462105022
9.36324726896274
8.198438151955253
Sub-corpus score: 8.757738163585536

9.002350320924412
8.496119409176766
4.983825179903615
9.818771068405251
12.538850800304711
7.734536626649881
Sub-corpus score: 8.762408900894107

5.625934661383683
8.289162864722709
4.899051352860271
12.658019940409545
9.364925895845962
11.674180489944426
Sub-corpus score: 8.751879200861099

11.088417032091648
10.51824569911324
8.806274374536615
6.7418000338537
6.471232066336358
8.209653404088723
Sub-corpus s

In [23]:
# Pick the random division with the smallest mean score
# (list_of_scores and sums_of_distributions are indexed the same way)
lowest_mean_score = min(sums_of_distributions)
smallest_score = list_of_scores[sums_of_distributions.index(lowest_mean_score)]

# The subdistributions of that division and the number of hand-tagged protocols in each
files_to_use = list(smallest_score.values())
number_of_hand_tagged_protocols = [
    len(set(chunk).intersection(protocols_tagged_by_hand))
    for chunk in smallest_score.values()
]

# The subdistribution with the fewest hand-tagged protocols (the first one on a tie)
fewest_hand_tagged = min(number_of_hand_tagged_protocols)
least_hand_tagged_old = files_to_use[number_of_hand_tagged_protocols.index(fewest_hand_tagged)]

# Every subdistribution except the one picked above
other_files = list(files_to_use)
other_files.remove(least_hand_tagged_old)

# Remove hand-tagged files from least_hand_tagged by exchanging files from other subdistributions
least_hand_tagged, other_files = remove_hand_tagged(
    least_hand_tagged_old,
    other_files,
    statistics,
    ideal_distribution,
    protocols_tagged_by_hand,
)

# The remaining subdistributions, i.e. all except the one without hand-tagged files
final_files = list(other_files)

1 V6ru_R2pina_Kahkva_id24674_1868a.json
(!) Changed out file Tartu_R6ngu_Aakre_id12415_1827a.json
2 J2rva_Peetri_V2ike-Kareda_id22448_1881a.json
(!) Changed out file Saare_Kihelkonna_Atla_id7551_1875a.json
3 L22ne_Martna_Martna_id14205_1869a.json
(!) Changed out file Viru_Haljala_Vihula_id8056_1877a.json


In [27]:
# Write txt files
# Every line has the form `<file name>:<subdistribution number>`; numbering starts at 1.
with open(divided_corpus, 'w', encoding='UTF-8') as out_f:
    for subdistribution_number, files in enumerate(final_files, start=1):
        for f in files:
            out_f.write(f'{f}:{subdistribution_number}\n')

# The subdistribution without hand-tagged files gets the number that follows the ones written
# above. It is computed explicitly instead of relying on a counter left over from the loop.
hand_tagged_free_number = len(final_files) + 1
with open(hand_tagged, 'w', encoding='UTF-8') as out_f:
    for f in least_hand_tagged:
        out_f.write(f'{f}:{hand_tagged_free_number}\n')

In [30]:
# Report, for every subdistribution, how many hand-tagged files it holds and what it scores.
# The subdistribution without hand-tagged files is reported last.
subdistributions_to_report = [*final_files, least_hand_tagged]
last_position = len(subdistributions_to_report) - 1

for position, subdistribution in enumerate(subdistributions_to_report):
    hand_tagged_count = len(set(subdistribution).intersection(protocols_tagged_by_hand))
    print(f'Hand-tagged files in the subdistribution: {hand_tagged_count}')

    subdistribution_score = calculate_score(subdistribution, statistics, ideal_distribution)
    # A blank line separates the reports; nothing follows the last one
    separator = '\n' if position < last_position else ''
    print(f'Subdistribution score: {subdistribution_score}{separator}')

Hand-tagged files in the subdistribution: 13
Subdistribution score: 6.047574351069665

Hand-tagged files in the subdistribution: 13
Subdistribution score: 7.944807552650694

Hand-tagged files in the subdistribution: 8
Subdistribution score: 9.63458779587663

Hand-tagged files in the subdistribution: 10
Subdistribution score: 9.392619000462133

Hand-tagged files in the subdistribution: 12
Subdistribution score: 9.92939691555241

Hand-tagged files in the subdistribution: 0
Subdistribution score: 8.203464206801717
