# Find the union between the corpus and the gazetteer

An experiment to determine how big of an impact the gazetteer has on the test corpus' score.

In [1]:
import os

from estnltk.converters import json_to_text

In [None]:
divided_corpus = os.path.join('..', 'data', 'divided_corpus.txt')
json_files_location = os.path.join('..', 'data', 'vallakohtufailid-json-flattened')
no_goldstandard_tags_location = os.path.join('..', 'data', 'files_without_goldstandard_annotations.txt')
testing_files_location = os.path.join('..', 'data', 'vallakohtufailid-json-flattened')
gazetteer_file_location = os.path.join('models', 'gazetteer_both_lowercase_added_loc_and_variants.txt')
sixth_subdistribution_of_files = os.path.join('..', 'data', 'corpus_subdistribution_without_hand_tagged.txt')

removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']

### Find all names in best model gazetteer:

In [2]:
with open(gazetteer_file_location, 'r', encoding='UTF-8') as in_gaz:
    lines = in_gaz.readlines()

names = [line.split('\t')[:-1][0] for line in lines]

### Find all names (named entities) in goldstandard corpus:

In [46]:
# Initial divided corpus
dist_corpus = dict()

with open(divided_corpus, 'r', encoding='UTF-8') as in_corpus:
    lines = in_corpus.readlines()

with open(sixth_subdistribution_of_files, 'r', encoding='UTF-8') as in_corpus:
    for line in in_corpus.readlines():
        lines.append(line)

for line in lines:
    dist_corpus[line.split(':')[0]] = line.split(':')[1].strip()

In [47]:
corpus = dict()
test_corpus = dict()

for file in dist_corpus:
    with open(os.path.join(testing_files_location, str(file)), 'r', encoding='UTF-8') as in_file:
        if dist_corpus[file] == '6':
            test_corpus[str(file)] = [name.text.lower() for name in json_to_text(in_file.read()).gold_ner]
        else:
            corpus[str(file)] = [name.text.lower() for name in json_to_text(in_file.read()).gold_ner]

In [48]:
common_names = list()

for file in test_corpus:
    intersection = list(set(test_corpus[file]).intersection(names))
    for name in intersection:
        common_names.append(name)

In [49]:
all_names_length = len([name for name in test_corpus[file] for file in test_corpus])
common_names_length = len(common_names)

### How many names do the test corpus and gazzetteer have in common:

In [50]:
print(round((common_names_length / all_names_length), 2))

0.24


Approximately 24% of the names found in the test corpus are also present in the gazetteer.

### How many names do the training corpus and test corpus have in common:

In [51]:
test_corpus_names = list()
for file in test_corpus:
    for name in test_corpus[file]:
        test_corpus_names.append(name)

In [52]:
training_corpus_names = list()
for file in corpus:
    for name in corpus[file]:
        training_corpus_names.append(name)

In [54]:
common_names_test_and_training = len(list(set(test_corpus_names).intersection(training_corpus_names)))

In [55]:
print(round((common_names_test_and_training / all_names_length), 2))

0.2


Approximately 20% of the names that are in the training corpus are also found in the test corpus.

### How many names do the training corpus and gazetteer have in common:

In [56]:
common_names_training_and_gaz = len(list(set(training_corpus_names).intersection(names)))
print(round((common_names_training_and_gaz / len(training_corpus_names)), 2))

0.09


Approximately 9% of the names in the training corpus are also present in the gazetteer.