In [14]:
import os
import json
import pandas as pd
import numpy as np
from modules.extract_results import display_results_by_subdistribution, display_results_by_named_entity, \
                                    display_confusion_matrix, extract_results_to_txt_file

### Folders for models:

In [15]:
def get_results_from_directory(model_directory):
    """Load the saved results of one model and return its "Total" entries.

    Reads ``models/<model_directory>/results.txt`` and parses its content as
    JSON.

    Args:
        model_directory: Directory of the model, relative to ``models``.

    Returns:
        A 2-tuple ``(by_subdistribution_total, by_named_entity_total)``: the
        ``"Total"`` item of ``display_results_by_subdistribution(...)`` and the
        ``"Total"`` item of ``display_results_by_named_entity(...)``, both
        computed from the parsed JSON.
    """
    results_path = os.path.join('models', model_directory, 'results.txt')
    with open(results_path) as results_file:
        parsed_results = json.load(results_file)
        subdistribution_total = display_results_by_subdistribution(parsed_results)["Total"]
        named_entity_total = display_results_by_named_entity(parsed_results)["Total"]
    return subdistribution_total, named_entity_total

In [16]:
# "Total" results of every model, keyed by the model's directory
# (relative to 'models').
totals_by_subdistribution = {}
totals_by_named_entity = {}

# Stand-alone models first, followed by the variants that live under the
# shared global-features parent directory.
# NOTE(review): 'gazzetteer' (double z) in the sixth entry differs from the
# 'gazetteer' spelling of the parent directory below; presumably it matches
# the directory name on disk - confirm before renaming.
model_directories = [
    'model_default_with_vabamorftagger',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
] + [
    os.path.join('model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features', variant)
    for variant in (
        'model_initial',
        'model_vabamorf_gazetteer',
        'model_vabamorf_gazetteer2',
        'model_vabamorf_gazetteer1and2',
        'model_gaz_loc',
        'model_gaz_loc_variants',
    )
]

# Fill both lookup tables with the totals of each model.
for model_directory in model_directories:
    by_subdistribution, by_named_entity = get_results_from_directory(model_directory)
    totals_by_subdistribution.update({model_directory: by_subdistribution})
    totals_by_named_entity.update({model_directory: by_named_entity})

# Total values by model:

In [17]:
# Transposed so that each model is one row and the three overall scores
# are the columns.
display(
    pd.DataFrame(
        totals_by_subdistribution,
        index=["Precision", "Recall", "F1-score"],
    ).transpose()
)

Unnamed: 0,Precision,Recall,F1-score
model_default_with_vabamorftagger,0.906627,0.87301,0.889501
model_local_features_without_morph,0.864699,0.832153,0.848114
model_morph_without_lemmas,0.86989,0.845586,0.857566
model_morph_with_lemmas,0.899843,0.860993,0.879989
model_morph_with_lemmas_and_sentences,0.900211,0.86065,0.879986
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.898827,0.864727,0.881447
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.905881,0.873224,0.889253
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.905739,0.87301,0.889073
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.905991,0.871036,0.888169
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.90572,0.873224,0.889175


# Total values by named entity:

In [18]:
# Transposed so that each model is one row and the per-entity scores
# are the columns.
display(
    pd.DataFrame(totals_by_named_entity).transpose()
)

Unnamed: 0,ORG_precision,ORG_recall,ORG_f1score,PER_precision,PER_recall,PER_f1score,MISC_precision,MISC_recall,MISC_f1score,LOC_precision,LOC_recall,LOC_f1score,LOC_ORG_precision,LOC_ORG_recall,LOC_ORG_f1score
model_default_with_vabamorftagger,0.782784,0.738998,0.760079,0.935971,0.92444,0.930162,0.754867,0.62946,0.684197,0.619732,0.45509,0.523479,0.753337,0.654928,0.698896
model_local_features_without_morph,0.687798,0.693405,0.689991,0.902675,0.887524,0.895002,0.711702,0.652229,0.676465,0.491394,0.412734,0.447676,0.68041,0.572801,0.620993
model_morph_without_lemmas,0.708073,0.694753,0.700954,0.910621,0.899791,0.905156,0.719368,0.624728,0.665965,0.498547,0.417352,0.452484,0.666364,0.615352,0.637321
model_morph_with_lemmas,0.764448,0.717828,0.740377,0.931687,0.91343,0.922458,0.730591,0.620904,0.668685,0.595758,0.441284,0.505341,0.73666,0.638451,0.682786
model_morph_with_lemmas_and_sentences,0.778933,0.722055,0.749319,0.932308,0.912425,0.922235,0.731492,0.622585,0.669542,0.588593,0.44318,0.505245,0.735771,0.644876,0.685607
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.785898,0.737153,0.760706,0.930386,0.915206,0.922721,0.749587,0.640533,0.688006,0.598201,0.45653,0.516861,0.734767,0.649618,0.688142
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.777907,0.738891,0.757856,0.936061,0.925076,0.930529,0.74329,0.621258,0.674346,0.614576,0.458052,0.522888,0.74915,0.64998,0.694644
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.787554,0.736738,0.761158,0.937599,0.92515,0.931325,0.76658,0.64935,0.700536,0.607115,0.455062,0.518226,0.737148,0.649108,0.688855
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.787322,0.739302,0.762462,0.937579,0.923938,0.930705,0.755292,0.638635,0.689544,0.611494,0.449693,0.516467,0.735932,0.643335,0.684771
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.783717,0.742041,0.762229,0.937769,0.925624,0.931649,0.760697,0.643635,0.694739,0.607616,0.45332,0.517476,0.735488,0.647228,0.687085


# Confusion matrices

In [19]:
# Collect the file names listed in the corpus division file. Every non-blank
# line is unpacked as "<filename>:<subdistribution>"; only the file name is
# kept.
files = []
with open(os.path.join('..', 'data', 'divided_corpus.txt'), 'r', encoding='UTF-8') as f:
    lines = f.readlines()

for line in lines:
    if not line.strip():
        # A blank line (e.g. a trailing empty line at the end of the file)
        # has no ':' separator and would make the unpacking below raise
        # ValueError, so it is skipped.
        continue
    filename, subdistribution = line.strip().split(':')
    files.append(filename)

In [20]:
# Print, for every model, its directory name followed by a cross-tabulation
# of the two label sequences returned by display_confusion_matrix.
for model in model_directories:
    y_true, y_pred = display_confusion_matrix(model, files)
    print(model)
    print(pd.crosstab(index=y_true, columns=y_pred))
    # Two newlines in total, exactly what print('\n') emits.
    print(end='\n\n')

model_default_with_vabamorftagger
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        395       58     0    0     34
LOC_ORG     48     1501     2    3     86
MISC         3        1   131    0     11
ORG          0        3     0  256      0
PER          6       17     1    0  18059


model_local_features_without_morph
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        358       81     2    3     48
LOC_ORG     81     1315     2    6     80
MISC         6        0   135    1      8
ORG          1        5     0  239      1
PER         20       48     6    0  17343


model_morph_without_lemmas
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        362      102     1    4     57
LOC_ORG     68     1398     2    7    112
MISC         4        0   130    1      6
ORG          0        7     0  241      0
PER         21       92     2    0  17572


model_morph_with