In [1]:
import os
import json
import pandas as pd
import numpy as np
from modules.extract_results import display_results_by_subdistribution, display_results_by_named_entity, \
                                    display_confusion_matrix, extract_results_to_txt_file

### Folders for models:

In [2]:
def get_results_from_directory(model_directory):
    """Load a model's saved results and return its two "Total" summaries.

    Reads ``models/<model_directory>/results.txt``, parses the content as
    JSON and hands the parsed object to ``display_results_by_subdistribution``
    and ``display_results_by_named_entity``.

    Args:
        model_directory: Path of the model folder, relative to ``models``.

    Returns:
        A 2-tuple: the ``"Total"`` entry of the by-subdistribution results,
        followed by the ``"Total"`` entry of the by-named-entity results.

    Raises:
        OSError: If the results file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    results_path = os.path.join('models', model_directory, 'results.txt')
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (the corpus index elsewhere in this notebook is read the same way).
    with open(results_path, encoding='utf-8') as file:
        results_json = json.load(file)
    # The file handle is no longer needed once the JSON has been parsed.
    return (
        display_results_by_subdistribution(results_json)["Total"],
        display_results_by_named_entity(results_json)["Total"],
    )

In [3]:
# Several model variants live in subfolders of this one parent directory.
GLOBAL_FEATURES_PARENT = 'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features'

# Model folders to evaluate, in the order they are reported.
model_directories = [
    'model_default_with_vabamorftagger',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    # NOTE(review): "gazzetteer" (double "z") is kept exactly as written; it is a path on disk.
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
]
model_directories += [
    os.path.join(GLOBAL_FEATURES_PARENT, variant)
    for variant in (
        'model_initial',
        'model_vabamorf_gazetteer',
        'model_vabamorf_gazetteer2',
        'model_vabamorf_gazetteer1and2',
        'model_gaz_loc',
        'model_gaz_loc_variants',
    )
]

# Both dictionaries are keyed by the model directory path.
totals_by_subdistribution, totals_by_named_entity = {}, {}
for model_directory in model_directories:
    (totals_by_subdistribution[model_directory],
     totals_by_named_entity[model_directory]) = get_results_from_directory(model_directory)

# Total values by model:

In [4]:
# Build the frame with the three score labels as its index, then flip it so
# that each model becomes one row.
subdistribution_totals = pd.DataFrame(
    totals_by_subdistribution,
    index=["Precision", "Recall", "F1-score"],
)
display(subdistribution_totals.transpose())

Unnamed: 0,Precision,Recall,F1-score
model_default_with_vabamorftagger,0.981581,0.967426,0.974452
model_local_features_without_morph,0.931129,0.934166,0.932645
model_morph_without_lemmas,0.958125,0.947599,0.952833
model_morph_with_lemmas,0.981885,0.967684,0.974733
model_morph_with_lemmas_and_sentences,0.980849,0.969358,0.97507
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.979129,0.966396,0.972721
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.98153,0.966997,0.974209
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.982489,0.970388,0.976401
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.98291,0.970044,0.976435
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.90572,0.873224,0.889175


# Total values by named entity:

In [5]:
# Transpose so that each model is shown as one row.
named_entity_totals = pd.DataFrame(totals_by_named_entity)
display(named_entity_totals.transpose())

Unnamed: 0,ORG_precision,ORG_recall,ORG_f1score,PER_precision,PER_recall,PER_f1score,MISC_precision,MISC_recall,MISC_f1score,LOC_precision,LOC_recall,LOC_f1score,LOC_ORG_precision,LOC_ORG_recall,LOC_ORG_f1score
model_default_with_vabamorftagger,0.954667,0.948718,0.951634,0.987743,0.979225,0.983463,0.905383,0.895248,0.900118,0.930151,0.882045,0.900778,0.945942,0.918423,0.930823
model_local_features_without_morph,0.780202,0.842736,0.809745,0.944977,0.953207,0.949059,0.868723,0.892174,0.879759,0.83548,0.81743,0.824245,0.868817,0.84558,0.856235
model_morph_without_lemmas,0.924486,0.925561,0.925006,0.971248,0.964937,0.968081,0.919888,0.900492,0.909458,0.86678,0.831877,0.846348,0.879564,0.862209,0.870475
model_morph_with_lemmas,0.950685,0.941026,0.945695,0.987755,0.979599,0.983658,0.894572,0.885248,0.889729,0.934541,0.882045,0.902083,0.949057,0.920591,0.933381
model_morph_with_lemmas_and_sentences,0.950685,0.941026,0.945695,0.987814,0.980312,0.984048,0.894572,0.885248,0.889729,0.929725,0.891229,0.907202,0.944978,0.927131,0.935472
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.952,0.946154,0.94902,0.986175,0.978732,0.982439,0.885193,0.865856,0.875275,0.926354,0.883562,0.900578,0.939765,0.914978,0.926241
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.957333,0.951282,0.954248,0.987458,0.978927,0.983173,0.899978,0.890248,0.894923,0.935166,0.881024,0.901478,0.945547,0.91728,0.929963
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.954054,0.946154,0.95,0.989511,0.982233,0.985859,0.910789,0.900248,0.905313,0.92563,0.884844,0.901209,0.944304,0.921838,0.93217
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.956757,0.948718,0.952632,0.989542,0.981879,0.985695,0.905383,0.895248,0.900118,0.930684,0.885106,0.903197,0.945454,0.921422,0.932363
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.783717,0.742041,0.762229,0.937769,0.925624,0.931649,0.760697,0.643635,0.694739,0.607616,0.45332,0.517476,0.735488,0.647228,0.687085


# Confusion matrices

In [12]:
# Read the corpus index; every non-blank line has the form "<filename>:<subdistribution>".
corpus_index_path = os.path.join('..', 'data', 'divided_corpus.txt')
with open(corpus_index_path, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# Collect only the filename part of each entry.
files = []
for line in lines:
    entry = line.strip()
    if not entry:
        # A blank line (e.g. a trailing empty line) has no ':' separator and
        # would make the two-name unpack below raise ValueError.
        continue
    # Still strict for non-blank lines: anything other than exactly one ':' raises.
    filename, _subdistribution = entry.split(':')
    files.append(filename)

In [16]:
# Print one cross-tabulation of the two label sequences per model.
for model in model_directories:
    y_true, y_pred = display_confusion_matrix(model, files)
    confusion_table = pd.crosstab(y_true, y_pred)
    # A single call that emits the model name, the table, and the blank
    # separator lines, each followed by a newline.
    print(model, confusion_table, '\n', sep='\n')

model_default_with_vabamorftagger
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        748       12     0    0      9
LOC_ORG      8     2140     0    0     18
MISC         0        0   188    0      8
ORG          0        2     0  326      0
PER          2        9     0    0  19140


model_local_features_without_morph
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        692       18     2    2     24
LOC_ORG     45     1972     0    5     30
MISC         0        0   187    0      1
ORG          0        3     0  290      0
PER          4       17     5    0  18626


model_morph_without_lemmas
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        705       28     0    3     18
LOC_ORG     39     2013     0    3     33
MISC         0        0   189    0      1
ORG          0        3     0  318      0
PER          5       19     2    0  18855


model_morph_with