In [1]:
import os
import json
import pandas as pd

from modules.results_extraction import results_by_subdistribution, results_by_named_entity, confusion_matrix, extract_results

### Folders for models:

In [2]:
def get_results_from_directory(model_directory):
    """Load one model's saved results and return its overall ("Total") scores.

    Reads ``models/<model_directory>/results.txt``, parses it as JSON and
    hands the parsed object to the two aggregation helpers.

    Args:
        model_directory: Path of the model's folder, relative to ``models``.

    Returns:
        A 2-tuple: the ``"Total"`` entry of ``results_by_subdistribution(...)``
        followed by the ``"Total"`` entry of ``results_by_named_entity(...)``.

    Raises:
        OSError: If the results file cannot be opened.
        ValueError: If the file content is not valid JSON
            (``json.JSONDecodeError`` is a subclass).
    """
    results_path = os.path.join('models', model_directory, 'results.txt')
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding; this matches how the corpus file is read elsewhere.
    with open(results_path, encoding='utf-8') as file:
        results_json = json.load(file)
    # The file is already closed here; aggregation only needs the parsed data.
    return (
        results_by_subdistribution(results_json)["Total"],
        results_by_named_entity(results_json)["Total"],
    )

In [3]:
# Model folders (relative to 'models') whose results are compared below.
# NOTE(review): 'gazzetteer' in the sixth entry is spelled differently from
# 'gazetteer' in the nested folder name; both strings are paths and are
# presumably the on-disk names, so they are kept verbatim.
model_directories = [
    'model_default_with_vabamorftagger',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
] + [
    # Variants stored as sub-folders of the global-features model folder.
    os.path.join('model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features', variant)
    for variant in (
        'model_initial',
        'model_vabamorf_gazetteer',
        'model_vabamorf_gazetteer2',
        'model_vabamorf_gazetteer1and2',
        'model_gaz_loc',
        'model_gaz_loc_variants',
    )
]

# "Total" results per model, keyed by the model directory string.
totals_by_subdistribution, totals_by_named_entity = {}, {}

for model_directory in model_directories:
    # The helper returns (by-subdistribution total, by-named-entity total).
    (totals_by_subdistribution[model_directory],
     totals_by_named_entity[model_directory]) = get_results_from_directory(model_directory)

# Total values by model:

In [4]:
# One row per model, one column per metric (the frame is transposed so that
# the model directories become the row labels).
display(
    pd.DataFrame(
        totals_by_subdistribution,
        index=["Precision", "Recall", "F1-score"],
    ).transpose()
)

Unnamed: 0,Precision,Recall,F1-score
model_default_with_vabamorftagger,0.895241,0.877645,0.886356
model_local_features_without_morph,0.866032,0.832024,0.848688
model_morph_without_lemmas,0.872944,0.849491,0.861058
model_morph_with_lemmas,0.900823,0.864212,0.882138
model_morph_with_lemmas_and_sentences,0.898241,0.863354,0.880452
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.899732,0.864169,0.881592
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.906471,0.874726,0.890316
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.894125,0.872023,0.882936
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.892718,0.873868,0.883192
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.895267,0.873482,0.88424


# Total values by named entity:

In [5]:
# One row per model; the columns come from each model's per-entity totals
# (the frame is transposed so that the model directories become the rows).
display(
    pd.DataFrame(totals_by_named_entity).transpose()
)

Unnamed: 0,ORG_precision,ORG_recall,ORG_f1score,PER_precision,PER_recall,PER_f1score,MISC_precision,MISC_recall,MISC_f1score,LOC_precision,LOC_recall,LOC_f1score,LOC_ORG_precision,LOC_ORG_recall,LOC_ORG_f1score
model_default_with_vabamorftagger,0.758804,0.727751,0.74275,0.929349,0.929144,0.929243,0.720904,0.644348,0.676572,0.582911,0.47735,0.523924,0.724264,0.65516,0.6861
model_local_features_without_morph,0.691862,0.655949,0.673176,0.903203,0.886713,0.89485,0.697559,0.620446,0.654738,0.502194,0.399449,0.44414,0.68027,0.584712,0.628686
model_morph_without_lemmas,0.707799,0.71819,0.712663,0.915546,0.903981,0.909711,0.713167,0.628644,0.667228,0.489296,0.417202,0.449118,0.663418,0.606,0.63284
model_morph_with_lemmas,0.771874,0.724954,0.747674,0.93247,0.915016,0.923653,0.757389,0.646373,0.694754,0.590088,0.44383,0.505122,0.741071,0.654897,0.693405
model_morph_with_lemmas_and_sentences,0.774965,0.721817,0.747352,0.931167,0.914358,0.922656,0.742768,0.629942,0.678993,0.582207,0.43871,0.499548,0.731495,0.648276,0.6865
model_morph_with_lemmas_and_sentences_and_gazzetteer,0.768066,0.731199,0.749111,0.930704,0.9146,0.922562,0.768535,0.658016,0.706024,0.609346,0.452966,0.51838,0.738116,0.649728,0.689819
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_initial,0.779708,0.731938,0.754803,0.936759,0.925916,0.931296,0.746169,0.634824,0.682552,0.610534,0.444279,0.513123,0.751578,0.663236,0.703114
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer,0.76176,0.716802,0.738541,0.929334,0.925725,0.927522,0.735716,0.626137,0.673706,0.568503,0.453444,0.503676,0.710719,0.642592,0.672799
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer2,0.769924,0.715685,0.741532,0.927331,0.927954,0.927639,0.73127,0.6203,0.668464,0.560924,0.446979,0.496365,0.715056,0.642249,0.675313
model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features\model_vabamorf_gazetteer1and2,0.77003,0.716511,0.742141,0.930236,0.926599,0.928406,0.756124,0.6453,0.692817,0.574537,0.463416,0.512352,0.714559,0.64248,0.675118


# Confusion matrices

In [6]:
# Collect the corpus file names listed in ../data/divided_corpus.txt.
# Each non-blank line is expected to have the form "<filename>:<subdistribution>".
with open(os.path.join('..', 'data', 'divided_corpus.txt'), 'r', encoding='UTF-8') as f:
    lines = f.readlines()

files = []
for line in lines:
    entry = line.strip()
    if not entry:
        # Tolerate blank lines (e.g. trailing newlines at the end of the
        # file) instead of failing on the two-name unpacking below.
        continue
    # Still raises ValueError for a malformed line (no colon / several colons).
    # Only the file name is needed here; the subdistribution part is unused.
    filename, subdistribution = entry.split(':')
    files.append(filename)

In [7]:
# Print one confusion matrix (actual vs. predicted labels) per model.
for model in model_directories:
    y_true, y_pred = confusion_matrix(model, files)
    print(model)
    crosstab_table = pd.crosstab(y_true, y_pred)
    # sep='\n' plus the default line end yields the same two blank lines
    # after the table as a separate print('\n') would.
    print(crosstab_table, '\n', sep='\n')

model_default_with_vabamorftagger
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        414       70     0    0     46
LOC_ORG     52     1501     2    1    100
MISC         3        3   134    0     10
ORG          0        4     0  252      0
PER          8       24     1    0  18149


model_local_features_without_morph
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        347       97     1    3     44
LOC_ORG     77     1364     1    2     90
MISC         5        1   129    1      9
ORG          0        4     0  226      1
PER         17       63     5    0  17321


model_morph_without_lemmas
Predicted  LOC  LOC_ORG  MISC  ORG    PER
Actual                                   
LOC        359      102     0    5     52
LOC_ORG     73     1395     2    7    122
MISC         5        1   131    1      7
ORG          1        9     0  249      1
PER         17       73     3    0  17660


model_morph_with