In [1]:
import os
import json
import pandas as pd
import numpy as np
from modules.extract_results import display_results_by_subdistribution, display_results_by_named_entity, \
                                    display_confusion_matrix, extract_results_to_txt_file
import matplotlib.pyplot as plt

In [2]:
def format_vertical_headers(df):
    """Display a dataframe with vertical column headers"""
    styles = [dict(selector="th", props=[('width', '40px')]),
              dict(selector="th.col_heading",
                   props=[("writing-mode", "vertical-rl"),
                          ('transform', 'rotateZ(180deg)'), 
                          ('height', '290px'),
                          ('vertical-align', 'top')])]
    return (df.fillna('').style.set_table_styles(styles))

### Folders for models:

The models in the `models` variable use these taggers:
1. `model_default` uses NerMorphFeatureTagger, NerLocalFeatureTagger, NerSentenceFeatureTagger, NerGazetteerFeatureTagger and NerGlobalContextFeatureTagger.
2. `model_local_features_without_morph` uses NerEmptyFeatureTagger and NerLocalFeatureWithoutMorphTagger.
3. `model_morph_without_lemmas` uses NerEmptyFeatureTagger, NerLocalFeatureWithoutMorphTagger and NerMorphNoLemmasFeatureTagger.
4. `model_morph_with_lemmas` uses NerEmptyFeatureTagger, NerLocalFeatureWithoutMorphTagger and NerBasicMorphFeatureTagger.
5. `model_morph_with_lemmas_and_sentences` uses NerEmptyFeatureTagger, NerLocalFeatureWithoutMorphTagger, NerBasicMorphFeatureTagger and fex.NerSentenceFeatureTagger.
6. `model_morph_with_lemmas_and_sentences_and_gazzetteer` uses NerEmptyFeatureTagger, NerLocalFeatureWithoutMorphTagger, NerBasicMorphFeatureTagger, fex.NerSentenceFeatureTagger and fex.NerGazetteerFeatureTagger.
7. `model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features` uses NerEmptyFeatureTagger, NerLocalFeatureWithoutMorphTagger, NerBasicMorphFeatureTagger, fex.NerSentenceFeatureTagger, fex.NerGazetteerFeatureTagger and fex.NerGlobalContextFeatureTagger.

In [3]:
# Names of the model folders whose results are compared in this notebook.
models = [
    'model_default',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features',
]

### Folders for results:

Each model folder listed in the `models` variable contains several subfolders, one for each set of results. The subfolders are:

1. `model_initial` contains results from the initial model
2. `model_vabamorf` contains results using VabamorfCorpusTagger
3. `model_vabamorf_gazzetteer` contains results from using the "vallakohtud.csv" gazetteer
4. `model_vabamorf_gazzetteer2` contains results from using the gazetteer from the .tsv file
5. `model_vabamorf_gazzetteer1and2` contains results from using both of the aforementioned gazetteers

In [4]:
# Names of the results folders that may exist inside each model folder.
model_results = [
    'model_initial',
    'model_vabamorf',
    'model_vabamorf_gazzetteer',
    'model_vabamorf_gazzetteer2',
    'model_vabamorf_gazzetteer1and2',
]

In [5]:
def get_results_from_directory(model_directory, directory_for_results):
    """Load the "Total" results of one model / results-folder combination.

    Reads ``models/<model_directory>/<directory_for_results>/results.txt``,
    parses it as JSON and passes the parsed data to
    ``display_results_by_subdistribution`` and
    ``display_results_by_named_entity``.

    Args:
        model_directory: Name of the model folder inside ``models``.
        directory_for_results: Name of the results folder inside the model
            folder.

    Returns:
        A pair ``(by_subdistribution, by_named_entity)`` holding the
        ``"Total"`` entry of each helper's output. When ``results.txt`` does
        not exist, a notice is printed and ``({}, {})`` is returned instead.
    """
    results_path = os.path.join('models', model_directory, directory_for_results, 'results.txt')
    try:
        # JSON text is conventionally UTF-8; do not rely on the platform's
        # default encoding.
        with open(results_path, encoding='utf-8') as file:
            results_json = json.load(file)
    except FileNotFoundError:
        print(f"(!) Mudelil {model_directory} puuduvad tulemused kaustas {directory_for_results}.")
        return {}, {}

    # Evaluated outside the ``try`` so that only a missing results file is
    # reported as "no results"; errors raised by the helpers propagate.
    return (display_results_by_subdistribution(results_json)["Total"],
            display_results_by_named_entity(results_json)["Total"])

In [6]:
# Collect the "Total" results of every (model, results folder) combination.
# Each entry is stored under the key "<model>,<results folder>".
totals_by_subdistribution = {}
totals_by_named_entity = {}

for model_directory in models:
    for m_results in model_results:
        result_key = f"{model_directory},{m_results}"
        by_subdistribution, by_named_entity = get_results_from_directory(model_directory, m_results)
        totals_by_subdistribution[result_key] = by_subdistribution
        totals_by_named_entity[result_key] = by_named_entity

(!) Mudelil model_local_features_without_morph puuduvad tulemused kaustas model_vabamorf_gazzetteer.
(!) Mudelil model_local_features_without_morph puuduvad tulemused kaustas model_vabamorf_gazzetteer2.
(!) Mudelil model_local_features_without_morph puuduvad tulemused kaustas model_vabamorf_gazzetteer1and2.
(!) Mudelil model_morph_without_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer.
(!) Mudelil model_morph_without_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer2.
(!) Mudelil model_morph_without_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer1and2.
(!) Mudelil model_morph_with_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer.
(!) Mudelil model_morph_with_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer2.
(!) Mudelil model_morph_with_lemmas puuduvad tulemused kaustas model_vabamorf_gazzetteer1and2.
(!) Mudelil model_morph_with_lemmas_and_sentences puuduvad tulemused kaustas model_vabamorf_gazzetteer.
(!) Mudelil model_morph_wi

# Total values by model:

In [7]:
# One column per "<model>,<results folder>" key, one row per metric.
format_vertical_headers(
    pd.DataFrame(
        totals_by_subdistribution,
        index=["Precision", "Recall", "F1-score"],
    )
)

Unnamed: 0,"model_default,model_initial","model_default,model_vabamorf","model_default,model_vabamorf_gazzetteer","model_default,model_vabamorf_gazzetteer2","model_default,model_vabamorf_gazzetteer1and2","model_local_features_without_morph,model_initial","model_local_features_without_morph,model_vabamorf","model_local_features_without_morph,model_vabamorf_gazzetteer","model_local_features_without_morph,model_vabamorf_gazzetteer2","model_local_features_without_morph,model_vabamorf_gazzetteer1and2","model_morph_without_lemmas,model_initial","model_morph_without_lemmas,model_vabamorf","model_morph_without_lemmas,model_vabamorf_gazzetteer","model_morph_without_lemmas,model_vabamorf_gazzetteer2","model_morph_without_lemmas,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas,model_initial","model_morph_with_lemmas,model_vabamorf","model_morph_with_lemmas,model_vabamorf_gazzetteer","model_morph_with_lemmas,model_vabamorf_gazzetteer2","model_morph_with_lemmas,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences,model_initial","model_morph_with_lemmas_and_sentences,model_vabamorf","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_initial","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_initial","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer","model_morph_wit
h_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer1and2"
Precision,0.904527,0.905337,0.904561,0.903844,0.903961,0.864699,0.866032,,,,0.86989,0.867159,,,,0.899843,0.902091,,,,0.900211,0.900876,,,,0.898827,0.901721,0.90261,0.902124,0.903103,0.905881,0.906627,0.905739,0.905991,0.90572
Recall,0.874598,0.87507,0.874126,0.873782,0.873739,0.832153,0.832024,,,,0.845586,0.846616,,,,0.860993,0.86477,,,,0.86065,0.865499,,,,0.864727,0.865886,0.865113,0.865886,0.865585,0.873224,0.87301,0.87301,0.871036,0.873224
F1-score,0.889311,0.889946,0.889083,0.888559,0.888593,0.848114,0.848688,,,,0.857566,0.856764,,,,0.879989,0.883036,,,,0.879986,0.882833,,,,0.881447,0.88344,0.883464,0.883633,0.883946,0.889253,0.889501,0.889073,0.888169,0.889175


# Total values by named entity:

In [8]:
# One column per "<model>,<results folder>" key.
format_vertical_headers(
    pd.DataFrame(totals_by_named_entity)
)

Unnamed: 0,"model_default,model_initial","model_default,model_vabamorf","model_default,model_vabamorf_gazzetteer","model_default,model_vabamorf_gazzetteer2","model_default,model_vabamorf_gazzetteer1and2","model_local_features_without_morph,model_initial","model_local_features_without_morph,model_vabamorf","model_local_features_without_morph,model_vabamorf_gazzetteer","model_local_features_without_morph,model_vabamorf_gazzetteer2","model_local_features_without_morph,model_vabamorf_gazzetteer1and2","model_morph_without_lemmas,model_initial","model_morph_without_lemmas,model_vabamorf","model_morph_without_lemmas,model_vabamorf_gazzetteer","model_morph_without_lemmas,model_vabamorf_gazzetteer2","model_morph_without_lemmas,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas,model_initial","model_morph_with_lemmas,model_vabamorf","model_morph_with_lemmas,model_vabamorf_gazzetteer","model_morph_with_lemmas,model_vabamorf_gazzetteer2","model_morph_with_lemmas,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences,model_initial","model_morph_with_lemmas_and_sentences,model_vabamorf","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_initial","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences_and_gazzetteer,model_vabamorf_gazzetteer1and2","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_initial","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer","model_morph_wit
h_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer2","model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features,model_vabamorf_gazzetteer1and2"
ORG_precision,0.779428,0.774955,0.787858,0.781208,0.784107,0.687798,0.691862,,,,0.708073,0.707765,,,,0.764448,0.778035,,,,0.778933,0.782638,,,,0.785898,0.787822,0.787158,0.783904,0.783904,0.777907,0.782784,0.787554,0.787322,0.783717
ORG_recall,0.744606,0.742388,0.739477,0.742041,0.744781,0.693405,0.655949,,,,0.694753,0.702828,,,,0.717828,0.730557,,,,0.722055,0.727993,,,,0.737153,0.737558,0.726013,0.731026,0.731026,0.738891,0.738998,0.736738,0.739302,0.742041
ORG_f1score,0.761262,0.758191,0.762768,0.761042,0.763859,0.689991,0.673176,,,,0.700954,0.704915,,,,0.740377,0.753497,,,,0.749319,0.754216,,,,0.760706,0.761825,0.755239,0.756464,0.756464,0.757856,0.760079,0.761158,0.762462,0.762229
PER_precision,0.935972,0.93599,0.936415,0.935851,0.935967,0.902675,0.903203,,,,0.910621,0.908566,,,,0.931687,0.933332,,,,0.932308,0.932575,,,,0.930386,0.93267,0.933216,0.933751,0.93399,0.936061,0.935971,0.937599,0.937579,0.937769
PER_recall,0.925524,0.925934,0.926092,0.925873,0.925753,0.887524,0.886713,,,,0.899791,0.899029,,,,0.91343,0.915736,,,,0.912425,0.916315,,,,0.915206,0.916525,0.915994,0.916636,0.91672,0.925076,0.92444,0.92515,0.923938,0.925624
PER_f1score,0.930714,0.930929,0.931214,0.930825,0.930822,0.895002,0.89485,,,,0.905156,0.903757,,,,0.922458,0.924432,,,,0.922235,0.924358,,,,0.922721,0.924518,0.924511,0.925099,0.92526,0.930529,0.930162,0.931325,0.930705,0.931649
MISC_precision,0.740069,0.738009,0.739553,0.73367,0.73367,0.711702,0.697559,,,,0.719368,0.741873,,,,0.730591,0.721473,,,,0.731492,0.741493,,,,0.749587,0.719285,0.736455,0.731327,0.736455,0.74329,0.754867,0.76658,0.755292,0.760697
MISC_recall,0.649946,0.643153,0.625064,0.61935,0.61935,0.652229,0.620446,,,,0.624728,0.659474,,,,0.620904,0.611142,,,,0.622585,0.630064,,,,0.640533,0.619582,0.624942,0.62102,0.624942,0.621258,0.62946,0.64935,0.638635,0.643635
MISC_f1score,0.688467,0.683545,0.674957,0.66916,0.66916,0.676465,0.654738,,,,0.665965,0.697185,,,,0.668685,0.658599,,,,0.669542,0.678362,,,,0.688006,0.663239,0.673778,0.669334,0.673778,0.674346,0.684197,0.700536,0.689544,0.694739
LOC_precision,0.609861,0.620079,0.617052,0.617664,0.614459,0.491394,0.502194,,,,0.498547,0.491936,,,,0.595758,0.618043,,,,0.588593,0.601528,,,,0.598201,0.615282,0.613395,0.609479,0.609845,0.614576,0.619732,0.607115,0.611494,0.607616


# Compare models:
Which models would you like to compare?

In [18]:
# Keys of the two result sets to compare. Enter each one as
# 'model_name,results_folder', e.g. 'model_default,model_initial'.
old = 'model_default,model_initial'
new = 'model_default,model_vabamorf'

In [19]:
# Look up the "Total" results stored under the two selected keys.
old_by_subdistribution, new_by_subdistribution = (
    totals_by_subdistribution[old],
    totals_by_subdistribution[new],
)

In [21]:
# Difference between the `new` and `old` results; a positive value means
# that `new` scored higher.
print(new_by_subdistribution - old_by_subdistribution)

Precision    0.000810
Recall       0.000472
F1-score     0.000635
Name: Total, dtype: float64


# All models:

Choose the model and the results folder whose results you'd like to see:

In [11]:
# Available model names.
print(list(models))

['model_default', 'model_local_features_without_morph', 'model_morph_without_lemmas', 'model_morph_with_lemmas', 'model_morph_with_lemmas_and_sentences', 'model_morph_with_lemmas_and_sentences_and_gazzetteer', 'model_morph_with_lemmas_and_sentences_and_gazzetteer_and_global_features']


In [12]:
# Available results folder names.
print(list(model_results))

['model_initial', 'model_vabamorf', 'model_vabamorf_gazzetteer', 'model_vabamorf_gazzetteer2', 'model_vabamorf_gazzetteer1and2']


In [13]:
# Model folder and results folder whose detailed results are displayed.
model_name = 'model_default'
results_name = 'model_vabamorf_gazzetteer'

In [14]:
# Parse the selected results file; the file is closed before anything is
# rendered.
with open(os.path.join('models', model_name, results_name, 'results.txt')) as file:
    results_json = json.load(file)

# First the results by subdistribution, then the results by named entity.
display(display_results_by_subdistribution(results_json))
display(pd.DataFrame(display_results_by_named_entity(results_json)))

Alamhulk,1,2,3,4,5,Total
Precision,0.914043,0.91349,0.905759,0.882513,0.910745,0.904561
Recall,0.879867,0.885986,0.879448,0.845625,0.885053,0.874126
F1-score,0.896629,0.899528,0.89241,0.863675,0.897715,0.889083


Unnamed: 0,1,2,3,4,5,Total
ORG_precision,0.792453,0.777778,0.823529,0.764706,0.780822,0.787858
ORG_recall,0.711864,0.742424,0.8,0.712329,0.730769,0.739477
ORG_f1score,0.75,0.75969,0.811594,0.737589,0.754967,0.762768
PER_precision,0.941719,0.945304,0.939057,0.910916,0.945077,0.936415
PER_recall,0.928015,0.927487,0.925231,0.905605,0.944121,0.926092
PER_f1score,0.934817,0.936311,0.932092,0.908253,0.944599,0.931214
MISC_precision,0.655172,0.894737,0.675676,0.823529,0.648649,0.739553
MISC_recall,0.463415,0.666667,0.595238,0.8,0.6,0.625064
MISC_f1score,0.542857,0.764045,0.632911,0.811594,0.623377,0.674957
LOC_precision,0.555556,0.592814,0.680672,0.589552,0.666667,0.617052
