In [1]:
import json
import os
import urllib
import urllib.parse
from glob import glob

import pandas as pd

## Models for all states with various feature combos

In [2]:
# Display names for the model identifiers that appear in the result labels.
better_col_names = dict(
    xgbClassifier='Simple Classification',
    xgbOrdinalClassifier='Ordinal Classification',
    xgbCoarseGrainedClassifier='Binned(3 Bins) Classification',
)

# Display names for the feature-set identifiers that appear in the result labels.
better_index_name = dict(
    babelkw='Babelfy KWs',
    kw='YAKE KWs',
    wikikw='Wikifier KWs',
    st='Sentence Transformer',
    tfidf='TF-IDF',
)


In [3]:
# Despite its name, `results_dir` is opened as a single file below.
results_dir = '../results/results_with_combos/accuracies'
parsed = {}
with open(results_dir) as f:
    # Every line must split on tabs into exactly two fields: label, accuracy text.
    # The accuracy text keeps its trailing newline here; it is stripped when parsed later.
    parsed.update(line.split('\t') for line in f)

In [4]:
# Turn each "<model>_<features>_<filtered>.<ext>" label plus its accuracy text into one row.
results = []
for label, raw_accuracy in parsed.items():
    stem = label.split('.')[0]  # drop everything from the first '.' onwards
    model, features, filtered_flag = stem.split('_')
    results.append((
        model,
        features,
        filtered_flag == 'True',        # only the literal string 'True' counts as filtered
        float(raw_accuracy.strip()),
    ))
df = pd.DataFrame(results, columns=['model', 'features', 'filtered', 'accuracy'])

In [5]:
# Filtered runs only, restricted to feature labels without a literal '+'
# (presumably the single, non-combined feature sets).
# Rows of the displayed table are feature sets, columns are models.
(
    df.loc[lambda d: d.filtered & ~d.features.str.contains('+', regex=False)]
    .drop(columns='filtered')
    .pivot(index='model', columns='features')
    .T
    .rename(columns=better_col_names, index=better_index_name)
    .style
    .highlight_max(color='lightgreen')
    .highlight_min(color='red')
)

Unnamed: 0_level_0,model,Simple Classification,Binned(3 Bins) Classification,Ordinal Classification
Unnamed: 0_level_1,features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,Babelfy KWs,0.12389,0.49626,0.119565
accuracy,YAKE KWs,0.121201,0.477443,0.111851
accuracy,Sentence Transformer,0.135227,0.504208,0.134058
accuracy,TF-IDF,0.127747,0.505259,0.116994
accuracy,Wikifier KWs,0.126578,0.479313,0.123422


In [6]:
# Unfiltered runs, all feature labels (including '+' combinations).
# Rows of the displayed table are feature sets, columns are models.
(
    df.loc[~df.filtered]
    .drop(columns='filtered')
    .pivot(index='model', columns='features')
    .T
    .rename(columns=better_col_names, index=better_index_name)
    .style
    .highlight_max(color='lightgreen')
    .highlight_min(color='red')
)

Unnamed: 0_level_0,model,Simple Classification,Binned(3 Bins) Classification,Ordinal Classification
Unnamed: 0_level_1,features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,Babelfy KWs,0.10534,0.479643,0.111768
accuracy,babelkw+kw,0.10594,0.480758,0.10954
accuracy,babelkw+wikikw,0.101997,0.476901,0.104397
accuracy,YAKE KWs,0.113397,0.467472,0.100626
accuracy,Sentence Transformer,0.118968,0.492415,0.114854
accuracy,st+babelkw,0.116997,0.487015,0.114854
accuracy,st+wikikw,0.11614,0.488557,0.116054
accuracy,TF-IDF,0.10954,0.4919,0.109454
accuracy,Wikifier KWs,0.103797,0.454273,0.106368
accuracy,wikikw+babelkw+kw,0.105511,0.476386,0.106711


## Models for each state

In [7]:
# Despite its name, `results_dir` is opened as a single file below.
results_dir = '../results/results_states/accuracies'
parsed = {}
with open(results_dir) as f:
    # Every line must split on tabs into exactly two fields: label, accuracy text.
    # The accuracy text keeps its trailing newline here; it is stripped when parsed later.
    parsed.update(line.split('\t') for line in f)

In [8]:
# Turn each "<model>_<features>_<filtered>_<state>.<ext>" label plus its accuracy text into one row.
results = []
for label, raw_accuracy in parsed.items():
    stem = label.split('.')[0]  # drop everything from the first '.' onwards
    model, features, filtered_flag, state = stem.split('_')
    results.append((
        model,
        features,
        filtered_flag == 'True',        # only the literal string 'True' counts as filtered
        state,
        float(raw_accuracy.strip()),
    ))
df = pd.DataFrame(results, columns=['model', 'features', 'filtered', 'state', 'accuracy'])

In [9]:
# Unfiltered per-state runs: rows of the displayed table are feature sets, columns are states.
# After the transpose the columns are state labels, so `better_col_names` only has an
# effect if a state label happens to equal one of its keys.
(
    df.loc[~df.filtered]
    .drop(columns=['filtered', 'model'])
    .pivot(index='state', columns='features')
    .T
    .rename(columns=better_col_names, index=better_index_name)
    .style
    .highlight_max(color='lightgreen')
    .highlight_min(color='red')
)

Unnamed: 0_level_0,state,baden-wuerttemberg,bayern,berlin,brandenburg,bremen,hamburg,hessen,mecklenburg-vorpommern,niedersachsen,nordrhein-westfalen,rheinland-pfalz,saarland,sachsen,sachsen-anhalt,schleswig-holstein,thueringen
Unnamed: 0_level_1,features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
accuracy,Babelfy KWs,0.186436,0.359714,0.177138,0.217791,0.250622,0.202519,0.382857,0.36723,0.213277,0.200244,0.267467,0.315308,0.28547,0.280452,0.241715,0.214712
accuracy,YAKE KWs,0.156814,0.30633,0.176279,0.200687,0.238685,0.190891,0.348571,0.295518,0.199153,0.199108,0.246673,0.238496,0.230769,0.227131,0.247563,0.225042
accuracy,Sentence Transformer,0.210357,0.415011,0.18657,0.229218,0.28377,0.229651,0.407619,0.423761,0.242938,0.23196,0.289729,0.348641,0.331624,0.312359,0.288499,0.244452
accuracy,Wikifier KWs,0.137695,0.294496,0.181431,0.246287,0.229426,0.18314,0.350476,0.26738,0.245763,0.191508,0.285311,0.288302,0.271795,0.281981,0.233918,0.247062


In [10]:
# Mean accuracy of each feature set across all state columns (unfiltered runs only).
(
    df.loc[~df.filtered]
    .drop(columns=['filtered', 'model'])
    .pivot(index='state', columns='features')
    .T
    .rename(columns=better_col_names, index=better_index_name)
    .mean(axis='columns')
)

          features            
accuracy  Babelfy KWs             0.260185
          YAKE KWs                0.232982
          Sentence Transformer    0.292260
          Wikifier KWs            0.245998
dtype: float64

In [11]:
# Unfiltered per-state runs with states as rows and feature sets as columns.
# Renaming the pivot directly with the two mappings swapped between axes gives the same
# frame as transposing, renaming and transposing back.
(
    df.loc[~df.filtered]
    .drop(columns=['filtered', 'model'])
    .pivot(index='state', columns='features')
    .rename(index=better_col_names, columns=better_index_name)
    .style
    .highlight_max(color='lightgreen')
    .highlight_min(color='red')
)

Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy
features,Babelfy KWs,YAKE KWs,Sentence Transformer,Wikifier KWs
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
baden-wuerttemberg,0.186436,0.156814,0.210357,0.137695
bayern,0.359714,0.30633,0.415011,0.294496
berlin,0.177138,0.176279,0.18657,0.181431
brandenburg,0.217791,0.200687,0.229218,0.246287
bremen,0.250622,0.238685,0.28377,0.229426
hamburg,0.202519,0.190891,0.229651,0.18314
hessen,0.382857,0.348571,0.407619,0.350476
mecklenburg-vorpommern,0.36723,0.295518,0.423761,0.26738
niedersachsen,0.213277,0.199153,0.242938,0.245763
nordrhein-westfalen,0.200244,0.199108,0.23196,0.191508


## Top Features for Each State

In [12]:

def go(pattern='../results/results_w_top_features/*top_features*.txt'):
    """Yield one record per line of every top-features file matching `pattern`.

    The file's base name, with the literal suffix '[top_features].txt' removed,
    must split on '_' into exactly four parts; the second part is used as the
    feature set and the fourth as the state.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the files to read. Defaults to the location the
        notebook originally hard-coded.

    Yields
    ------
    tuple
        (feature_set, state, feature_name, feature_rank), where feature_name is
        the stripped line and feature_rank is its 0-based position in the file.

    Raises
    ------
    ValueError
        If a matching file name does not split into exactly four '_' parts.
    """
    # glob returns matches in arbitrary order; sort so the resulting frame is reproducible.
    for filename in sorted(glob(pattern)):
        # os.path.basename handles the platform's separator, unlike splitting on '/'.
        stem = os.path.basename(filename).replace('[top_features].txt', '')
        _, feature_set, _, state = stem.split('_')
        # NOTE(review): the file is opened with the platform default encoding — confirm
        # the files' encoding if features contain non-ASCII characters.
        with open(filename) as f:
            for feature_rank, l in enumerate(f):
                feature_name = l.strip()
                yield feature_set, state, feature_name, feature_rank
# Collect every (feature_set, state, feature, rank) record into one frame.
df = pd.DataFrame(
    list(go()),
    columns=['feature_set', 'land', 'feature', 'feature_rank'],
)
# Relabel the 'kw' feature set as 'yake'.
df['feature_set'] = df['feature_set'].mask(df['feature_set'] == 'kw', 'yake')

In [13]:
def get_simple_feature(babel_feature):
    """Turn a JSON-encoded feature into a short, readable label.

    Parameters
    ----------
    babel_feature : str
        JSON text. The decoded object is read through its 'dbpedia' and 'text'
        keys, each of which is indexed at position 0.

    Returns
    -------
    str or None
        'babel/dbpedia: <name>' when the 'dbpedia' entry is non-empty, where
        <name> is the URL-unquoted last '/'-separated segment of its first item;
        otherwise 'babel/text: <text>' built from the URL-unquoted first item of
        'text'. None when the input is not valid JSON or does not decode to a
        JSON object.

    Raises
    ------
    KeyError
        If the decoded object lacks the 'dbpedia' key, or lacks 'text' when it is needed.
    """
    try:
        f = json.loads(babel_feature)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. None or NaN).
        # ValueError: covers json.JSONDecodeError for text that is not JSON.
        return None
    if not isinstance(f, dict):
        # Plain keywords such as "2020" or "null" are valid JSON but not objects.
        return None
    if f['dbpedia']:
        return 'babel/dbpedia: ' + urllib.parse.unquote(f['dbpedia'][0].split('/')[-1])
    else:
        return 'babel/text: ' + urllib.parse.unquote(f['text'][0])
    
# Default label for every row: "<feature_set>: <feature>".
df['simple_feature'] = df.feature_set + ': ' + df.feature
# Only 'babelkw' rows are run through get_simple_feature. Applying it to every row
# would also feed it plain keywords from the other feature sets, whose results were
# discarded by the masked assignment anyway.
is_babel = df.feature_set == 'babelkw'
df.loc[is_babel, 'simple_feature'] = df.loc[is_babel, 'feature'].apply(get_simple_feature)


In [14]:
# The 50 labels with the lowest median rank across all files they appear in.
(
    df.groupby('simple_feature')['feature_rank']
    .median()
    .sort_values()
    .head(50)
)

simple_feature
babel/dbpedia: Thermodynamic_free_energy     0.0
yake: Grundwissen                            0.5
yake: Blick                                  1.0
babel/dbpedia: Force                         1.5
yake: Energie                                4.0
babel/dbpedia: Fundamental_science           4.0
yake: Körper                                 4.5
wikikw: Elektron                             5.5
babel/text: größerer                         6.0
babel/dbpedia: Animation                     6.5
yake: Licht                                  7.0
babel/dbpedia: Light                         7.5
wikikw: Visuelle Wahrnehmung                 7.5
yake: Wichtigste                             7.5
yake: Abb                                    7.5
babel/dbpedia: Physical_body                 8.0
wikikw: Elektrischer Strom                   8.5
wikikw: Magnetismus                          9.0
yake: Kraft                                  9.0
babel/dbpedia: Motion_(physics)             10.0
yake: