In [1]:
import os
import json
import pandas as pd

# Root folder that holds the per-run result files.
path = '../sentence_similarity/results/'

# Collect every entry of every `result_*.json` file found under `path`
# (searched recursively, bottom-up) into one flat list of dicts.
records = []
for root, _dirs, files in os.walk(path, topdown=False):
    result_files = [
        name for name in files
        if name.startswith('result_') and name.endswith('.json')
    ]
    for name in result_files:
        with open(os.path.join(root, name), 'r') as f:
            # Each file holds a JSON list; append its items to the global list.
            records.extend(json.load(f))

In [2]:
# Peek at the first three loaded records to check their fields.
records[:3]

[{'test': 'NILC_with_unk_part2_../embeddings/NILC/glove/glove_s50.model',
  'pearson': 0.42257938140742624,
  'MSE': 0.6244417183463097,
  'lang': 'ptbr',
  'timestamp': 1581477557},
 {'test': 'NILC_with_unk_part2_../embeddings/NILC/glove/glove_s50.model',
  'pearson': 0.38050354189638347,
  'MSE': 1.0215650950803998,
  'lang': 'pteu',
  'timestamp': 1581477561},
 {'test': 'NILC_with_unk_part2_../embeddings/NILC/glove/glove_s50.model',
  'pearson': 0.5445138075738842,
  'MSE': 0.7978910898320511,
  'lang': 'assin2',
  'timestamp': 1581477566}]

In [3]:
import re
# Display labels for the raw `lang` codes found in the result records.
DATASET_LABELS = {
    'ptbr': 'ASSIN ( pt-BR )',
    'pteu': 'ASSIN ( pt-PT )',
    'assin2': 'ASSIN 2',
}

# Enrich every record in place with descriptive fields parsed from its
# `test` and `lang` values. A single space (' ') is used as the
# "not applicable" placeholder; the filtering cells compare against it.
for item in records:
    test_name = item['test']

    # Unknown codes keep their raw value instead of leaving 'dataset' unset:
    # a missing key becomes NaN in the DataFrame and pandas' groupby drops
    # NaN keys by default, which would silently discard those results.
    item['dataset'] = DATASET_LABELS.get(item['lang'], item['lang'])

    # Training architecture, detected from substrings of the test name.
    if 'cbow' in test_name:
        item['architecture'] = 'CBOW'
    elif 'skip' in test_name:
        item['architecture'] = 'skip-gram'
    else:
        item['architecture'] = ' '

    if 'embeddings' in test_name:
        # The test name ends with a model path such as
        # ".../glove/glove_s50.model": the parent folder names the embedding
        # and the first number in the file name is taken as its dimension.
        path_parts = test_name.split('/')
        item['embedding'] = path_parts[-2]
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        item['dimensions'] = int(re.findall(r'\d+', path_parts[-1])[0])
    else:
        item['embedding'] = ' '
        item['dimensions'] = ' '

    # ELMo variant, if the test used ELMo at all.
    if 'ELMo' in test_name:
        if 'custom1' in test_name:
            item['ELMo'] = 'wiki (reduced)'
        elif 'custom2' in test_name:
            item['ELMo'] = 'BRWAC'
        else:
            item['ELMo'] = 'wiki'
    else:
        item['ELMo'] = ' '

    # Flag runs whose test name mentions "unk".
    item['unk'] = 'unk' in test_name

In [4]:
import numpy as np
# Index levels that identify one experimental configuration.
GROUP_KEYS = ['dataset', 'ELMo', 'embedding', 'unk', 'architecture', 'dimensions']

# Average the scores of all runs sharing a configuration.
# - Rounding is applied AFTER averaging, so the mean is computed on the
#   full-precision scores rather than on values already cut to two decimals.
# - `groupby(...)[cols].mean()` aggregates only the two score columns, which
#   avoids applying a whole-frame reduction to the string-valued columns.
df = (
    pd.DataFrame(records)
    .rename(columns={'pearson': 'PCC'})
    .groupby(GROUP_KEYS)[['PCC', 'MSE']]
    .mean()
    .round(2)
)

# Persist the summary table; the target directory must already exist.
df.to_csv('../reports/evaluation.csv')

In [5]:
# fastText skip-gram results without ELMo, leaving out the 'ASSIN 2' rows
# and every dataset whose label contains 'pt-PT'.
lvl = df.index.get_level_values
mask = (
    (lvl('embedding') == 'fasttext')
    & (lvl('architecture') == 'skip-gram')
    & (lvl('ELMo') == ' ')
    & (lvl('dataset') != 'ASSIN 2')
    & ~lvl('dataset').str.contains('pt-PT')
)
view = df[mask]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN ( pt-BR ),,fasttext,False,skip-gram,50,0.52,0.55
ASSIN ( pt-BR ),,fasttext,False,skip-gram,100,0.55,0.53
ASSIN ( pt-BR ),,fasttext,False,skip-gram,300,0.58,0.5
ASSIN ( pt-BR ),,fasttext,False,skip-gram,600,0.59,0.49
ASSIN ( pt-BR ),,fasttext,False,skip-gram,1000,0.6,0.49
ASSIN ( pt-BR ),,fasttext,True,skip-gram,50,0.45,0.61
ASSIN ( pt-BR ),,fasttext,True,skip-gram,100,0.48,0.58
ASSIN ( pt-BR ),,fasttext,True,skip-gram,300,0.51,0.56
ASSIN ( pt-BR ),,fasttext,True,skip-gram,600,0.52,0.55
ASSIN ( pt-BR ),,fasttext,True,skip-gram,1000,0.52,0.56


In [6]:
# Print the LaTeX source of the current `view` table.
latex_table = view.to_latex()
print(latex_table)

\begin{tabular}{llllllrr}
\toprule
                &   &          &       &           &      &   PCC &   MSE \\
dataset & ELMo & embedding & unk & architecture & dimensions &       &       \\
\midrule
ASSIN ( pt-BR ) &   & fasttext & False & skip-gram & 50   &  0.52 &  0.55 \\
                &   &          &       &           & 100  &  0.55 &  0.53 \\
                &   &          &       &           & 300  &  0.58 &  0.50 \\
                &   &          &       &           & 600  &  0.59 &  0.49 \\
                &   &          &       &           & 1000 &  0.60 &  0.49 \\
                &   &          & True  & skip-gram & 50   &  0.45 &  0.61 \\
                &   &          &       &           & 100  &  0.48 &  0.58 \\
                &   &          &       &           & 300  &  0.51 &  0.56 \\
                &   &          &       &           & 600  &  0.52 &  0.55 \\
                &   &          &       &           & 1000 &  0.52 &  0.56 \\
\bottomrule
\end{tabular}



In [7]:
# word2vec skip-gram results without ELMo, leaving out the 'ASSIN 2' rows
# and every dataset whose label contains 'pt-PT'.
lvl = df.index.get_level_values
mask = (
    (lvl('embedding') == 'word2vec')
    & (lvl('architecture') == 'skip-gram')
    & (lvl('ELMo') == ' ')
    & (lvl('dataset') != 'ASSIN 2')
    & ~lvl('dataset').str.contains('pt-PT')
)
view = df[mask]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN ( pt-BR ),,word2vec,False,skip-gram,50,0.46,0.6
ASSIN ( pt-BR ),,word2vec,False,skip-gram,100,0.48,0.59
ASSIN ( pt-BR ),,word2vec,False,skip-gram,300,0.52,0.56
ASSIN ( pt-BR ),,word2vec,False,skip-gram,600,0.53,0.54
ASSIN ( pt-BR ),,word2vec,False,skip-gram,1000,0.54,0.54
ASSIN ( pt-BR ),,word2vec,True,skip-gram,50,0.46,0.6
ASSIN ( pt-BR ),,word2vec,True,skip-gram,100,0.48,0.58
ASSIN ( pt-BR ),,word2vec,True,skip-gram,300,0.52,0.56
ASSIN ( pt-BR ),,word2vec,True,skip-gram,600,0.53,0.54
ASSIN ( pt-BR ),,word2vec,True,skip-gram,1000,0.54,0.54


In [8]:
# Print the LaTeX source of the current `view` table.
latex_table = view.to_latex()
print(latex_table)

\begin{tabular}{llllllrr}
\toprule
                &   &          &       &           &      &   PCC &   MSE \\
dataset & ELMo & embedding & unk & architecture & dimensions &       &       \\
\midrule
ASSIN ( pt-BR ) &   & word2vec & False & skip-gram & 50   &  0.46 &  0.60 \\
                &   &          &       &           & 100  &  0.48 &  0.59 \\
                &   &          &       &           & 300  &  0.52 &  0.56 \\
                &   &          &       &           & 600  &  0.53 &  0.54 \\
                &   &          &       &           & 1000 &  0.54 &  0.54 \\
                &   &          & True  & skip-gram & 50   &  0.46 &  0.60 \\
                &   &          &       &           & 100  &  0.48 &  0.58 \\
                &   &          &       &           & 300  &  0.52 &  0.56 \\
                &   &          &       &           & 600  &  0.53 &  0.54 \\
                &   &          &       &           & 1000 &  0.54 &  0.54 \\
\bottomrule
\end{tabular}



In [9]:
# ELMo runs ('wiki' or 'wiki (reduced)'), either without a static embedding
# or paired with one of three 1000-dimensional embeddings, for every dataset
# except 'ASSIN 2'.
# NOTE(review): no filter on `unk` takes effect in this selection — confirm
# that including unk=True rows (if any exist) is intended.
lvl = df.index.get_level_values
embedding = lvl('embedding')
architecture = lvl('architecture')

uses_elmo = lvl('ELMo').isin(['wiki', 'wiki (reduced)'])
elmo_only = embedding == ' '
# Embedding/architecture pairs kept when combined with ELMo.
chosen_pair = (
    ((embedding == 'word2vec') & (architecture == 'CBOW'))
    | ((embedding == 'fasttext') & (architecture == 'skip-gram'))
    | ((embedding == 'glove') & (architecture == ' '))
)
is_1000d = lvl('dimensions') == 1000

mask = (
    uses_elmo
    & (elmo_only | (chosen_pair & is_1000d))
    & (lvl('dataset') != 'ASSIN 2')
)
view = df[mask]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN ( pt-BR ),wiki,,False,,,0.61,0.48
ASSIN ( pt-BR ),wiki (reduced),,False,,,0.62,0.47
ASSIN ( pt-BR ),wiki (reduced),fasttext,False,skip-gram,1000.0,0.64,0.45
ASSIN ( pt-BR ),wiki (reduced),glove,False,,1000.0,0.6,0.49
ASSIN ( pt-BR ),wiki (reduced),word2vec,False,CBOW,1000.0,0.63,0.45
ASSIN ( pt-PT ),wiki,,False,,,0.62,0.75
ASSIN ( pt-PT ),wiki (reduced),,False,,,0.63,0.74
ASSIN ( pt-PT ),wiki (reduced),fasttext,False,skip-gram,1000.0,0.61,0.76
ASSIN ( pt-PT ),wiki (reduced),glove,False,,1000.0,0.54,0.85
ASSIN ( pt-PT ),wiki (reduced),word2vec,False,CBOW,1000.0,0.65,0.72


In [10]:
# Print the LaTeX source of `view` with every literal 'False' blanked out.
latex_table = view.to_latex()
print(latex_table.replace('False', ''))

\begin{tabular}{llllllrr}
\toprule
                &                &          &       &      &      &   PCC &   MSE \\
dataset & ELMo & embedding & unk & architecture & dimensions &       &       \\
\midrule
ASSIN ( pt-BR ) & wiki &   &  &   &   &  0.61 &  0.48 \\
                & wiki (reduced) &   &  &   &   &  0.62 &  0.47 \\
                &                & fasttext &  & skip-gram & 1000 &  0.64 &  0.45 \\
                &                & glove &  &   & 1000 &  0.60 &  0.49 \\
                &                & word2vec &  & CBOW & 1000 &  0.63 &  0.45 \\
ASSIN ( pt-PT ) & wiki &   &  &   &   &  0.62 &  0.75 \\
                & wiki (reduced) &   &  &   &   &  0.63 &  0.74 \\
                &                & fasttext &  & skip-gram & 1000 &  0.61 &  0.76 \\
                &                & glove &  &   & 1000 &  0.54 &  0.85 \\
                &                & word2vec &  & CBOW & 1000 &  0.65 &  0.72 \\
\bottomrule
\end{tabular}



In [11]:
# ELMo runs ('wiki' or 'wiki (reduced)'), either without a static embedding
# or paired with one of three 1000-dimensional embeddings, restricted to the
# 'ASSIN 2' dataset.
# NOTE(review): no filter on `unk` takes effect in this selection — confirm
# that including unk=True rows (if any exist) is intended.
lvl = df.index.get_level_values
embedding = lvl('embedding')
architecture = lvl('architecture')

uses_elmo = lvl('ELMo').isin(['wiki', 'wiki (reduced)'])
elmo_only = embedding == ' '
# Embedding/architecture pairs kept when combined with ELMo.
chosen_pair = (
    ((embedding == 'word2vec') & (architecture == 'CBOW'))
    | ((embedding == 'fasttext') & (architecture == 'skip-gram'))
    | ((embedding == 'glove') & (architecture == ' '))
)
is_1000d = lvl('dimensions') == 1000

mask = (
    uses_elmo
    & (elmo_only | (chosen_pair & is_1000d))
    & (lvl('dataset') == 'ASSIN 2')
)
view = df[mask]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN 2,wiki,,False,,,0.55,0.8
ASSIN 2,wiki (reduced),,False,,,0.57,0.77
ASSIN 2,wiki (reduced),fasttext,False,skip-gram,1000.0,0.6,0.73
ASSIN 2,wiki (reduced),glove,False,,1000.0,0.61,0.71
ASSIN 2,wiki (reduced),word2vec,False,CBOW,1000.0,0.6,0.73


In [12]:
# Print the LaTeX source of `view` with every literal 'False' blanked out.
latex_table = view.to_latex()
print(latex_table.replace('False', ''))

\begin{tabular}{llllllrr}
\toprule
        &                &          &       &      &      &   PCC &   MSE \\
dataset & ELMo & embedding & unk & architecture & dimensions &       &       \\
\midrule
ASSIN 2 & wiki &   &  &   &   &  0.55 &  0.80 \\
        & wiki (reduced) &   &  &   &   &  0.57 &  0.77 \\
        &                & fasttext &  & skip-gram & 1000 &  0.60 &  0.73 \\
        &                & glove &  &   & 1000 &  0.61 &  0.71 \\
        &                & word2vec &  & CBOW & 1000 &  0.60 &  0.73 \\
\bottomrule
\end{tabular}



In [13]:
# Every word2vec skip-gram row of the summary, with no other restriction.
lvl = df.index.get_level_values
is_w2v_skipgram = (lvl('architecture') == 'skip-gram') & (lvl('embedding') == 'word2vec')
view = df[is_w2v_skipgram]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN ( pt-BR ),,word2vec,False,skip-gram,50,0.46,0.6
ASSIN ( pt-BR ),,word2vec,False,skip-gram,100,0.48,0.59
ASSIN ( pt-BR ),,word2vec,False,skip-gram,300,0.52,0.56
ASSIN ( pt-BR ),,word2vec,False,skip-gram,600,0.53,0.54
ASSIN ( pt-BR ),,word2vec,False,skip-gram,1000,0.54,0.54
ASSIN ( pt-BR ),,word2vec,True,skip-gram,50,0.46,0.6
ASSIN ( pt-BR ),,word2vec,True,skip-gram,100,0.48,0.58
ASSIN ( pt-BR ),,word2vec,True,skip-gram,300,0.52,0.56
ASSIN ( pt-BR ),,word2vec,True,skip-gram,600,0.53,0.54
ASSIN ( pt-BR ),,word2vec,True,skip-gram,1000,0.54,0.54


In [14]:
# Every GloVe row of the summary, with no other restriction.
is_glove = df.index.get_level_values('embedding') == 'glove'
view = df[is_glove]
view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,PCC,MSE
dataset,ELMo,embedding,unk,architecture,dimensions,Unnamed: 6_level_1,Unnamed: 7_level_1
ASSIN ( pt-BR ),,glove,False,,50,0.43,0.62
ASSIN ( pt-BR ),,glove,False,,100,0.46,0.6
ASSIN ( pt-BR ),,glove,False,,300,0.49,0.58
ASSIN ( pt-BR ),,glove,False,,600,0.51,0.56
ASSIN ( pt-BR ),,glove,False,,1000,0.52,0.56
ASSIN ( pt-BR ),,glove,True,,50,0.42,0.62
ASSIN ( pt-BR ),,glove,True,,100,0.45,0.6
ASSIN ( pt-BR ),,glove,True,,300,0.49,0.58
ASSIN ( pt-BR ),,glove,True,,600,0.5,0.57
ASSIN ( pt-BR ),,glove,True,,1000,0.51,0.56
