In [15]:
import json
import pandas as pd
import numpy as np
import imgkit

In [9]:
# Load the 2019 evaluation statistics from the JSON dump into a DataFrame.
with open('stats-1563536857.json', 'r') as stats_file:
    table = json.load(stats_file)

df2019 = pd.DataFrame(table)

# Display the rows ordered by ascending MSE; df2019 itself is left unsorted
# because the sorted copy is not assigned back.
df2019.sort_values(by='MSE')

Unnamed: 0,MSE,model,pearson
19,0.491466,fasttext_skip_s1000,0.595615
9,0.492201,wang2vec_skip_s1000,0.594679
24,0.49444,fasttext_skip_s600,0.591606
14,0.497738,wang2vec_skip_s600,0.588351
0,0.497809,ELMo,0.58791
17,0.500397,fasttext_skip_s300,0.58479
30,0.504838,word2vec_cbow_s1000,0.581221
7,0.507129,wang2vec_skip_s300,0.577544
15,0.507934,wang2vec_cbow_s300,0.577094
28,0.511869,word2vec_cbow_s600,0.572974


In [10]:
# Read the space-separated results file and split it into raw text lines.
with open('hartmann_results.txt') as results_file:
    table = results_file.read().split('\n')

# Turn every non-blank line into a record. Columns are separated by runs of
# spaces, so empty tokens produced by split(' ') are discarded. All values
# are kept as strings here.
hartmann_results = []
for raw_line in table:
    fields = [token for token in raw_line.split(' ') if len(token)]
    if fields:
        hartmann_results.append({
            'model': fields[0],
            'ptbr-pearson-2017': fields[1],
            'ptbr-MSE-2017': fields[2],
            'pteu-pearson-2017': fields[3],
            'pteu-MSE-2017': fields[4]
        })

df2017 = pd.DataFrame(hartmann_results)

df2017

Unnamed: 0,model,ptbr-MSE-2017,ptbr-pearson-2017,pteu-MSE-2017,pteu-pearson-2017
0,fasttext_cbow_s50,0.66,0.36,1.05,0.34
1,fasttext_cbow_s100,0.66,0.37,1.04,0.36
2,fasttext_cbow_s300,0.65,0.38,1.03,0.37
3,fasttext_cbow_s600,0.68,0.33,1.02,0.38
4,fasttext_cbow_s1000,0.64,0.39,0.99,0.41
5,fasttext_skip_s50,0.61,0.45,0.98,0.43
6,fasttext_skip_s100,0.58,0.49,0.94,0.47
7,fasttext_skip_s300,0.53,0.55,1.02,0.4
8,fasttext_skip_s600,0.64,0.4,1.01,0.4
9,fasttext_skip_s1000,0.56,0.52,0.86,0.54


In [11]:
# Join the 2017 and 2019 results on the model name. ELMo is excluded from the
# merge and appended at the end with only its 2019 scores.
df2019_no_elmo = df2019[df2019.model != 'ELMo']
df = pd.merge(df2017, df2019_no_elmo, how='outer', on='model')
df = df.drop(['pteu-MSE-2017', 'pteu-pearson-2017'], axis=1)
df = df.rename(columns={"MSE": "ptbr-MSE-2019", "pearson": "ptbr-pearson-2019"})

# Select the two MSE columns by name rather than by position, so the selection
# does not silently change if the column order of the merged frame differs.
cols = ['ptbr-MSE-2017', 'ptbr-MSE-2019']
tmp = df[cols]
# The 2017 column is parsed from a text file and may hold strings, hence
# to_numeric before subtracting. Positive diff = lower MSE in 2019.
df['diff'] = pd.to_numeric(df['ptbr-MSE-2017']).sub(df['ptbr-MSE-2019'], axis=0)

# Summary statistics of the difference; Q1/Q3/IQR are read by
# highlight_outliers. They are computed before the ELMo row is added.
# NOTE(review): unlike quantile(), this mean does not skip NaN values.
mean = df['diff'].values.astype(float).mean()
Q1 = df['diff'].quantile(0.25)
Q3 = df['diff'].quantile(0.75)
IQR = Q3 - Q1

# Take the ELMo row positionally (.iloc[0]) instead of assuming that its
# index label is 0.
elmo_row = df2019.loc[df2019['model'] == 'ELMo'].iloc[0]
elmo = {
    'ptbr-MSE-2019': elmo_row['MSE'],
    'ptbr-pearson-2019': elmo_row['pearson'],
    'model': 'ELMo',
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Columns missing from the ELMo record (2017 scores, diff) become NaN.
df = pd.concat([df, pd.DataFrame([elmo])], ignore_index=True)

def highlight_outliers(data, color='yellow', q1=None, q3=None):
    """Build background-colour styles marking values outside the 1.5*IQR fences.

    Intended for use with ``DataFrame.style.apply``.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to test. A 1-D input is handled element by element; anything
        else is treated as a frame.
    color : str
        CSS colour used for the background of outlier cells.
    q1, q3 : float, optional
        First and third quartiles that define the fences. When omitted, the
        notebook-level ``Q1`` / ``Q3`` values are used, which matches the
        original behaviour.

    Returns
    -------
    list of str or pandas.DataFrame
        For 1-D input, a list of CSS strings; otherwise a DataFrame with the
        same index and columns as ``data``. Non-outliers get ``''``.
        NaN entries compare as False against both fences and are therefore
        never highlighted.
    """
    # Fall back to the quartiles computed at notebook level.
    if q1 is None:
        q1 = Q1
    if q3 is None:
        q3 = Q3
    fence = 1.5 * (q3 - q1)

    attr = 'background-color: {}'.format(color)
    is_out = (data < (q1 - fence)) | (data > (q3 + fence))

    if data.ndim == 1:
        return [attr if v else '' for v in is_out]
    return pd.DataFrame(np.where(is_out, attr, ''),
                        index=data.index, columns=data.columns)
   
# Apply the outlier highlighting to the 'diff' column only; axis=None hands
# the selected subset to the function as a whole frame.
style_df = df.style.apply(
    highlight_outliers,
    color='darkorange',
    axis=None,
    subset=['diff'],
)

In [12]:
# Render the styled comparison table built in the previous cell.
style_df

Unnamed: 0,model,ptbr-MSE-2017,ptbr-pearson-2017,ptbr-MSE-2019,ptbr-pearson-2019,diff
0,fasttext_cbow_s50,0.66,0.36,0.667274,0.350017,-0.00727383
1,fasttext_cbow_s100,0.66,0.37,0.659786,0.363861,0.000213743
2,fasttext_cbow_s300,0.65,0.38,0.65367,0.37477,-0.00367048
3,fasttext_cbow_s600,0.68,0.33,0.636966,0.404073,0.0430341
4,fasttext_cbow_s1000,0.64,0.39,0.638319,0.401842,0.00168077
5,fasttext_skip_s50,0.61,0.45,0.554217,0.520344,0.0557829
6,fasttext_skip_s100,0.58,0.49,0.5287,0.551922,0.0513003
7,fasttext_skip_s300,0.53,0.55,0.500397,0.58479,0.0296034
8,fasttext_skip_s600,0.64,0.4,0.49444,0.591606,0.14556
9,fasttext_skip_s1000,0.56,0.52,0.491466,0.595615,0.0685345


In [20]:
# Order the comparison by 2019 MSE, re-apply the outlier highlighting to the
# 'diff' column, write the styled table to Excel and display it.
df = df.sort_values(by='ptbr-MSE-2019')
style_df = df.style.apply(
    highlight_outliers,
    color='yellow',
    axis=None,
    subset=['diff'],
)
style_df.to_excel("sentence_similarity_sorted_by_ptbr-MSE-2019.xlsx")
style_df

Unnamed: 0,model,ptbr-MSE-2017,ptbr-pearson-2017,ptbr-MSE-2019,ptbr-pearson-2019,diff
9,fasttext_skip_s1000,0.56,0.52,0.491466,0.595615,0.0685345
19,wang2vec_skip_s1000,0.49,0.6,0.492201,0.594679,-0.00220142
8,fasttext_skip_s600,0.64,0.4,0.49444,0.591606,0.14556
18,wang2vec_skip_s600,0.49,0.59,0.497738,0.588351,-0.00773769
35,ELMo,,,0.497809,0.58791,
7,fasttext_skip_s300,0.53,0.55,0.500397,0.58479,0.0296034
29,word2vec_cbow_s1000,0.5,0.58,0.504838,0.581221,-0.00483789
17,wang2vec_skip_s300,0.5,0.58,0.507129,0.577544,-0.00712893
12,wang2vec_cbow_s300,0.55,0.53,0.507934,0.577094,0.0420665
28,word2vec_cbow_s600,0.51,0.57,0.511869,0.572974,-0.00186933


In [21]:
# Order the comparison by model name, re-apply the outlier highlighting to the
# 'diff' column, write the styled table to Excel and display it.
df = df.sort_values(by='model')
style_df = df.style.apply(
    highlight_outliers,
    color='yellow',
    axis=None,
    subset=['diff'],
)
style_df.to_excel("sentence_similarity_sorted_by_model-MSE-2019.xlsx")
style_df

Unnamed: 0,model,ptbr-MSE-2017,ptbr-pearson-2017,ptbr-MSE-2019,ptbr-pearson-2019,diff
35,ELMo,,,0.497809,0.58791,
1,fasttext_cbow_s100,0.66,0.37,0.659786,0.363861,0.000213743
4,fasttext_cbow_s1000,0.64,0.39,0.638319,0.401842,0.00168077
2,fasttext_cbow_s300,0.65,0.38,0.65367,0.37477,-0.00367048
0,fasttext_cbow_s50,0.66,0.36,0.667274,0.350017,-0.00727383
3,fasttext_cbow_s600,0.68,0.33,0.636966,0.404073,0.0430341
6,fasttext_skip_s100,0.58,0.49,0.5287,0.551922,0.0513003
9,fasttext_skip_s1000,0.56,0.52,0.491466,0.595615,0.0685345
7,fasttext_skip_s300,0.53,0.55,0.500397,0.58479,0.0296034
5,fasttext_skip_s50,0.61,0.45,0.554217,0.520344,0.0557829
