In [1]:
import pickle
from os.path import join
import pandas as pd
from copy import deepcopy
import re
from glob import glob
import os
import torch

In [2]:
data_dir = "/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local"

In [3]:
bpe_vs = ['500', '10000', '50000']
word_vs = ['words-lower', 'words-mix']
word_vocab_sizes = [5000, 10000, 25000]
models = ['bilstm', 'gcn_head']

In [4]:
def merge_dicts(d1, d2):
    res = deepcopy(d1)
    for k, v in d2.items():
        res[k] = v
    return res

In [5]:
def parse_file(f):
    if isinstance(f, str) and os.path.exists(f):
        with open(f) as f:
            lines = [l.strip() for l in f.readlines()]
    else:
        lines = [l.strip() for l in f.readlines()]
        
    res = {}
    buff = []
    covered = []
    keys = []
    for line in lines:
        line = line.split('|')[1]
        if "LR:" in line:
            if line not in covered:
                covered.append(line)
                key = line + ", AGGR: mean"
            else:
                key = line + ", AGGR: max"
            if keys:
                res[keys[-1]] = buff
            keys.append(key)
            buff = []
        elif "Epoch:" in line:
            buff.append(dict([param.strip().split(':') for param in line.split(',')]))
    
    res[key] = buff
    
    return res 

In [6]:
res = parse_file(join(data_dir, f"logs/gcn_head_vocab_10000_neg_sample_1_grid_search.log.log"))

## Bilstm

In [7]:
bilstm_pd_list = []

In [8]:
bilstm_dict = {}

for bpe_v in bpe_vs:
    with open(join(data_dir, f"grid_search/bilstm_vocab_{bpe_v}_neg_sample_1_grid_search.pkl"), 'rb') as f:
        bilstm_dict[bpe_v] = pickle.load(f)

for word_v in word_vs:
    for word_vocab_size in word_vocab_sizes:
        with open(join(data_dir, f"grid_search/bilstm_vocab_{word_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.pkl"), 'rb') as f:
            bilstm_dict[f'{word_v}|{word_vocab_size}'] = pickle.load(f)

In [9]:
for v, res_d in bilstm_dict.items():
    if v.startswith('word'):
        vocab, size = v.split("|")
        d = {'head_vocab': vocab, "word_vocab_size": size, "side_vocabs": "NA", "AGGR": "NA"}
    else:
        d = {'head_vocab': v, "word_vocab_size": -1, "side_vocabs": "NA", "AGGR": "NA"}
    for key, losses in res_d.items():
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            if not isinstance(loss_d, dict):
                loss_d = {"Accuracy": loss_d, "Train Loss": -1, "Valid Loss": -1}
            new_d = merge_dicts({'epoch': epoch, 'model': 'bilstm'}, loss_d)
            bilstm_pd_list.append(merge_dicts(d, new_d))

In [10]:
accs = sorted([float(rec['Accuracy'].numpy()) for rec in bilstm_pd_list], reverse=True)
len(accs), accs[:10]

(7039,
 [0.9615293741226196,
  0.9608191251754761,
  0.9606415629386902,
  0.9604048132896423,
  0.9603456258773804,
  0.9601088762283325,
  0.9599905014038086,
  0.9598721861839294,
  0.9597538113594055,
  0.9596354365348816])

## GCN

### Single head chain

In [11]:
gcn_pd_list = []

In [12]:
gcn_dict = {}

for bpe_v in bpe_vs:
    with open(join(data_dir, f"logs/gcn_head_vocab_{bpe_v}_neg_sample_1_grid_search.log.log"), 'r') as f:
        gcn_dict[bpe_v] = parse_file(f)

for word_v in word_vs:
    for word_vocab_size in word_vocab_sizes:
        with open(join(data_dir, f"logs/gcn_head_vocab_{word_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.log.log"), 'r') as f:
            gcn_dict[f'{word_v}|{word_vocab_size}'] = parse_file(f)

In [13]:
for v, res_d in gcn_dict.items():
    if v.startswith('word'):
        vocab, size = v.split("|")
        d = {'head_vocab': vocab, "word_vocab_size": size, "side_vocabs": "NA"}
    else:
        d = {'head_vocab': v, "word_vocab_size": -1, "side_vocabs": "NA"}
    for key, losses in res_d.items():
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [14]:
gcn_dict.keys()

dict_keys(['500', '10000', '50000', 'words-lower|5000', 'words-lower|10000', 'words-lower|25000', 'words-mix|5000', 'words-mix|10000', 'words-mix|25000'])

In [15]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(2880,
 [0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584,
  0.958])

### 1 head and 1 BPE Chain

In [16]:
gcn_dict = {}

for word_v in word_vs:
    for bpe_v in bpe_vs:
        for word_vocab_size in word_vocab_sizes:
            with open(join(data_dir, f"logs/gcn_head_vocab_{word_v}_bpe_vocab_{bpe_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.log.log"), 'r') as f:
                gcn_dict[f'{word_v}|{bpe_v}|{word_vocab_size}'] = parse_file(f)

In [17]:
for v, res_d in gcn_dict.items():
    word_v, bpe_v, size = v.split("|")
    d = {'head_vocab': vocab, "word_vocab_size": size, "side_vocabs": bpe_v}
    for key, losses in res_d.items():
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [18]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(8640,
 [0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584,
  0.958])

### 1 head and multiple BPE Chains

In [19]:
f_names = glob(join(data_dir, f"logs/gcn_head*bpe_vocabs*"))

In [20]:
gcn_dict = {}
reg = re.compile(r".*gcn_head_vocab_(.*)_bpe_vocabs_(.*)_neg_sample_1_vocab_size_(.*)_grid_search")
for f_name in f_names:
    with open(f_name, "r") as f:
        res_d = parse_file(f)
    m = reg.match(f_name)
    
    word_vocab = m.group(1)
    bpe_vocabs = m.group(2)
    word_vocab_size = m.group(3)
    d = {'head_vocab': word_vocab, "word_vocab_size": word_vocab_size, "side_vocabs": bpe_vocabs}
    for key, losses in res_d.items():
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [21]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(16320,
 [0.9614,
  0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584])

## Analyze Dataframe

In [22]:
df = pd.DataFrame.from_dict(bilstm_pd_list + gcn_pd_list)

In [23]:
df.head()

Unnamed: 0,AGGR,Accuracy,DP,Epoch,LR,NUM_LAYERS,Train Loss,Valid Loss,WD,epoch,head_vocab,model,side_vocabs,word_vocab_size
0,,tensor(0.5203),0.3,,0.01,2,0.678797,0.710672,0.0005,0,500,bilstm,,-1
1,,tensor(0.4940),0.3,,0.01,2,0.667404,0.684273,0.0005,1,500,bilstm,,-1
2,,tensor(0.5798),0.3,,0.01,2,0.653367,0.651956,0.0005,2,500,bilstm,,-1
3,,tensor(0.5810),0.3,,0.01,2,0.649096,0.649752,0.0005,3,500,bilstm,,-1
4,,tensor(0.5790),0.3,,0.01,2,0.654959,0.676428,0.0005,4,500,bilstm,,-1


In [24]:
df ['Accuracy'] = df['Accuracy'].apply(lambda x: x.numpy() if isinstance(x, torch.Tensor) else float(x))

In [25]:
df.groupby(['model']).max()['Accuracy']

model
bilstm    0.961529
gcn       0.961400
Name: Accuracy, dtype: float64

In [26]:
df_gcn = df[df["model"] == "gcn"]
df_bilstm = df[df["model"] == "bilstm"]

In [31]:
df.groupby(['head_vocab']).max()[['Accuracy', 'side_vocabs']]

Unnamed: 0_level_0,Accuracy,side_vocabs
head_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,0.957741,
500,0.961529,
50000,0.837955,
words-lower,0.9614,
words-mix,0.9601,


In [29]:
df['side_vocabs'].unique()

array(['NA', '500', '10000', '50000', '50000-10000-500', '50000-10000',
       '500-10000', '500-50000'], dtype=object)

In [30]:
df.groupby(['side_vocabs']).max()['Accuracy']

side_vocabs
10000              0.956200
500                0.952900
500-10000          0.954500
500-50000          0.961400
50000              0.957700
50000-10000        0.954700
50000-10000-500    0.953400
NA                 0.961529
Name: Accuracy, dtype: float64

In [33]:
df[['Accuracy', 'DP', 'LR', 'Train Loss', 'Valid Loss', 'WD']].astype('float32').corr()

Unnamed: 0,Accuracy,DP,LR,Train Loss,Valid Loss,WD
Accuracy,1.0,-0.056601,-0.331849,-0.165755,-0.173204,-0.207216
DP,-0.056601,1.0,0.174094,0.022371,0.027693,0.115207
LR,-0.331849,0.174094,1.0,0.197108,0.216322,0.193904
Train Loss,-0.165755,0.022371,0.197108,1.0,0.941466,0.119574
Valid Loss,-0.173204,0.027693,0.216322,0.941466,1.0,0.128708
WD,-0.207216,0.115207,0.193904,0.119574,0.128708,1.0


## Simplified

In [43]:
f_names = glob(join(data_dir, "grid_search/model_gcn*"))

In [44]:
f_names

['/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-lower_side_vocabs_None_word_vocab_size_5000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-mix_side_vocabs_50000,10000_word_vocab_size_5000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-mix_side_vocabs_None_word_vocab_size_10000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-mix_side_vocabs_500,50000_word_vocab_size_25000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-mix_side_vocabs_500,10000,50000_word_vocab_size_25000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/grid_search/model_gcn_main_vocab_words-lower_side_vocabs_500_word_vocab_size_25000.pkl',
 '/nfs/team/nlp/users/rgupta/NMT/code/GN

In [45]:
records = []
for f_name in f_names:
    with open(f_name, "rb") as f:
        records.extend(pickle.load(f))

In [47]:
len(records)

3460

In [48]:
df = pd.DataFrame.from_dict(records)

In [51]:
df.iloc[338]

Accuracy              0.527344
DP                         0.3
Epoch                        8
LR                       0.005
NUM_LAYERS                   2
Train Loss            0.597887
Valid Loss            0.656703
WD                      0.0005
main_vocab         words-lower
model                      gcn
side_vocabs                500
word_vocab_size          25000
Name: 338, dtype: object

In [50]:
df['Accuracy'].sort_values(ascending=False)

338     0.527344
339     0.527344
1698    0.496094
996     0.492188
998     0.492188
119     0.484375
1669    0.480469
2779    0.480469
3369    0.472656
1489    0.468750
2549    0.468750
1037    0.460938
938     0.460938
3368    0.460938
909     0.457031
2539    0.457031
918     0.457031
248     0.453125
337     0.453125
1699    0.449219
2489    0.449219
1019    0.449219
2939    0.445312
118     0.445312
2399    0.445312
978     0.445312
929     0.445312
937     0.445312
2537    0.441406
78      0.441406
          ...   
1465    0.378906
1716    0.378906
1797    0.378906
1957    0.378906
735     0.378906
2818    0.378906
2817    0.378906
2319    0.378906
315     0.375000
399     0.375000
2819    0.375000
1799    0.375000
437     0.375000
716     0.375000
2418    0.375000
1798    0.375000
1837    0.375000
1958    0.375000
395     0.371094
816     0.371094
3327    0.371094
48      0.371094
818     0.371094
2433    0.371094
3308    0.367188
3307    0.367188
2653    0.367188
757     0.3671