In [115]:
import pickle
from os.path import join
import pandas as pd
from copy import deepcopy
import re
from glob import glob
import os
import torch

In [116]:
a = torch.randn(5, 4)

In [117]:
a

tensor([[-1.0832,  0.2328,  1.0506, -0.2467],
        [-0.2059,  0.5844, -0.5497,  0.8998],
        [-0.7502,  1.0566, -1.6788, -1.3847],
        [ 0.7480,  0.9562, -0.7243,  0.2264],
        [ 1.3163, -1.1475, -0.1838,  1.0982]])

In [118]:
m = torch.tensor([0,0,1,1,0])

In [123]:
m.unsqueeze(1).expand_as(a)

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [0, 0, 0, 0]])

In [119]:
m

tensor([0, 0, 1, 1, 0])

In [2]:
# Root directory that contains the `logs/` and `grid_search/` result folders
# read by the cells below. Set the GNN_SEMSIM_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original cluster path.
data_dir = os.environ.get(
    "GNN_SEMSIM_DATA_DIR",
    "/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/data",
)

In [3]:
bpe_vs = ['500', '10000', '50000']
word_vs = ['words-lower', 'words-mix']
word_vocab_sizes = [5000, 10000, 25000]
models = ['bilstm', 'gcn_head']

In [4]:
def merge_dicts(d1, d2):
    """Return a new dict: a deep copy of ``d1`` overlaid with the items of ``d2``.

    On key collisions the value from ``d2`` wins. Only ``d1`` is deep-copied;
    values taken from ``d2`` are stored by reference. Neither input is mutated.
    """
    merged = deepcopy(d1)
    merged.update(d2.items())
    return merged

In [5]:
def parse_file(f):
    """Parse a grid-search training log into per-configuration epoch records.

    Only the second ``|``-separated field of each line is inspected. A field
    containing ``"LR:"`` starts a new configuration; a field containing
    ``"Epoch:"`` is parsed as comma-separated ``name:value`` pairs and appended
    to the current configuration. Lines with no ``|`` separator (blank lines,
    stray output) are skipped.

    The first time a given header text is seen its key gets the suffix
    ``", AGGR: mean"``; every later occurrence of the same text gets
    ``", AGGR: max"``.

    Parameters
    ----------
    f : str, os.PathLike or file-like
        Path of the log file, or an open text stream exposing ``readlines()``.

    Returns
    -------
    dict
        Maps each configuration key to the list of per-epoch dicts, in the
        order they appear. Values in the epoch dicts are the raw text after
        the first ``:`` (not stripped). Empty when the log has no header line.

    Raises
    ------
    FileNotFoundError
        If ``f`` is a path that does not exist.
    ValueError
        If an ``Epoch:`` field contains a segment without a ``:``.
    """
    if isinstance(f, (str, os.PathLike)):
        # Open paths ourselves so a bad path fails with a clear OS error
        # instead of an AttributeError on ``str.readlines``.
        with open(f) as handle:
            raw_lines = handle.readlines()
    else:
        raw_lines = f.readlines()

    res = {}
    seen_headers = set()
    current_key = None
    buff = []
    for raw_line in raw_lines:
        fields = raw_line.strip().split('|')
        if len(fields) < 2:
            # Not a "<prefix>|<payload>" log line; nothing to parse.
            continue
        line = fields[1]
        if "LR:" in line:
            # A new header closes the previous configuration, if any.
            if current_key is not None:
                res[current_key] = buff
            if line in seen_headers:
                current_key = line + ", AGGR: max"
            else:
                seen_headers.add(line)
                current_key = line + ", AGGR: mean"
            buff = []
        elif "Epoch:" in line:
            # Split on the first ':' only so values may themselves contain ':'.
            buff.append(dict(param.strip().split(':', 1) for param in line.split(',')))

    # Flush the last configuration; a log without headers yields an empty dict.
    if current_key is not None:
        res[current_key] = buff

    return res

In [6]:
res = parse_file(join(data_dir, f"logs/gcn_head_vocab_10000_neg_sample_1_grid_search.log.log"))

## Bilstm

In [7]:
bilstm_pd_list = []

In [8]:
# Load the pickled BiLSTM grid-search results, keyed by vocabulary setting:
#   "<bpe size>"                     for each entry of `bpe_vs`
#   "<word vocab>|<word vocab size>" for each (word_vs, word_vocab_sizes) pair
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
bilstm_dict = {}

for bpe_v in bpe_vs:
    with open(join(data_dir, f"grid_search/bilstm_vocab_{bpe_v}_neg_sample_1_grid_search.pkl"), 'rb') as f:
        bilstm_dict[bpe_v] = pickle.load(f)

for word_v in word_vs:
    for word_vocab_size in word_vocab_sizes:
        with open(join(data_dir, f"grid_search/bilstm_vocab_{word_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.pkl"), 'rb') as f:
            bilstm_dict[f'{word_v}|{word_vocab_size}'] = pickle.load(f)

In [9]:
# Flatten `bilstm_dict` into `bilstm_pd_list`: one flat record per
# (vocabulary setting, hyper-parameter key, epoch).
for v, res_d in bilstm_dict.items():
    # Keys starting with "word" have the form "<vocab>|<size>"; any other key
    # is used as the head vocabulary itself, with -1 as the word-vocab size.
    if v.startswith('word'):
        vocab, size = v.split("|")
        d = {'head_vocab': vocab, "word_vocab_size": size, "side_vocabs": "NA", "AGGR": "NA"}
    else:
        d = {'head_vocab': v, "word_vocab_size": -1, "side_vocabs": "NA", "AGGR": "NA"}
    for key, losses in res_d.items():
        # `key` is split into comma-separated "name:value" pairs; `d` is reused
        # across keys, so each update overwrites the previous key's values.
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            # Non-dict entries are treated as a bare accuracy value; the
            # losses are then filled with the placeholder -1.
            if not isinstance(loss_d, dict):
                loss_d = {"Accuracy": loss_d, "Train Loss": -1, "Valid Loss": -1}
            new_d = merge_dicts({'epoch': epoch, 'model': 'bilstm'}, loss_d)
            bilstm_pd_list.append(merge_dicts(d, new_d))

In [10]:
accs = sorted([float(rec['Accuracy'].numpy()) for rec in bilstm_pd_list], reverse=True)
len(accs), accs[:10]

(7039,
 [0.9615293741226196,
  0.9608191251754761,
  0.9606415629386902,
  0.9604048132896423,
  0.9603456258773804,
  0.9601088762283325,
  0.9599905014038086,
  0.9598721861839294,
  0.9597538113594055,
  0.9596354365348816])

## GCN

### Single head chain

In [11]:
gcn_pd_list = []

In [12]:
# Parse the GCN grid-search logs for the single-head-chain runs, keyed by
# vocabulary setting:
#   "<bpe size>"                     for each entry of `bpe_vs`
#   "<word vocab>|<word vocab size>" for each (word_vs, word_vocab_sizes) pair
gcn_dict = {}

for bpe_v in bpe_vs:
    with open(join(data_dir, f"logs/gcn_head_vocab_{bpe_v}_neg_sample_1_grid_search.log.log"), 'r') as f:
        gcn_dict[bpe_v] = parse_file(f)

for word_v in word_vs:
    for word_vocab_size in word_vocab_sizes:
        with open(join(data_dir, f"logs/gcn_head_vocab_{word_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.log.log"), 'r') as f:
            gcn_dict[f'{word_v}|{word_vocab_size}'] = parse_file(f)

In [13]:
# Flatten the parsed single-head-chain logs into `gcn_pd_list`: one flat record
# per (vocabulary setting, hyper-parameter key, epoch).
for v, res_d in gcn_dict.items():
    # Keys starting with "word" have the form "<vocab>|<size>"; any other key
    # is used as the head vocabulary itself, with -1 as the word-vocab size.
    if v.startswith('word'):
        vocab, size = v.split("|")
        d = {'head_vocab': vocab, "word_vocab_size": size, "side_vocabs": "NA"}
    else:
        d = {'head_vocab': v, "word_vocab_size": -1, "side_vocabs": "NA"}
    for key, losses in res_d.items():
        # `key` is split into comma-separated "name:value" pairs; `d` is reused
        # across keys, so each update overwrites the previous key's values.
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [14]:
gcn_dict.keys()

dict_keys(['500', '10000', '50000', 'words-lower|5000', 'words-lower|10000', 'words-lower|25000', 'words-mix|5000', 'words-mix|10000', 'words-mix|25000'])

In [15]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(2880,
 [0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584,
  0.958])

### 1 head and 1 BPE Chain

In [16]:
# Parse the GCN logs for runs with one word-level head vocabulary plus one BPE
# side vocabulary. `gcn_dict` is rebuilt from scratch here, keyed by
# "<word vocab>|<bpe size>|<word vocab size>".
gcn_dict = {}

for word_v in word_vs:
    for bpe_v in bpe_vs:
        for word_vocab_size in word_vocab_sizes:
            with open(join(data_dir, f"logs/gcn_head_vocab_{word_v}_bpe_vocab_{bpe_v}_neg_sample_1_vocab_size_{word_vocab_size}_grid_search.log.log"), 'r') as f:
                gcn_dict[f'{word_v}|{bpe_v}|{word_vocab_size}'] = parse_file(f)

In [17]:
# Flatten the head + single-BPE-chain logs into `gcn_pd_list`: one flat record
# per (vocabulary setting, hyper-parameter key, epoch).
for v, res_d in gcn_dict.items():
    # Keys have the form "<word vocab>|<bpe size>|<word vocab size>".
    word_v, bpe_v, size = v.split("|")
    # Use the head vocabulary unpacked from this key (`word_v`), not a `vocab`
    # variable left over from an earlier cell's loop.
    d = {'head_vocab': word_v, "word_vocab_size": size, "side_vocabs": bpe_v}
    for key, losses in res_d.items():
        # `key` is split into comma-separated "name:value" pairs; `d` is reused
        # across keys, so each update overwrites the previous key's values.
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [18]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(8640,
 [0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584,
  0.958])

### 1 head and multiple BPE Chains

In [19]:
f_names = glob(join(data_dir, f"logs/gcn_head*bpe_vocabs*"))

In [20]:
# Flatten the head + multiple-BPE-chain logs into `gcn_pd_list`. The vocabulary
# settings are recovered from each file name rather than from a fixed grid.
gcn_dict = {}
reg = re.compile(r".*gcn_head_vocab_(.*)_bpe_vocabs_(.*)_neg_sample_1_vocab_size_(.*)_grid_search")
for f_name in f_names:
    # Validate the name before parsing: the glob that produced `f_names` is
    # looser than this pattern, and a failed match would otherwise surface as
    # an opaque AttributeError on `None.group` after the file was already read.
    m = reg.match(f_name)
    if m is None:
        raise ValueError(f"Cannot extract vocabulary settings from log file name: {f_name}")

    with open(f_name, "r") as f:
        res_d = parse_file(f)

    word_vocab = m.group(1)
    bpe_vocabs = m.group(2)
    word_vocab_size = m.group(3)
    d = {'head_vocab': word_vocab, "word_vocab_size": word_vocab_size, "side_vocabs": bpe_vocabs}
    for key, losses in res_d.items():
        # `key` is split into comma-separated "name:value" pairs; `d` is reused
        # across keys, so each update overwrites the previous key's values.
        d.update(dict([param.strip().split(':') for param in key.split(',')]))
        for epoch, loss_d in enumerate(losses):
            new_d = merge_dicts({'epoch': epoch, 'model': 'gcn'}, loss_d)
            gcn_pd_list.append(merge_dicts(d, new_d))

In [21]:
accs = sorted([float(rec['Accuracy'].strip()) for rec in gcn_pd_list], reverse=True)
len(accs), accs[:10]

(16320,
 [0.9614,
  0.961,
  0.9601,
  0.9596,
  0.9595,
  0.9592,
  0.9592,
  0.9587,
  0.9585,
  0.9584])

## Analyze Dataframe

In [22]:
df = pd.DataFrame.from_dict(bilstm_pd_list + gcn_pd_list)

In [23]:
df.head()

Unnamed: 0,AGGR,Accuracy,DP,Epoch,LR,NUM_LAYERS,Train Loss,Valid Loss,WD,epoch,head_vocab,model,side_vocabs,word_vocab_size
0,,tensor(0.5203),0.3,,0.01,2,0.678797,0.710672,0.0005,0,500,bilstm,,-1
1,,tensor(0.4940),0.3,,0.01,2,0.667404,0.684273,0.0005,1,500,bilstm,,-1
2,,tensor(0.5798),0.3,,0.01,2,0.653367,0.651956,0.0005,2,500,bilstm,,-1
3,,tensor(0.5810),0.3,,0.01,2,0.649096,0.649752,0.0005,3,500,bilstm,,-1
4,,tensor(0.5790),0.3,,0.01,2,0.654959,0.676428,0.0005,4,500,bilstm,,-1


In [24]:
# Normalise Accuracy to plain Python floats. The column mixes torch tensors
# (BiLSTM records) with values parsed from text logs (GCN records); float()
# handles both, whereas Tensor.numpy() returns a 0-d array and raises for
# tensors that are on GPU or require grad.
df['Accuracy'] = df['Accuracy'].apply(lambda x: float(x))

In [25]:
df.groupby(['model']).max()['Accuracy']

model
bilstm    0.961529
gcn       0.961400
Name: Accuracy, dtype: float64

In [26]:
df_gcn = df[df["model"] == "gcn"]
df_bilstm = df[df["model"] == "bilstm"]

In [31]:
df.groupby(['head_vocab']).max()[['Accuracy', 'side_vocabs']]

Unnamed: 0_level_0,Accuracy,side_vocabs
head_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,0.957741,
500,0.961529,
50000,0.837955,
words-lower,0.9614,
words-mix,0.9601,


In [29]:
df['side_vocabs'].unique()

array(['NA', '500', '10000', '50000', '50000-10000-500', '50000-10000',
       '500-10000', '500-50000'], dtype=object)

In [30]:
df.groupby(['side_vocabs']).max()['Accuracy']

side_vocabs
10000              0.956200
500                0.952900
500-10000          0.954500
500-50000          0.961400
50000              0.957700
50000-10000        0.954700
50000-10000-500    0.953400
NA                 0.961529
Name: Accuracy, dtype: float64

In [33]:
df[['Accuracy', 'DP', 'LR', 'Train Loss', 'Valid Loss', 'WD']].astype('float32').corr()

Unnamed: 0,Accuracy,DP,LR,Train Loss,Valid Loss,WD
Accuracy,1.0,-0.056601,-0.331849,-0.165755,-0.173204,-0.207216
DP,-0.056601,1.0,0.174094,0.022371,0.027693,0.115207
LR,-0.331849,0.174094,1.0,0.197108,0.216322,0.193904
Train Loss,-0.165755,0.022371,0.197108,1.0,0.941466,0.119574
Valid Loss,-0.173204,0.027693,0.216322,0.941466,1.0,0.128708
WD,-0.207216,0.115207,0.193904,0.119574,0.128708,1.0


## Simplified

In [110]:
f_names = glob(join(data_dir, "grid_search/model_gcn*"))

In [113]:
records = []
for f_name in f_names:
    with open(f_name, "rb") as f:
        records.extend(pickle.load(f))

In [115]:
df = pd.DataFrame.from_dict(records)

In [116]:
df.head()

Unnamed: 0,Accuracy,DP,Epoch,LR,NUM_LAYERS,Train Loss,Valid Loss,WD,main_vocab,model,side_vocabs,word_vocab_size
0,0.804297,0.3,0,0.001,2,1.187878,0.429437,5e-05,words-lower,gcn,10000500,25000
1,0.85319,0.3,1,0.001,2,0.392437,0.343223,5e-05,words-lower,gcn,10000500,25000
2,0.878646,0.3,2,0.001,2,0.334639,0.300368,5e-05,words-lower,gcn,10000500,25000
3,0.890365,0.3,3,0.001,2,0.299826,0.269641,5e-05,words-lower,gcn,10000500,25000
4,0.897786,0.3,4,0.001,2,0.275134,0.255407,5e-05,words-lower,gcn,10000500,25000


In [117]:
df.groupby('main_vocab').max()['Accuracy']

main_vocab
10000          0.944987
500            0.942383
50000          0.943099
words-lower    0.944596
words-mix      0.942578
Name: Accuracy, dtype: float64

In [118]:
df.groupby('side_vocabs').max()[['Accuracy', 'main_vocab']]

Unnamed: 0_level_0,Accuracy,main_vocab
side_vocabs,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,0.938867,words-mix
10000500,0.942708,words-mix
1000050000,0.936589,words-mix
500,0.938281,words-mix
50010000,0.944596,words-mix
5001000050000,0.933919,words-mix
50050000,0.941797,words-mix
50000,0.938802,words-mix
5000010000,0.937305,words-mix
50000500,0.942578,words-mix


In [101]:
def create_num_side_vocabs(row):
    """Count the comma-separated side vocabularies listed in ``row['side_vocabs']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'side_vocabs'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        0 when the value is missing (``None``, NaN/NA) or equal to 0;
        otherwise the number of comma-separated items in the string.

    Raises
    ------
    TypeError
        If the value is neither a string nor a missing/zero marker.
    """
    side_vocabs = row['side_vocabs']
    if side_vocabs is None:
        return 0
    if isinstance(side_vocabs, str):
        return len(side_vocabs.split(','))
    # Missing values arrive as NaN until the column is filled; NaN is not equal
    # to None or 0, so it needs an explicit check before the zero comparison.
    if pd.api.types.is_scalar(side_vocabs) and (pd.isna(side_vocabs) or side_vocabs == 0):
        return 0
    raise TypeError(f"Unsupported side_vocabs value: {side_vocabs!r}")

In [120]:
# Add a per-row count of side vocabularies.
df['num_side_vocabs'] = df.apply(create_num_side_vocabs, axis=1)

In [121]:
df.head()

Unnamed: 0,Accuracy,DP,Epoch,LR,NUM_LAYERS,Train Loss,Valid Loss,WD,main_vocab,model,side_vocabs,word_vocab_size,num_side_vocabs
0,0.804297,0.3,0,0.001,2,1.187878,0.429437,5e-05,words-lower,gcn,10000500,25000,2
1,0.85319,0.3,1,0.001,2,0.392437,0.343223,5e-05,words-lower,gcn,10000500,25000,2
2,0.878646,0.3,2,0.001,2,0.334639,0.300368,5e-05,words-lower,gcn,10000500,25000,2
3,0.890365,0.3,3,0.001,2,0.299826,0.269641,5e-05,words-lower,gcn,10000500,25000,2
4,0.897786,0.3,4,0.001,2,0.275134,0.255407,5e-05,words-lower,gcn,10000500,25000,2


In [122]:
# Replace missing side vocabularies with 0. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: that chained form is
# deprecated and does not update `df` under pandas Copy-on-Write.
df['side_vocabs'] = df['side_vocabs'].fillna(0)

In [123]:
df.groupby('num_side_vocabs').max()[['Accuracy', 'main_vocab']]

Unnamed: 0_level_0,Accuracy,main_vocab
num_side_vocabs,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.944987,words-mix
1,0.938867,words-mix
2,0.944596,words-mix
3,0.933919,words-mix


In [124]:
df.groupby(['word_vocab_size', 'side_vocabs']).max()[['Accuracy']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
word_vocab_size,side_vocabs,Unnamed: 2_level_1
-1,0,0.944987
5000,0,0.931641
5000,10000,0.9375
5000,10000500,0.942708
5000,1000050000,0.93457
5000,500,0.938281
5000,50010000,0.940365
5000,5001000050000,0.933464
5000,50050000,0.941797
5000,50000,0.93763


## Ranking Data

In [82]:
def _neg_sample_from_name(f_name):
    """Return the negative-sample tag encoded at the start of the last ``_`` token."""
    last_token = f_name.split('_')[-1]
    # Take the whole leading run of digits so multi-digit counts (e.g. "10")
    # are not truncated to their first character; fall back to the first
    # character when the token does not start with a digit.
    digits = re.match(r'\d+', last_token)
    return digits.group(0) if digits else last_token[:1]


def get_df(f_names):
    """Load pickled record lists from ``f_names`` into a single DataFrame.

    Each file is expected to unpickle to a list of dict records. Every record
    gets a ``'neg_sample'`` entry (a string) derived from the file name: the
    leading digits of the text after the last underscore.

    Parameters
    ----------
    f_names : iterable of str
        Paths of the pickle files to load.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order; empty when ``f_names`` is empty.
    """
    all_records = []
    for f_name in f_names:
        neg_sample = _neg_sample_from_name(f_name)
        with open(f_name, "rb") as f:
            # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
            records = pickle.load(f)
        for rec in records:
            rec['neg_sample'] = neg_sample
        all_records.extend(records)

    return pd.DataFrame.from_dict(all_records)

In [83]:
gcn_f_names = glob(join(data_dir, "grid_search/ranking_gcn*"))
bilstm_f_names = glob(join(data_dir, "grid_search/ranking_bilstm*"))
metrics = ['mrr', 1, 10, 100]

In [84]:
df_gcn = get_df(gcn_f_names)
df_bilstm = get_df(bilstm_f_names)
df = pd.concat([df_gcn, df_bilstm])

In [85]:
# Replace missing side vocabularies with 0 in all three frames. Assign the
# result back instead of calling fillna(inplace=True) on the selected column:
# that chained form is deprecated and does not update the frame under pandas
# Copy-on-Write.
df_gcn['side_vocabs'] = df_gcn['side_vocabs'].fillna(0)
df_bilstm['side_vocabs'] = df_bilstm['side_vocabs'].fillna(0)
df['side_vocabs'] = df['side_vocabs'].fillna(0)

In [102]:
# Add a per-row count of side vocabularies to the combined frame and to each
# per-model frame.
df['num_side_vocabs'] = df.apply(create_num_side_vocabs, axis=1)
df_gcn['num_side_vocabs'] = df_gcn.apply(create_num_side_vocabs, axis=1)
df_bilstm['num_side_vocabs'] = df_bilstm.apply(create_num_side_vocabs, axis=1)

In [103]:
len(df_gcn), len(df_bilstm), len(df)

(59980, 5423, 65403)

In [104]:
df.head()

Unnamed: 0,DP,LR,NUM_LAYERS,WD,main_vocab,model,side_vocabs,word_vocab_size,1,2,5,10,100,mrr,Valid Loss,Train Loss,Epoch,neg_sample,num_side_vocabs
0,0.3,0.001,2,5e-05,words-mix,gcn,10000500,5000,0.0308625692429438,0.0584278554471115,0.1268794513321023,0.2143233975204431,0.6595884990767608,0.093643,0.940721,0.227707,0,1,2
1,0.3,0.001,2,5e-05,words-mix,gcn,10000500,5000,0.0374571353204959,0.0679240305987866,0.1442891057768398,0.2404378791875494,0.6909786336059087,0.10503,0.88566,0.148418,1,1,2
2,0.3,0.001,2,5e-05,words-mix,gcn,10000500,5000,0.0547348984436824,0.0972039039831179,0.1883408071748878,0.3038776048536006,0.7570561857029807,0.135423,0.758593,0.138419,2,1,2
3,0.3,0.001,2,5e-05,words-mix,gcn,10000500,5000,0.0584278554471115,0.1014244262727512,0.2090477446584014,0.3309153257715642,0.761276707992614,0.143215,0.828607,0.132395,3,1,2
4,0.3,0.001,2,5e-05,words-mix,gcn,10000500,5000,0.0683197045634397,0.119757319968346,0.2251384858876285,0.3470060670007913,0.7777631231864943,0.157221,0.676527,0.128819,4,1,2


In [105]:
df_bilstm.groupby(['main_vocab']).max()[metrics + ['model']]

Unnamed: 0_level_0,mrr,1,10,100,model
main_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,0.690197,0.605645,0.844764,0.961883,bilstm
500,0.645487,0.5579,0.808889,0.955421,bilstm
50000,0.719518,0.641915,0.86125,0.968082,bilstm
words-lower,0.634933,0.55157,0.789106,0.934054,bilstm
words-mix,0.624864,0.540491,0.77618,0.925086,bilstm


In [106]:
df_gcn.groupby(['main_vocab']).max()[metrics + ['model']]

Unnamed: 0_level_0,mrr,1,10,100,model
main_vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,0.541391,0.407676,0.793986,0.951992,gcn
500,0.466673,0.328541,0.736613,0.935109,gcn
50000,0.522367,0.387101,0.787391,0.954893,gcn
words-lower,0.520247,0.391849,0.774334,0.946188,gcn
words-mix,0.516584,0.386705,0.767476,0.945793,gcn


In [107]:
df_bilstm.groupby('side_vocabs').max()[metrics + ['main_vocab']]

Unnamed: 0_level_0,mrr,1,10,100,main_vocab
side_vocabs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.719518,0.641915,0.86125,0.968082,words-mix


In [108]:
df_gcn.groupby('side_vocabs').max()[metrics + ['main_vocab']]

Unnamed: 0_level_0,mrr,1,10,100,main_vocab
side_vocabs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.541391,0.407676,0.793986,0.954893,words-mix
10000,0.505135,0.370087,0.774334,0.946188,words-mix
10000500,0.473503,0.335795,0.742284,0.941045,words-mix
1000050000,0.470298,0.339224,0.738591,0.938407,words-mix
500,0.372667,0.226985,0.669348,0.9177,words-mix
50010000,0.487795,0.350567,0.754155,0.942232,words-mix
5001000050000,0.456202,0.316275,0.734898,0.941572,words-mix
50050000,0.491825,0.354656,0.755869,0.941176,words-mix
50000,0.520247,0.391849,0.767476,0.941704,words-mix
5000010000,0.471901,0.336982,0.738591,0.938011,words-mix


In [109]:
df_bilstm.groupby('neg_sample').max()[['mrr', 1, 10, 100]]

Unnamed: 0_level_0,mrr,1,10,100
neg_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.682834,0.59905,0.837906,0.96162
2,0.666507,0.581245,0.823793,0.957663
5,0.719518,0.641915,0.86125,0.968082


In [110]:
df_gcn.groupby('neg_sample').max()[['mrr', 1, 10, 100]]

Unnamed: 0_level_0,mrr,1,10,100
neg_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.541391,0.407676,0.793986,0.954893
2,0.451362,0.318913,0.726721,0.940913
5,0.532817,0.401873,0.789501,0.951992


In [112]:
df_gcn.groupby('num_side_vocabs').max()[metrics]

Unnamed: 0_level_0,mrr,1,10,100
num_side_vocabs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.541391,0.407676,0.793986,0.954893
1,0.520247,0.391849,0.774334,0.946188
2,0.491825,0.354656,0.755869,0.942627
3,0.456202,0.316275,0.734898,0.941572


In [114]:
df_gcn.groupby(['word_vocab_size', 'side_vocabs']).max()[metrics]

Unnamed: 0_level_0,Unnamed: 1_level_0,mrr,1,10,100
word_vocab_size,side_vocabs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,0,0.541391,0.407676,0.793986,0.954893
5000,0,0.406528,0.279873,0.663677,0.902928
5000,10000,0.505135,0.370087,0.763123,0.943023
5000,10000500,0.463875,0.323529,0.740438,0.941045
5000,1000050000,0.466227,0.332366,0.7287,0.938407
5000,500,0.372667,0.226985,0.669348,0.9177
5000,50010000,0.487795,0.350567,0.754155,0.942232
5000,5001000050000,0.456202,0.316275,0.734898,0.93445
5000,50050000,0.444995,0.304405,0.729623,0.941176
5000,50000,0.49535,0.364416,0.758243,0.941704
