In [1]:
from glob import glob
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
import re
import nltk
from collections import defaultdict
from scipy import spatial

# Part 1: Dataset Preparation

In [2]:
data_path = "/home/shruti/Desktop/iitgn/courses/SEM2/ML/Project/code/PaperAcceptancePrediction/ICLR data/masterdata_unbalanced/"

years = [2017, 2018, 2019, 2020]
rev_dict = {}
paper_dict = {}
dec_dict = {}
iclr_arxiv_map = {}

for y in years:
    rev_dict[y] = pd.read_pickle(data_path + "off_rev_dict_{}.pkl".format(y))
    paper_dict[y] = pd.read_pickle(data_path + "papers_{}.pkl".format(y))
    dec_dict[y] = pd.read_pickle(data_path + "paper_decision_dict_{}.pkl".format(y))

iclr_arxiv_map = pd.read_pickle("./data/iclr_arxiv_map.pkl")

In [2]:
df = pd.read_excel("InputTestSet-Reviews16_Ann.xlsx")

In [3]:
df.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,0,2019_SJf_XhCqKm,Reject,The authors propose to use k-DPP to select a s...,0,,
1,1,2019_SJf_XhCqKm,Reject,"This paper covers the related work nicely, wit...",0,,
2,2,2019_SJf_XhCqKm,Reject,The rest of the paper are also clearly written.,0,,
3,3,2019_SJf_XhCqKm,Reject,"However, I have some concerns about the propos...",0,,
4,4,2019_SJf_XhCqKm,Reject,"- It is not clear how to define the kernel, th...",0,,


In [4]:
df.shape

(763, 7)

In [5]:
gt_dict = {}

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    if not pid in gt_dict:
        gt_dict[pid] = {"dec": df.loc[i]["Dec"], "mcomp": set(), "not_mcomp": set()}
    if df.loc[i]["MComp"] == 1:
        gt_dict[pid]["mcomp"].add(df.loc[i]["UID"])
    else:
        gt_dict[pid]["not_mcomp"].add(df.loc[i]["UID"])

In [6]:
stats_dict = {"Accept": [0, 0], "Reject": [0, 0]}

for k, v in gt_dict.items():
    #print(len(v["mcomp"]), len(v["not_mcomp"]), v["dec"])
    stats_dict[v["dec"]][0] += len(v["mcomp"])
    stats_dict[v["dec"]][1] += len(v["not_mcomp"])
    
print(stats_dict)

{'Accept': [19, 340], 'Reject': [29, 375]}


In [7]:
test_set = list(gt_dict.keys())
print("TestSet length: %d\n"%len(test_set), test_set)

TestSet length: 16
 ['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl']


In [8]:
for k in test_set:
    print('{:20}{}'.format(k, gt_dict[k]["mcomp"]))

2019_SJf_XhCqKm     {39, 17, 20, 27, 28, 30}
2017_Bk0MRI5lg      {48, 57}
2020_SyevYxHtDB     {76, 87}
2018_rJBiunlAW      {108, 110, 112, 113, 124, 126}
2020_rkltE0VKwH     {160, 155, 184, 159}
2018_Hki-ZlbA-      {267, 235, 236, 271}
2019_BJx0sjC5FX     {292, 287}
2020_r1e_FpNFDr     {312, 322, 315, 308}
2020_B1lsXREYvr     {376, 401}
2018_SkZxCk-0Z      {449, 443, 445, 486}
2019_rJzoujRct7     {518, 519}
2018_HkfXMz-Ab      set()
2017_BJ9fZNqle      {627, 623, 615}
2019_SyxZJn05YX     {672, 673, 657, 669, 671}
2017_B1ckMDqlg      {714, 707}
2017_HJ0NvFzxl      set()


In [9]:
initial_pool_sentences = [ 
    "The method should be compared with other state-of-the-art k-shot learning methods (e.g., Matching Networks by Vinyals et al, 2016).",
    "It's not clear how this method compares against them."
    "Measure: Accuracy difference does not look like a good idea for comparing the baseline method and the proposed one.",
    "If the authors care to compare their approach to other 1-shot learning methods, then they would have to evaluate their approach with siamese and triplet learning networks.",
    "Also it is interesting that authors obtained meaningful results on several datasets beating state-of-the-arts based on very simple ideas.",
    "We should see the performance on other datasets (e.g., some of the other datasets in Wu et al (2018)).",
    "The authors present a convincing set of results over many translation tasks and compare with very competitive baselines.",
    "I would like to see an evaluation on (A) the original two datasets of Mikolov et al (without non-nouns), and (B) the larger datasets provided by Drozd et al [3] and Rogers et al [4].",
    "The comparison with between the representation learned by JMVAE and CVAE might be unfair given that the representation of CVAE is learned conditionally, on the label in the case of MNIST, and should therefore not consider the label in this representation.",
    "This in itself is not a bad thing, but since there is no comparison of different (simpler) RL agents on the tasks, it is difficult to determine if the tasks selected are challenging.",
    "It would be great to compare with standard NLP techniques such as Bag of Words followed by SVM."
]

initial_pool_vecs = []

In [10]:
sents_for_test = defaultdict(list)

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    sents_for_test[pid].append((df.loc[i]["UID"], df.loc[i]["Sent"]))

In [11]:
sents_for_test.keys()

dict_keys(['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl'])

# Part 2: Embeddings for Similarity Calculation 

In [45]:
def calculate_precision_at_k(predicted_sentids, k):
    
    local_precision = []
    for pid in predicted_sentids:
        pred = set(predicted_sentids[pid][0:k])
        gt = set(gt_dict[pid]["mcomp"])
        if len(gt) > 0:
            local_precision.append(round(len(pred.intersection(gt))/k, 3))
    #print("Local precision: ", local_precision)
    print("Precision@{}: {}".format(k, np.mean(local_precision)))
    return np.mean(local_precision)

In [46]:
def calculate_recall_at_k(predicted_sentids, k):
    
    local_recall = []
    for pid in predicted_sentids:
        pred = set(predicted_sentids[pid][0:k])
        gt = set(gt_dict[pid]["mcomp"])
        if len(gt) > 0:
            local_recall.append(round(len(pred.intersection(gt))/len(gt), 3))
    #print("Local recall: ", local_recall)
    print("Recall@{}: {}".format(k, np.mean(local_recall)))
    return np.mean(local_recall)

## A. SciBERT model

In [13]:
import spacy
import torch

In [14]:
from transformers import AutoTokenizer, AutoModel

In [15]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")

In [16]:
def embed_text_using_scibert(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states

In [17]:
for i in initial_pool_sentences:
    vec_emb = embed_text_using_scibert(i).mean(1).detach().numpy()
    vec_emb_normalized = vec_emb / norm(vec_emb)
    initial_pool_vecs.append(vec_emb_normalized)

In [18]:
sent_vectors = defaultdict(list)

for pid in sents_for_test:
    for sp in sents_for_test[pid]:
        vec = embed_text_using_scibert(sp[1]).mean(1).detach().numpy()
        sent_vectors[pid].append((sp[0], vec/norm(vec)))

In [19]:
sim_with_initial_pool = defaultdict(list)

for pid in sent_vectors:
    for sp in sent_vectors[pid]:
        local_sims = []
        for ipv in initial_pool_vecs:
            local_sims.append(np.inner(sp[1], ipv)[0][0])
        sim_with_initial_pool[pid].append((sp[0], max(local_sims)))

### Min and Max sim sent for each paper

In [20]:
for k in test_set:
    print(max(sim_with_initial_pool[k], key=lambda x: x[1]), min(sim_with_initial_pool[k], key=lambda x: x[1]))

(20, 0.8978162) (25, 0.7426945)
(56, 0.89206576) (46, 0.6183027)
(65, 0.8961332) (77, 0.5202615)
(108, 0.92335904) (105, 0.69457334)
(155, 0.9079475) (180, 0.7029835)
(269, 0.9079767) (252, 0.69661546)
(287, 0.8945467) (282, 0.66351205)
(312, 0.8921772) (325, 0.6555019)
(417, 0.9038178) (428, 0.65535474)
(450, 0.906968) (493, 0.6545675)
(531, 0.89420986) (522, 0.77269435)
(558, 0.8910845) (550, 0.75973654)
(623, 0.90252984) (597, 0.6404349)
(675, 0.8918824) (690, 0.70532465)
(719, 0.91007626) (713, 0.7906028)
(732, 0.9052617) (731, 0.7771135)


In [21]:
# Sort sim sentences

sorted_sim_with_initial_pool = defaultdict(list)

for k in sim_with_initial_pool:
    sorted_sims = sorted(sim_with_initial_pool[k], key=lambda x: x[1], reverse=True)
    for items in sorted_sims:
        sorted_sim_with_initial_pool[k].append(items[0])
    

In [26]:
with open("data/sent_sin_scibert.pkl", "wb") as f:
    pickle.dump(sorted_sim_with_initial_pool, f)

In [48]:
p_at_k = {}

for i in [1, 3, 5, 7, 10, 12, 14]:
    p_at_k[i] = calculate_precision_at_k(sorted_sim_with_initial_pool, i)

Precision@1: 0.42857142857142855
Precision@3: 0.2617857142857143
Precision@5: 0.19999999999999998
Precision@7: 0.16342857142857142
Precision@10: 0.14285714285714288
Precision@12: 0.1607857142857143
Precision@14: 0.1427142857142857


In [49]:
r_at_k = {}

for i in [1, 3, 5, 7, 10, 12, 14]:
    r_at_k[i] = calculate_recall_at_k(sorted_sim_with_initial_pool, i)

Recall@1: 0.11907142857142858
Recall@3: 0.22028571428571428
Recall@5: 0.2845714285714286
Recall@7: 0.33814285714285713
Recall@10: 0.40950000000000003
Recall@12: 0.6023571428571428
Recall@14: 0.6142857142857142


In [54]:
f_score_at_k = {}

for i in p_at_k:
    f_score_at_k[i] = round((2*p_at_k[i]*r_at_k[i])/(p_at_k[i]+r_at_k[i]), 2)

print(f_score_at_k)

{1: 0.19, 3: 0.24, 5: 0.23, 7: 0.22, 10: 0.21, 12: 0.25, 14: 0.23}


#### Inspect errors

In [58]:
for pid in sorted_sim_with_initial_pool:
    pred = set(sorted_sim_with_initial_pool[pid][0:5])
    gt = set(gt_dict[pid]["mcomp"])
    if len(gt) > 0:
        print("Pid: {} corr pred: {} out of {}".format(pid, len(pred.intersection(gt)), len(gt)))


Pid: 2019_SJf_XhCqKm corr pred: 1 out of 6
Pid: 2017_Bk0MRI5lg corr pred: 0 out of 2
Pid: 2020_SyevYxHtDB corr pred: 1 out of 2
Pid: 2018_rJBiunlAW corr pred: 1 out of 6
Pid: 2020_rkltE0VKwH corr pred: 1 out of 4
Pid: 2018_Hki-ZlbA- corr pred: 1 out of 4
Pid: 2019_BJx0sjC5FX corr pred: 1 out of 2
Pid: 2020_r1e_FpNFDr corr pred: 2 out of 4
Pid: 2020_B1lsXREYvr corr pred: 0 out of 2
Pid: 2018_SkZxCk-0Z corr pred: 1 out of 4
Pid: 2019_rJzoujRct7 corr pred: 0 out of 2
Pid: 2017_BJ9fZNqle corr pred: 3 out of 3
Pid: 2019_SyxZJn05YX corr pred: 2 out of 5
Pid: 2017_B1ckMDqlg corr pred: 0 out of 2


[719, 715, 700, 723, 720]

In [66]:
pid = "2017_B1ckMDqlg"
sent_ids = sorted_sim_with_initial_pool[pid][0:5]
for i in sent_ids:
    print(df.iloc[i]["Sent"], "\n")
    
print("========================================================================================")
for i in gt_dict[pid]["mcomp"]:
    print(df.iloc[i]["Sent"], "\n")

Experiments are performed on language modeling and machine translation tasks, showing significant gains by increasing the number of experts, compared to both SoA as well as explicitly computationally-matched baseline systems. 

Paper Weaknesses:
--- there are many different ways of increasing model capacity to enable the exploitation of very large datasets; it would be very nice to discuss the use of MoE and other alternatives in terms of computational efficiency and other factors. 

Experiments applying the proposed approach on RNNs in language modelling task show that it can beat SOTA results with significantly less computation, which is a result of selectively using much more parameters. 

Overall I think this is a well-described system that achieves good results, using a nifty placement for the MoE that can overcome what otherwise might be a disadvantage for sparse computation. 

An area that falls a bit short is in presenting plots or statistics on the real computational load and 

In [67]:
pid = "2019_SJf_XhCqKm"
sent_ids = sorted_sim_with_initial_pool[pid][0:5]
for i in sent_ids:
    print(df.iloc[i]["Sent"], "\n")
    
print("========================================================================================")
for i in gt_dict[pid]["mcomp"]:
    print(df.iloc[i]["Sent"], "\n")

The authors propose k-DPP as an open loop (oblivious to the evaluation of configurations) method for hyperparameter optimization and provide its empirical study and comparison with other methods such as grid search, uniform random search, low-discrepancy Sobol sequences, BO-TPE (Bayesian optimization using tree-structured Parzen estimator) by Bergstra et al (2011). 

The authors propose to use k-DPP to select a set of diverse parameters and use them to search for a good a hyperparameter setting. 

I think it would have more novelty if some theoretical analyses can be shown on the mixing rate and how good this optimization algorithm is. 

It is unclear to me if the comparison of wall clock time and accuracy holds for larger number of hyperparameters or against Spearmint with more parallelization. 

The first experiment by the authors shows that k-DPP-RBF gives better star discrepancy than uniform random search while being comparable to low-discrepancy Sobol sequences in other metrics su

## B. USE 

In [12]:
import tensorflow as tf
import tensorflow_hub as hub

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [16]:
embed_text_using_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [25]:
initial_pool_vecs = []

for isent in initial_pool_sentences:
    vec = embed_text_using_use([isent])
    initial_pool_vecs.append(vec/norm(vec))

In [28]:
len(initial_pool_vecs), type(initial_pool_vecs[0])

(10, tensorflow.python.framework.ops.EagerTensor)

In [29]:
sent_vectors = defaultdict(list)

for pid in sents_for_test:
    for sp in sents_for_test[pid]:
        vec = embed_text_using_use([sp[1]])
        sent_vectors[pid].append((sp[0], vec/norm(vec)))

In [33]:
sim_with_initial_pool = defaultdict(list)

for pid in sent_vectors:
    for sp in sent_vectors[pid]:
        local_sims = []
        for ipv in initial_pool_vecs:
            local_sims.append(np.inner(sp[1], ipv)[0][0])
        sim_with_initial_pool[pid].append((sp[0], max(local_sims)))

### Min and Max sim sent for each paper

In [34]:
for k in test_set:
    print(max(sim_with_initial_pool[k], key=lambda x: x[1]), min(sim_with_initial_pool[k], key=lambda x: x[1]))

(35, 0.5289302) (15, 0.03991701)
(51, 0.40261608) (47, 0.09980716)
(86, 0.46295142) (96, 0.07237849)
(108, 0.5347953) (105, 0.08769381)
(135, 0.5545818) (196, 0.018506698)
(228, 0.4474287) (248, 0.045198984)
(281, 0.4098944) (297, 0.12206017)
(322, 0.4922856) (359, 0.05927653)
(401, 0.4900506) (431, 0.025508128)
(479, 0.5199797) (494, 0.025145307)
(543, 0.39963928) (522, 0.103784904)
(576, 0.4109378) (563, 0.10770256)
(626, 0.49329722) (597, 0.045339487)
(669, 0.42909688) (694, 0.08125763)
(700, 0.41074893) (725, 0.07047744)
(739, 0.41832653) (734, 0.108119756)


In [35]:
# Sort sim sentences

sorted_sim_with_initial_pool = defaultdict(list)

for k in sim_with_initial_pool:
    sorted_sims = sorted(sim_with_initial_pool[k], key=lambda x: x[1], reverse=True)
    for items in sorted_sims:
        sorted_sim_with_initial_pool[k].append(items[0])
    

In [36]:
with open("data/sent_sin_use.pkl", "wb") as f:
    pickle.dump(sorted_sim_with_initial_pool, f)

In [54]:
p_at_k = {}

for i in [1, 3, 5, 7, 10, 12, 14]:
    p_at_k[i] = calculate_precision_at_k(sorted_sim_with_initial_pool, i)

Precision@1: 0.2857142857142857
Precision@3: 0.28550000000000003
Precision@5: 0.28571428571428575
Precision@7: 0.24507142857142852
Precision@10: 0.23571428571428577
Precision@12: 0.21428571428571433
Precision@14: 0.19885714285714287


In [55]:
r_at_k = {}

for i in [1, 3, 5, 7, 10, 12, 14]:
    r_at_k[i] = calculate_recall_at_k(sorted_sim_with_initial_pool, i)

Recall@1: 0.07978571428571428
Recall@3: 0.2523571428571429
Recall@5: 0.3916428571428572
Recall@7: 0.45357142857142857
Recall@10: 0.6976428571428571
Recall@12: 0.7475714285714287
Recall@14: 0.8095


In [60]:
f_score_at_k = {}

for i in p_at_k:
    f_score_at_k[i] = round((2*p_at_k[i]*r_at_k[i])/(p_at_k[i]+r_at_k[i]), 2)

print(f_score_at_k)

{1: 0.12, 3: 0.27, 5: 0.33, 7: 0.32, 10: 0.35, 12: 0.33, 14: 0.32}


In [None]:
nlp = spacy.load("en_core_sci_sm")