In [1]:
from glob import glob
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
import re
import nltk
from collections import defaultdict
from scipy import spatial

# Part 1: Dataset Preparation

In [2]:
data_path = "/home/shruti/Desktop/iitgn/courses/SEM2/ML/Project/code/PaperAcceptancePrediction/ICLR data/masterdata_unbalanced/"

years = [2017, 2018, 2019, 2020]
rev_dict = {}
paper_dict = {}
dec_dict = {}
iclr_arxiv_map = {}

for y in years:
    rev_dict[y] = pd.read_pickle(data_path + "off_rev_dict_{}.pkl".format(y))
    paper_dict[y] = pd.read_pickle(data_path + "papers_{}.pkl".format(y))
    dec_dict[y] = pd.read_pickle(data_path + "paper_decision_dict_{}.pkl".format(y))

iclr_arxiv_map = pd.read_pickle("./data/iclr_arxiv_map.pkl")

In [3]:
df = pd.read_excel("TestSet-Reviews16_Ann.xlsx")

In [4]:
df.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,0,2019_SJf_XhCqKm,Reject,The authors propose to use k-DPP to select a s...,0,,
1,1,2019_SJf_XhCqKm,Reject,"This paper covers the related work nicely, wit...",0,,
2,2,2019_SJf_XhCqKm,Reject,The rest of the paper are also clearly written.,0,,
3,3,2019_SJf_XhCqKm,Reject,"However, I have some concerns about the propos...",0,,
4,4,2019_SJf_XhCqKm,Reject,"- It is not clear how to define the kernel, th...",0,,


In [5]:
df.shape

(763, 7)

In [6]:
gt_dict = {}

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    if not pid in gt_dict:
        gt_dict[pid] = {"dec": df.loc[i]["Dec"], "mcomp": set(), "not_mcomp": set()}
    if df.loc[i]["MComp"] == 1:
        gt_dict[pid]["mcomp"].add(df.loc[i]["UID"])
    else:
        gt_dict[pid]["not_mcomp"].add(df.loc[i]["UID"])

In [7]:
stats_dict = {"Accept": [0, 0], "Reject": [0, 0]}

for k, v in gt_dict.items():
    #print(len(v["mcomp"]), len(v["not_mcomp"]), v["dec"])
    stats_dict[v["dec"]][0] += len(v["mcomp"])
    stats_dict[v["dec"]][1] += len(v["not_mcomp"])
    
print(stats_dict)

{'Accept': [19, 340], 'Reject': [29, 375]}


In [8]:
test_set = list(gt_dict.keys())
print("TestSet length: %d\n"%len(test_set), test_set)

TestSet length: 16
 ['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl']


In [9]:
for k in test_set:
    print('{:20}{}'.format(k, gt_dict[k]["mcomp"]))

2019_SJf_XhCqKm     {39, 17, 20, 27, 28, 30}
2017_Bk0MRI5lg      {48, 57}
2020_SyevYxHtDB     {76, 87}
2018_rJBiunlAW      {108, 110, 112, 113, 124, 126}
2020_rkltE0VKwH     {160, 155, 184, 159}
2018_Hki-ZlbA-      {267, 235, 236, 271}
2019_BJx0sjC5FX     {292, 287}
2020_r1e_FpNFDr     {312, 322, 315, 308}
2020_B1lsXREYvr     {376, 401}
2018_SkZxCk-0Z      {449, 443, 445, 486}
2019_rJzoujRct7     {518, 519}
2018_HkfXMz-Ab      set()
2017_BJ9fZNqle      {627, 623, 615}
2019_SyxZJn05YX     {672, 673, 657, 669, 671}
2017_B1ckMDqlg      {714, 707}
2017_HJ0NvFzxl      set()


In [10]:
initial_pool_sentences = [ 
    "The method should be compared with other state-of-the-art k-shot learning methods (e.g., Matching Networks by Vinyals et al, 2016).",
    "It's not clear how this method compares against them."
    "Measure: Accuracy difference does not look like a good idea for comparing the baseline method and the proposed one.",
    "If the authors care to compare their approach to other 1-shot learning methods, then they would have to evaluate their approach with siamese and triplet learning networks.",
    "Also it is interesting that authors obtained meaningful results on several datasets beating state-of-the-arts based on very simple ideas.",
    "We should see the performance on other datasets (e.g., some of the other datasets in Wu et al (2018)).",
    "The authors present a convincing set of results over many translation tasks and compare with very competitive baselines.",
    "I would like to see an evaluation on (A) the original two datasets of Mikolov et al (without non-nouns), and (B) the larger datasets provided by Drozd et al [3] and Rogers et al [4].",
    "The comparison with between the representation learned by JMVAE and CVAE might be unfair given that the representation of CVAE is learned conditionally, on the label in the case of MNIST, and should therefore not consider the label in this representation.",
    "This in itself is not a bad thing, but since there is no comparison of different (simpler) RL agents on the tasks, it is difficult to determine if the tasks selected are challenging.",
    "It would be great to compare with standard NLP techniques such as Bag of Words followed by SVM."
]

initial_pool_vecs = []

In [11]:
sents_for_test = defaultdict(list)

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    sents_for_test[pid].append((df.loc[i]["UID"], df.loc[i]["Sent"]))

In [12]:
sents_for_test.keys()

dict_keys(['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl'])

# Part 2: Embeddings for Similarity Calculation 

In [37]:
def calculate_precision_at_k(predicted_sentids, k):
    
    local_precision = []
    for pid in predicted_sentids:
        pred = set(predicted_sentids[pid][0:k])
        gt = set(gt_dict[pid]["mcomp"])
        if len(gt) > 0:
            local_precision.append(round(len(pred.intersection(gt))/k, 3))
    #print("Local precision: ", local_precision)
    print("Precision@{}: {}".format(k, np.mean(local_precision)))
    return

In [38]:
def calculate_recall_at_k(predicted_sentids, k):
    
    local_recall = []
    for pid in predicted_sentids:
        pred = set(predicted_sentids[pid][0:k])
        gt = set(gt_dict[pid]["mcomp"])
        if len(gt) > 0:
            local_recall.append(round(len(pred.intersection(gt))/len(gt), 3))
    #print("Local recall: ", local_recall)
    print("Recall@{}: {}".format(k, np.mean(local_recall)))
    return

## A. SciSpacy model

In [13]:
import spacy
import torch

In [14]:
from transformers import AutoTokenizer, AutoModel

In [15]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")

In [16]:
def embed_text_using_scibert(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states

In [17]:
for i in initial_pool_sentences:
    vec_emb = embed_text_using_scibert(i).mean(1).detach().numpy()
    vec_emb_normalized = vec_emb / norm(vec_emb)
    initial_pool_vecs.append(vec_emb_normalized)

In [18]:
sent_vectors = defaultdict(list)

for pid in sents_for_test:
    for sp in sents_for_test[pid]:
        vec = embed_text_using_scibert(sp[1]).mean(1).detach().numpy()
        sent_vectors[pid].append((sp[0], vec/norm(vec)))

In [19]:
sim_with_initial_pool = defaultdict(list)

for pid in sent_vectors:
    for sp in sent_vectors[pid]:
        local_sims = []
        for ipv in initial_pool_vecs:
            local_sims.append(np.inner(sp[1], ipv)[0][0])
        sim_with_initial_pool[pid].append((sp[0], max(local_sims)))

### Min and Max sim sent for each paper

In [20]:
for k in test_set:
    print(max(sim_with_initial_pool[k], key=lambda x: x[1]), min(sim_with_initial_pool[k], key=lambda x: x[1]))

(20, 0.8978162) (25, 0.7426945)
(56, 0.89206576) (46, 0.6183027)
(65, 0.8961332) (77, 0.5202615)
(108, 0.92335904) (105, 0.69457334)
(155, 0.9079475) (180, 0.7029835)
(269, 0.9079767) (252, 0.69661546)
(287, 0.8945467) (282, 0.66351205)
(312, 0.8921772) (325, 0.6555019)
(417, 0.9038178) (428, 0.65535474)
(450, 0.906968) (493, 0.6545675)
(531, 0.89420986) (522, 0.77269435)
(558, 0.8910845) (550, 0.75973654)
(623, 0.90252984) (597, 0.6404349)
(675, 0.8918824) (690, 0.70532465)
(719, 0.91007626) (713, 0.7906028)
(732, 0.9052617) (731, 0.7771135)


In [21]:
# Sort sim sentences

sorted_sim_with_initial_pool = defaultdict(list)

for k in sim_with_initial_pool:
    sorted_sims = sorted(sim_with_initial_pool[k], key=lambda x: x[1], reverse=True)
    for items in sorted_sims:
        sorted_sim_with_initial_pool[k].append(items[0])
    

In [26]:
with open("data/sent_sin_scibert.pkl", "wb") as f:
    pickle.dump(sorted_sim_with_initial_pool, f)

In [43]:
for i in [1, 3, 5, 7, 10, 12, 14]:
    calculate_precision_at_k(sorted_sim_with_initial_pool, i)

Precision@1: 0.42857142857142855
Precision@3: 0.2617857142857143
Precision@5: 0.19999999999999998
Precision@7: 0.16342857142857142
Precision@10: 0.14285714285714288
Precision@12: 0.1607857142857143
Precision@14: 0.1427142857142857


In [44]:
for i in [1, 3, 5, 7, 10, 12, 14]:
    calculate_recall_at_k(sorted_sim_with_initial_pool, i)

Recall@1: 0.11907142857142858
Recall@3: 0.22028571428571428
Recall@5: 0.2845714285714286
Recall@7: 0.33814285714285713
Recall@10: 0.40950000000000003
Recall@12: 0.6023571428571428
Recall@14: 0.6142857142857142


In [None]:
nlp = spacy.load("en_core_sci_sm")