In [57]:
from glob import glob
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
import re
import nltk
from collections import defaultdict, Counter
from scipy import spatial

# Part 1: Dataset Preparation

In [2]:
data_path = "/home/shruti/Desktop/iitgn/courses/SEM2/ML/Project/code/PaperAcceptancePrediction/ICLR data/masterdata_unbalanced/"

years = [2017, 2018, 2019, 2020]
rev_dict = {}
paper_dict = {}
dec_dict = {}
iclr_arxiv_map = {}

for y in years:
    rev_dict[y] = pd.read_pickle(data_path + "off_rev_dict_{}.pkl".format(y))
    paper_dict[y] = pd.read_pickle(data_path + "papers_{}.pkl".format(y))
    dec_dict[y] = pd.read_pickle(data_path + "paper_decision_dict_{}.pkl".format(y))

iclr_arxiv_map = pd.read_pickle("./data/iclr_arxiv_map.pkl")

In [3]:
df = pd.read_excel("InputTestSet-Reviews48_Ann.xlsx")

In [4]:
df.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,0,2019_SJf_XhCqKm,Reject,The authors propose to use k-DPP to select a s...,0,,
1,1,2019_SJf_XhCqKm,Reject,"This paper covers the related work nicely, wit...",0,,
2,2,2019_SJf_XhCqKm,Reject,The rest of the paper are also clearly written.,0,,
3,3,2019_SJf_XhCqKm,Reject,"However, I have some concerns about the propos...",0,,
4,4,2019_SJf_XhCqKm,Reject,"- It is not clear how to define the kernel, th...",0,,


In [5]:
df.shape

(1505, 7)

In [6]:
gt_dict = {}

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    if not pid in gt_dict:
        gt_dict[pid] = {"dec": df.loc[i]["Dec"], "mcomp": set(), "not_mcomp": set()}
    if df.loc[i]["MComp"] == 1:
        gt_dict[pid]["mcomp"].add(df.loc[i]["UID"])
    else:
        gt_dict[pid]["not_mcomp"].add(df.loc[i]["UID"])

In [7]:
stats_dict = {"Accept": [0, 0], "Reject": [0, 0]}

for k, v in gt_dict.items():
    #print(len(v["mcomp"]), len(v["not_mcomp"]), v["dec"])
    stats_dict[v["dec"]][0] += len(v["mcomp"])
    stats_dict[v["dec"]][1] += len(v["not_mcomp"])
    
print(stats_dict)

{'Accept': [48, 644], 'Reject': [69, 744]}


In [8]:
test_set = list(gt_dict.keys())
print("TestSet length: %d\n"%len(test_set), test_set)

TestSet length: 32
 ['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl', '2017_S1_pAu9xl', '2018_SyYYPdg0-', '2017_BJAA4wKxg', '2019_HyVxPsC9tm', '2019_HylTBhA5tQ', '2019_B1l08oAct7', '2018_H135uzZ0-', '2017_H1oyRlYgg', '2017_r1y1aawlg', '2020_r1eX1yrKwB', '2020_Byg79h4tvB', '2019_H1lFZnR5YX', '2020_BkeWw6VFwr', '2018_HyHmGyZCZ', '2018_HyUNwulC-', '2020_HkgsPhNYPS']


In [9]:
for k in test_set:
    print('{:20}{}'.format(k, gt_dict[k]["mcomp"]))

2019_SJf_XhCqKm     {39, 17, 20, 27, 28, 30}
2017_Bk0MRI5lg      {48, 57}
2020_SyevYxHtDB     {76, 87}
2018_rJBiunlAW      {108, 110, 112, 113, 124, 126}
2020_rkltE0VKwH     {160, 155, 184, 159}
2018_Hki-ZlbA-      {267, 235, 236, 271}
2019_BJx0sjC5FX     {292, 287}
2020_r1e_FpNFDr     {312, 322, 315, 308}
2020_B1lsXREYvr     {376, 401}
2018_SkZxCk-0Z      {449, 443, 445, 486}
2019_rJzoujRct7     {518, 519}
2018_HkfXMz-Ab      {573, 566}
2017_BJ9fZNqle      {627, 623, 615}
2019_SyxZJn05YX     {672, 673, 657, 669, 671}
2017_B1ckMDqlg      {714, 707}
2017_HJ0NvFzxl      {739}
2017_S1_pAu9xl      {792, 809, 810, 806}
2018_SyYYPdg0-      {834, 867, 868, 869, 870, 872, 873, 844, 830}
2017_BJAA4wKxg      {884}
2019_HyVxPsC9tm     {931, 933, 905, 909, 912, 913, 919, 926}
2019_HylTBhA5tQ     {972, 950}
2019_B1l08oAct7     {994, 996, 1064, 1004, 1007, 1044, 1047, 1048, 1055}
2018_H135uzZ0-      {1072, 1079}
2017_H1oyRlYgg      set()
2017_r1y1aawlg      {1125, 1162, 1100, 1102, 1168}
2020_r1eX1y

## Inspect SciBERT coverage

In [10]:
import spacy
import torch

In [11]:
from transformers import AutoTokenizer, AutoModel

## 1. Cased

In [12]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")

In [36]:
cased_word_dict = defaultdict(int)

for s in df["Sent"]:
    try:
        sent_text = nltk.sent_tokenize(s)
        for sent in sent_text:
            words = nltk.word_tokenize(sent)
            for w in words:
                cased_word_dict[w] += 1
    except Exception as ex:
        print("Err for: ", s)

Err for:  2
Err for:  3
Err for:  2


In [45]:
df["Sent"].shape

(1505,)

In [40]:
len(cased_word_dict), sorted(cased_word_dict.items(), key=lambda x: x[1], reverse=True)[0:20]

(4373,
 [('the', 1939),
  ('.', 1291),
  (',', 1178),
  ('of', 813),
  ('is', 750),
  ('to', 726),
  ('a', 613),
  ('and', 594),
  ('in', 488),
  (')', 442),
  ('that', 375),
  ('The', 368),
  ('(', 326),
  ('for', 317),
  ('paper', 300),
  ('on', 288),
  ('be', 263),
  ('are', 254),
  ('this', 246),
  ('I', 245)])

In [44]:
cased_word_dict["et"], cased_word_dict["al"]

(57, 56)

In [48]:
tokenizer.convert_tokens_to_ids(["et", "al"])

[386, 197]

In [108]:
tokenizer.convert_ids_to_tokens(100)

'[UNK]'

In [66]:
print("Unique words in the setences(cased): {}.\nVocab size of SciBERT: {}".format(len(cased_word_dict), tokenizer.vocab_size))

Unique words in the setences(cased): 4374.
Vocab size of SciBERT: 31116


Check coverage

In [73]:
unique_tokens = list(cased_word_dict.keys()) 
token_ids = tokenizer.convert_tokens_to_ids(unique_tokens)

In [74]:
token_ids.count(100)

1340

In [75]:
ctr = Counter(token_ids)
ctr.most_common(10)

[(100, 1340),
 (186, 1),
 (2227, 1),
 (4633, 1),
 (146, 1),
 (649, 1),
 (6155, 1),
 (105, 1),
 (634, 1),
 (125, 1)]

In [79]:
unk_tokens = []
for idx, it in enumerate(token_ids):
    if it == 100:
        unk_tokens.append((unique_tokens[idx], cased_word_dict[unique_tokens[idx]]))

In [82]:
len(unk_tokens), unk_tokens[0:5]

(1340,
 [('k-DPP', 4),
  ('hyperparameter', 7),
  ('nicely', 2),
  ('hyperparameters', 6),
  ('``', 106)])

In [87]:
sorted(unk_tokens, key=lambda x: x[1], reverse=True)

[('``', 106),
 ("''", 99),
 ("'s", 34),
 ('adversarial', 31),
 ("n't", 26),
 ('RNNs', 24),
 ('entailment', 16),
 ('baselines', 15),
 ('CNNs', 15),
 ('e.g.', 14),
 ('RNN', 14),
 ('embeddings', 14),
 ('state-of-the-art', 12),
 ('Minor', 11),
 ('dDVI', 11),
 ('UDA', 11),
 ('top-k', 11),
 ('MNIST', 10),
 ('SOTA', 9),
 ('LSTM', 9),
 ('ICLR', 9),
 ('convinced', 8),
 ('well-written', 8),
 ('autoregressive', 8),
 ('activations', 8),
 ('TPDN', 8),
 ('NAS', 8),
 ('confusing', 8),
 ('semi-supervised', 8),
 ('hyperparameter', 7),
 ('rebuttal', 7),
 ('size-free', 7),
 ('\\beta', 7),
 ('one-shot', 7),
 ('2019', 7),
 ('multimodality', 7),
 ('blind-spot', 7),
 ('\\alpha', 7),
 ('BNN', 7),
 ('MCVI', 7),
 ('top-1', 7),
 ('hyperparameters', 6),
 ("'m", 6),
 ('ImageNet', 6),
 ('multi-agent', 6),
 ('misclassified', 6),
 ('amateur', 6),
 ('MoE', 6),
 ('Strengths', 6),
 ('DMPN', 6),
 ('GloVe', 6),
 ('ICML', 5),
 ('stealing', 5),
 ('SRU', 5),
 ('gated', 5),
 ('high-level', 5),
 ('L1', 5),
 ('tighter', 5),
 ('

In [92]:
print("Unique words in the dataset(cased): {}.\nVocab size of SciBERT: {}".format(len(cased_word_dict), tokenizer.vocab_size))
print("{} tokens out of {} unique tokens are not present.\n".format(token_ids.count(100), len(cased_word_dict)))

print("Total words in the dataset: {}".format(sum(cased_word_dict.values())))
print("Absent word count in the dataset: {}".format(sum(x[1] for x in unk_tokens)))

Unique words in the dataset(cased): 4374.
Vocab size of SciBERT: 31116
1340 tokens out of 4374 unique tokens are not present.

Total words in the dataset: 34868
Absent word count in the dataset: 2484


## 2. Uncased

In [93]:
tokenizer_uncased = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

model_uncased = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [119]:
uncased_word_dict = defaultdict(int)

for s in df["Sent"]:
    try:
        sent_text = nltk.sent_tokenize(s)
        for sent in sent_text:
            words = nltk.word_tokenize(sent)
            for w in words:
                w = w.lower()
                uncased_word_dict[w] += 1
    except Exception as ex:
        print("Err for: ", s)

Err for:  2
Err for:  3
Err for:  2


In [120]:
df["Sent"].shape

(1505,)

In [121]:
len(uncased_word_dict), sorted(uncased_word_dict.items(), key=lambda x: x[1], reverse=True)[0:20]

(3937,
 [('the', 2310),
  ('.', 1291),
  (',', 1178),
  ('of', 815),
  ('is', 765),
  ('to', 737),
  ('a', 637),
  ('and', 597),
  ('in', 568),
  (')', 442),
  ('that', 383),
  ('this', 357),
  ('for', 340),
  ('(', 326),
  ('paper', 309),
  ('it', 295),
  ('on', 294),
  ('be', 263),
  ('are', 262),
  ('i', 255)])

In [122]:
uncased_word_dict["et"], uncased_word_dict["al"]

(57, 56)

In [123]:
tokenizer_uncased.convert_tokens_to_ids(["et", "al"])

[365, 186]

In [124]:
tokenizer_uncased.special_tokens_map, tokenizer_uncased.convert_tokens_to_ids("UNK")

({'unk_token': '[UNK]',
  'sep_token': '[SEP]',
  'pad_token': '[PAD]',
  'cls_token': '[CLS]',
  'mask_token': '[MASK]'},
 101)

In [125]:
tokenizer_uncased.convert_ids_to_tokens(101)

'[UNK]'

In [126]:
print("Unique words in the setences(cased): {}.\nVocab size of SciBERT: {}".format(len(uncased_word_dict), tokenizer_uncased.vocab_size))

Unique words in the setences(cased): 3937.
Vocab size of SciBERT: 31090


Check coverage

In [127]:
unique_tokens = list(uncased_word_dict.keys()) 
token_ids = tokenizer_uncased.convert_tokens_to_ids(unique_tokens)

In [128]:
"RNN" in uncased_word_dict
# del uncased_word_dict["RNN"]

False

In [129]:
token_ids.count(101)

1086

In [130]:
ctr = Counter(token_ids)
ctr.most_common(10)

[(101, 1086),
 (111, 1),
 (1991, 1),
 (4459, 1),
 (147, 1),
 (626, 1),
 (5470, 1),
 (106, 1),
 (610, 1),
 (131, 1)]

In [134]:
unk_tokens = []
for idx, it in enumerate(token_ids):
    if it == 101:
        unk_tokens.append((unique_tokens[idx], uncased_word_dict[unique_tokens[idx]]))

In [135]:
len(unk_tokens), unk_tokens[0:5]

(1086,
 [('k-dpp', 4),
  ('hyperparameter', 9),
  ('nicely', 2),
  ('hyperparameters', 6),
  ('``', 106)])

In [141]:
sorted(unk_tokens, key=lambda x: x[1], reverse=True)[0:20]

[('``', 106),
 ("'s", 34),
 ('adversarial', 33),
 ("n't", 26),
 ('rnns', 24),
 ('entailment', 16),
 ('baselines', 15),
 ('cnns', 15),
 ('e.g.', 14),
 ('rnn', 14),
 ('state-of-the-art', 13),
 ('one-shot', 13),
 ('ddvi', 11),
 ('uda', 11),
 ('top-k', 11),
 ('mnist', 10),
 ('sota', 10),
 ('hyperparameter', 9),
 ('lstm', 9),
 ('iclr', 9)]

In [139]:
print("Uncased")
print("Unique words in the dataset: {}.\nVocab size of SciBERT: {}".format(len(uncased_word_dict), tokenizer_uncased.vocab_size))
print("{} tokens out of {} unique tokens are not present.\n".format(token_ids.count(101), len(uncased_word_dict)))

print("Total words in the dataset: {}".format(sum(uncased_word_dict.values())))
print("Absent word count in the dataset: {}".format(sum(x[1] for x in unk_tokens)))

Uncased
Unique words in the dataset: 3937.
Vocab size of SciBERT: 31090
1086 tokens out of 3937 unique tokens are not present.

Total words in the dataset: 34868
Absent word count in the dataset: 2031


## 3. Rest analysis

In [78]:
def embed_text_using_scibert(text, verbose=0):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    
    if verbose:
        print("Input text: {}".format(text))
        print("Input ids len: ", len(input_ids[0]))
        print("Input ids: ", input_ids)
        
        np_inp = input_ids[0].detach().numpy()
        for i in range(np_inp.shape[0]):
            print(tokenizer.convert_ids_to_tokens([np_inp[i]]), end=", ")
        
    return last_hidden_states

In [79]:
embed_text_using_scibert("The method should be compared with other state-of-the-art k-shot learning methods (e.g., Matching Networks by Vinyals et al, 2016).", 1)

Input text: The method should be compared with other state-of-the-art k-shot learning methods (e.g., Matching Networks by Vinyals et al, 2016).
Input ids len:  39
Input ids:  tensor([[  101,   111,   626,  1053,   203,  1073,   188,   521,  1199,   578,
           125,   578,   111,   578, 10126,   324,   578, 19859,  2111,  1372,
           143,   142,   211,   175,   211,   430,  5189,  2732,   224, 23208,
         19272, 30111,   386,   197,   430,  5582,   551,   211,   102]])
['[CLS]'], ['the'], ['method'], ['should'], ['be'], ['compared'], ['with'], ['other'], ['state'], ['-'], ['of'], ['-'], ['the'], ['-'], ['art'], ['k'], ['-'], ['shot'], ['learning'], ['methods'], ['('], ['e'], ['.'], ['g'], ['.'], [','], ['matching'], ['networks'], ['by'], ['vin'], ['##yal'], ['##s'], ['et'], ['al'], [','], ['2016'], [')'], ['.'], ['[SEP]'], 

tensor([[[ 0.6396,  0.1981, -0.3156,  ..., -0.2083,  0.4822,  0.7386],
         [ 0.9699,  0.9695, -0.0087,  ...,  0.2130,  0.5376,  0.5488],
         [ 0.0733,  0.4639,  0.2113,  ...,  0.3894,  0.7572,  1.6601],
         ...,
         [-1.1417,  0.9311,  0.1918,  ...,  0.5772, -0.0610,  1.0074],
         [ 0.3090,  0.1241, -0.4208,  ...,  0.4226,  0.3077,  0.2547],
         [ 0.6555,  0.8356, -0.2718,  ..., -0.2735,  0.4125,  0.8153]]],
       grad_fn=<NativeLayerNormBackward>)

In [32]:
tokenizer.all_special_ids, tokenizer.all_special_tokens

([0, 101, 100, 102, 103], ['[PAD]', '[CLS]', '[UNK]', '[SEP]', '[MASK]'])

In [34]:
print(tokenizer.convert_ids_to_tokens(111), end=", ")

the, 

In [23]:
toks = "The method should be compared with other state-of-the-art k-shot learning methods \
(e.g., Matching Networks by Vinyals et al, 2016).".split(" ")

print(len(toks), toks)

19 ['The', 'method', 'should', 'be', 'compared', 'with', 'other', 'state-of-the-art', 'k-shot', 'learning', 'methods', '(e.g.,', 'Matching', 'Networks', 'by', 'Vinyals', 'et', 'al,', '2016).']


## B. Inspect USE coverage

In [12]:
import tensorflow as tf
import tensorflow_hub as hub

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [16]:
embed_text_using_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

USE generates sentence embeddings directly. Uses PTB tokenizer in the process