In [1]:
import spacy
import numpy as np
from tqdm import tqdm
from collections import Counter
from transformers import pipeline

In [2]:
quick = np.load('../data/additional_info_dict.npy', allow_pickle=True)

In [3]:
quick.item().get('d1ca07561b24afe8b1bd18dd1c239dbbbd221964')

{'title': 'Scim: Intelligent Faceted Highlights for Interactive, Multi-Pass Skimming of Scientific Papers',
 'abstract': 'Researchers are expected to keep up with an immense literature, yet often find it prohibitively time-consuming to do so. This paper ex-plores how intelligent agents can help scaffold in-situ information seeking across scientific papers. Specifically, we present Scim, an AI-augmented reading interface designed to help researchers skim papers by automatically identifying, classifying, and highlighting salient sentences, organized into rhetorical facets rooted in common information needs. Using Scim as a design probe, we explore the benefits and drawbacks of imperfect AI assistance within an augmented reading interface. We found researchers used Scim in several different ways: from reading primarily in the ‘highlight browser’ (side panel) to making multiple passes through the paper with different facets activated (e.g., focusing solely on objective and novelty in their

In [4]:
keys = list(quick.item().keys())

In [5]:
texts = []
for key in keys:
    d = quick.item()[key]
    text = ''
    if d['title'] is not None:
        text += d['title'] + '\n'
    if d['abstract'] is not None:
        text += d['abstract'] + '\n'
    if d['tldr'] is not None:
        text += d['tldr']['text']
    texts.append(text)

In [6]:
# spacy.require_gpu()

In [7]:
nlp = spacy.load("en_core_web_lg")

In [8]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)

In [9]:
data_types = ['text', 'video', 'audio', 'speech', 'image']

In [10]:
languages = [
    'English', 'Chinese', 'Spanish', 'Hindi', 'Bengali', 'Portuguese', 'Russian', 
    'Japanese', 'Vietnamese', 'German', 'French', 'Turkish', 'Korean', 'Italian',
    'Polish', 'Dutch', 'Indonesian', 'Thai', 'Danish', 'Czech', 'Finnish', 'Greek',
    'Swedish', 'Hungarian', 'Latvian', 'Lithuanian', 'Estonian', 'Arabic', 'Multilingual'
]

In [11]:
act_functions = [
    'relu', 'silu', 'gelu', 'sigmoid', 'tanh', 'elu', 'softmax',
]

In [12]:
architectures = [
    {'CNN': 'cnn'}, 
    {'DNN': ['dnn', 'ann']}, # should be treated together
    {'RNN' : ['rnn']}, 
    {'LSTM' : ['lstm']}, 
    {'GRU' : ['gru']}, 
    {'GAN' : ['gan']}, 
    {'VAE' : ['vae']}, 
    {'seq2seq' : ['seq2seq']}, 
    {'BERT': ['bert']}, 
    {'Transformer' : ['transformer']},
    {'GPT' : ['gpt']}, 
    {'GPT-2': ['gpt2', 'gpt-2']}, # should be treated together
    {'GPT-3': ['gpt3', 'gpt-3']}, # should be treated together
    {'AE': ['ae', 'autoencoder']}, # should be treated together
    {'ResNet': ['resnet']},
    {'attention': ['attention']},
    {'NER': ['ner']},
    {'ViT': ['vit']}
]

In [13]:
for key in tqdm(keys):
    d = quick.item()[key]
    text = ''
    if d['title'] is not None:
        text += d['title'] + '\n'
    if d['abstract'] is not None:
        text += d['abstract'] + '\n'
    if d['tldr'] is not None:
        text += d['tldr']['text']
        
    quick.item()[key]['language'] = []
    quick.item()[key]['act_function'] = []
    quick.item()[key]['architecture'] = []
    quick.item()[key]['data_type'] = []
    doc = nlp(text)
    
    for token in doc:
        if token.text in languages:
            quick.item()[key]['language'].append(token.text)
        if token.text.lower() in act_functions:
            quick.item()[key]['act_function'].append(token.text)
        for arch in architectures:
            arch_key = list(arch.keys())[0]
            if token.text.lower() in arch[arch_key]:
                quick.item()[key]['architecture'].append(arch_key)

100%|██████████| 24811/24811 [10:07<00:00, 40.84it/s]


In [14]:
data_type_preds = classifier(texts, data_types)

In [25]:
data_types_preds_words = list(map(lambda x: x['labels'][0], data_type_preds))

In [27]:
for key, dt in tqdm(zip(keys, data_types_preds_words)):
    quick.item()[key]['data_type'] = [dt]

24811it [00:00, 925679.39it/s]


In [20]:
for key in tqdm(keys):
    for name in ['language', 'act_function', 'architecture']:
        quick.item()[key][name] = list(set(quick.item()[key][name]))

100%|██████████| 24811/24811 [00:00<00:00, 213189.59it/s]


In [71]:
np.save('../data/full_data_dict.npy', quick)

In [32]:
np.save('../data/lists/data_types.npy', np.array(data_types))

In [33]:
np.save('../data/lists/act_functions', np.array(act_functions))

In [34]:
np.save('../data/lists/languages.npy', np.array(languages))

In [35]:
arch_list = [list(arch.keys())[0] for arch in architectures]

In [41]:
categories = []

In [62]:
years = []

In [114]:
n = 0
for key in keys:
    if len(quick.item()[key]['act_function']) != 0:
        print(quick.item()[key])
        print()
        n += 1
        if n == 3:
            break

{'title': 'Adversarial vulnerability of powerful near out-of-distribution detection', 'abstract': 'There has been a significant progress in detecting out-of-distribution (OOD) inputs in neural networks recently, primarily due to the use of large models pretrained on large datasets, and an emerging use of multi-modality. We show a severe adversarial vulnerability of even the strongest current OOD detection techniques. With a small, targeted perturbation to the input pixels, we can change the image assignment from an in-distribution to an out-distribution, and vice versa, easily. In particular, we demonstrate severe adversarial vulnerability on the challenging near OOD CIFAR-100 vs CIFAR-10 task, as well as on the far OOD CIFAR-100 vs SVHN. We study the adversarial robustness of several post-processing techniques, including the simple baseline of Maximum of Softmax Probabilities (MSP), the Mahalanobis distance, and the newly proposed Relative Mahalanobis distance. By comparing the loss o

In [69]:
for key in keys:
    if quick.item()[key]['authors'] is not None:
        quick.item()[key]['authors_string'] = ', '.join(list(map(lambda x: x['name'], quick.item()[key]['authors'])))

In [None]:
q

In [70]:
quick.item()[keys[0]]

{'title': 'Scim: Intelligent Faceted Highlights for Interactive, Multi-Pass Skimming of Scientific Papers',
 'abstract': 'Researchers are expected to keep up with an immense literature, yet often find it prohibitively time-consuming to do so. This paper ex-plores how intelligent agents can help scaffold in-situ information seeking across scientific papers. Specifically, we present Scim, an AI-augmented reading interface designed to help researchers skim papers by automatically identifying, classifying, and highlighting salient sentences, organized into rhetorical facets rooted in common information needs. Using Scim as a design probe, we explore the benefits and drawbacks of imperfect AI assistance within an augmented reading interface. We found researchers used Scim in several different ways: from reading primarily in the ‘highlight browser’ (side panel) to making multiple passes through the paper with different facets activated (e.g., focusing solely on objective and novelty in their

In [64]:
for key in keys:
    res = quick.item()[key]['year']
    if res is not None:
        years.append(res)

In [66]:
np.save('../data/lists/years.npy', np.array(years))

In [65]:
set(years)

{1967, 1990, 2015, 2016, 2018, 2019, 2020, 2021, 2022}

In [54]:
topics = [
    'Art',
    'Biology',
    'Business',
    'Chemistry',
    'Computer Science',
    'Economics',
    'Engineering',
    'Environmental Science',
    'Geography',
    'Geology',
    'History',
    'Materials Science',
    'Mathematics',
    'Medicine',
    'Philosophy',
    'Physics',
    'Political Science',
    'Psychology',
    'Sociology'
]

In [55]:
np.save('../data/lists/topics.npy', np.array(topics))

In [72]:
str(None)

'None'

In [75]:
cit_rel = np.load('../data/citation_relations.npy', allow_pickle=True)

In [77]:
ddd = {}
for elem in cit_rel:
    key = list(elem.keys())[0]
    ddd[key] = elem[key]

In [79]:
np.save('../data/cit_rel_dict.npy', np.array(ddd))

In [83]:
uniq = np.load('../data/lists/unique_ids_list.npy', allow_pickle=True)

In [85]:
act_functions

['relu', 'silu', 'gelu', 'sigmoid', 'tanh', 'elu', 'softmax']

In [115]:
ddt = np.load('../data/full_data_dict.npy', allow_pickle=True)

In [121]:
np.save('../data.unique_ids_list.npy', np.array(list(ddt.item().keys())))