In [10]:
import json
from pprint import pprint
import re
from collections import defaultdict
import pickle

### Read the year wise paper json files

In [11]:
def read_papers_json(year):
    paper_data = defaultdict(list)
    
    if type(year) == int:
        year = [year]
    
    for y in year:
        file_path = "./science-parse/output/{}/{}_ICLR".format(y, y)
        with open(file_path, "r") as f:
            for line in f:
                paper_data[y].append(json.loads(line))
            
    return paper_data

#### Run only for 1 year

In [87]:
paper_data = read_papers_json(2017)

In [88]:
pprint(paper_data[2017][-1])

{'metadata': {'abstractText': 'Recent work has begun exploring neural acoustic '
                              'word embeddings—fixeddimensional vector '
                              'representations of arbitrary-length speech '
                              'segments corresponding to words. Such '
                              'embeddings are applicable to speech retrieval '
                              'and recognition tasks, where reasoning about '
                              'whole words may make it possible to avoid '
                              'ambiguous sub-word representations. The main '
                              'idea is to map acoustic sequences to '
                              'fixed-dimensional vectors such that examples of '
                              'the same word are mapped to similar vectors, '
                              'while different-word examples are mapped to '
                              'very different vectors. In this work we take a '
   

In [5]:
print(paper_data[0].keys())
print(paper_data[0]['metadata'].keys())

dict_keys(['metadata', 'name'])
dict_keys(['year', 'source', 'referenceMentions', 'abstractText', 'references', 'creator', 'title', 'authors', 'emails', 'sections'])


#### Run for all years

In [14]:
paper_data = read_papers_json([2017, 2018, 2019, 2020])

In [19]:
type(paper_data)
print(paper_data.keys())

print("\nNumber of papers each year:")
for y in [2017, 2018, 2019, 2020]:
    print(len(paper_data[y]))

dict_keys([2017, 2018, 2019, 2020])

Number of papers each year:
490
909
1418
2212


### Construct features

In [20]:
def paper_text_features(sections_list):
    full_text = ""
    text_features = {"contains_appendix": 0, "table_ref_count": 0, "fig_ref_count": 0, "eqn_ref_count": 0}
    
    for section in sections_list:
        if "heading" in section and section["heading"]:
            full_text += " " + section["heading"].lower()
        if "text" in section and section["text"]:
            full_text += " " + section["text"].lower()
    
    # Appendix
    if full_text.find("appendix") > -1:
        text_features["contains_appendix"] = 1
    
    # Table
    p = re.compile('table [0-9][0-9]?|tables [0-9][0-9]?')
    table_refs = p.findall(full_text)
    if table_refs:
#         table_refs_count = len(table_refs)
        text_features["table_ref_count"] = len(table_refs)
    
    # Figure
    fig_refs = re.findall(r'fig[s]?\. [0-9][0-9]*|figure [0-9][0-9]*|figures [0-9][0-9]*', full_text)
    if fig_refs:
        text_features["fig_ref_count"] = len(fig_refs)
    
    # Equations
    eqn_refs = re.findall(r'eq[s]?\.[ \(\[ ][A-Z0-9]|equation[s]?[ \:\.\(\[\{]?[0-9]?', full_text)
    if eqn_refs:
        text_features["eqn_ref_count"] = len(eqn_refs)
    
    return text_features

In [38]:
# SCRATCH FOR REGEX
p = re.compile('table [0-9][0-9]?')
ft = "As a matter of fact, sk plays a similarly table 0 role as the attention score in various attention models such\nas Vinyals et al. (2015). The impact of proceeding elements to the current output can be adjusted (either increase or decrease) by sk. The memory capability of ELSTM-II can be proven in a similarly fashion, so even ELSTM-II does not have forget gate, it is capable in attending to or forgetting a particular position of a sequence as ELSTM-I through the scaling factor.\nThe major difference between the ELSTM-I and the ELSTM-II is that fewer parameters are used in the ELSTM-II than those in the ELSTM-I. The numbers of parameters used by different RNN cells are compared in Table 1, where Xt ∈ RM , ht ∈ RN and t = 1, · · · , T .\nAlthough the number of parameters of ELSTM depends on the maximum length of a sequence in practice, the memory overhead required is limited. ELSTM-II requires less number of parameters than LSTM for typical lengthed sequence. From Table. 1, to double the number of parameters as compare to an ordinary LSTM, the length of a sentence needs to be 4 times the size of the word embedding size and number of cells put together. That is, in the case of Sutskever et al. (2014) with 1000 word embedding and 1000 cells, the sentence length needs to be 4 × (1000 + 1000) = 8000! In practice, most NLP problems whose input involves sentences, the length will be typically less than 100. In our experiment, sequence to sequence with attention (Vinyals et al., 2015) for maximum sentence length 100 (other model settings please refer to Table 2), ELSTM-I parameters uses 75M of memory, ELSTM-II uses 69.1M, LSTM uses 71.5M"
pt = p.findall(ft.lower())
print(pt)


s = "d and accelerated. Our method is outlined in Figure 1. The main idea is to RNN model is shown in Fig. 3. It consists of a lower and an upper BRNN branches. At each time step"
print(re.findall(r'fig\. [0-9][0-9]*|figs\. [0-9][0-9]*|figure [0-9][0-9]*|figures [0-9][0-9]*', s.lower()))
if re.findall(r'fig\. [0-9][0-9]*|figs\. [0-9][0-9]*|figure [0-9][0-9]*|figures [0-9][0-9]*', s):
    print("farw")
else:
    print("yheh")

['table 0', 'table 1', 'table 2']


In [21]:
def construct_features(paper_data):
    features = {}
    
    metadata_nf = 0
    sections_nf = 0
    ref_mentions_nf = 0
    
    for year_key in paper_data.keys():
        for p in paper_data[year_key]:
            try:
                if "name" in p:
                    pid = str(year_key) + "_" + p["name"].split(".")[0]
                    if "metadata" in p:
                        features[pid] = {}

                        # count sections list. Note: This also counts the subsections separately, but works for now as it is consistent for all.
                        if "sections" in p["metadata"] and p["metadata"]["sections"]:
                            features[pid]["num_sections"] = len(p["metadata"]["sections"])

                            # Other features such as presence of tables. equations, figures, and appendix.
                            features[pid].update(paper_text_features(p["metadata"]["sections"]))
                            #features[pid]["contains_appendix"] = paper_text_features(p["metadata"]["sections"])

                        else:
                            features[pid]["num_sections"] = 4
                            sections_nf += 1


                        # count avg reference mention length
                        if "referenceMentions" in p["metadata"] and p["metadata"]["referenceMentions"]:
                            ref_lengths = 0
                            ref_counts = 0
                            for ref_info in p["metadata"]["referenceMentions"]:
                                if "startOffset" in ref_info and "endOffset" in ref_info:
                                    ref_lengths += (ref_info["endOffset"] - ref_info["startOffset"] + 1)
                                    ref_counts += 1

                            features[pid]["avg_ref_mention"] = ref_lengths//ref_counts
                        else:
                            features[pid]["avg_ref_mention"] = 0
                            ref_mentions_nf += 1
                    else:
                        print("Metadata not present for id: ", pid)
                        metadata_nf += 1
                else:
                    print("Paper id not present for: ", end="")
                    pprint(p)
            except Exception as ex:
                print(ex)
                print("Exception occurred for: " + p["name"], end="")
                print("============================================================================================")
        print("NOT FOUNDS: ", metadata_nf, sections_nf, ref_mentions_nf)
    return features

In [91]:
features_dict = construct_features(paper_data)

NOT FOUNDS:  0 1 3


In [92]:
list(features_dict.items())[0:5]

[('2017_S1_pAu9xl',
  {'avg_ref_mention': 86,
   'contains_appendix': 0,
   'eqn_ref_count': 7,
   'fig_ref_count': 8,
   'num_sections': 19,
   'table_ref_count': 5}),
 ('2017_SyEiHNKxx',
  {'avg_ref_mention': 30,
   'contains_appendix': 0,
   'eqn_ref_count': 8,
   'fig_ref_count': 8,
   'num_sections': 14,
   'table_ref_count': 1}),
 ('2017_r1LXit5ee',
  {'avg_ref_mention': 35,
   'contains_appendix': 1,
   'eqn_ref_count': 2,
   'fig_ref_count': 3,
   'num_sections': 17,
   'table_ref_count': 5}),
 ('2017_Hk3mPK5gg',
  {'avg_ref_mention': 308,
   'contains_appendix': 0,
   'eqn_ref_count': 0,
   'fig_ref_count': 7,
   'num_sections': 12,
   'table_ref_count': 0}),
 ('2017_S1AG8zYeg',
  {'avg_ref_mention': 746,
   'contains_appendix': 0,
   'eqn_ref_count': 8,
   'fig_ref_count': 9,
   'num_sections': 22,
   'table_ref_count': 13})]

In [None]:
SkYbF1slg

In [93]:
for p in paper_data[2017]:
    if p["name"] == "SkYbF1slg.pdf":
        pprint(p["metadata"]["sections"])
        break

[{'heading': None,
  'text': 'A framework is presented for unsupervised learning of '
          'representations based on infomax principle for large-scale neural '
          'populations. We use an asymptotic approximation to the Shannon’s '
          'mutual information for a large neural population to demonstrate '
          'that a good initial approximation to the global '
          'information-theoretic optimum can be obtained by a hierarchical '
          'infomax method. Starting from the initial solution, an efficient '
          'algorithm based on gradient descent of the final objective function '
          'is proposed to learn representations from the input datasets, and '
          'the method works for complete, overcomplete, and undercomplete '
          'bases. As confirmed by numerical experiments, our method is robust '
          'and highly efficient for extracting salient features from input '
          'datasets. Compared with the main existing methods, our algor

#### Save 2017 new features

In [80]:
with open("./2017_features.pkl", "wb") as f:
    pickle.dump(features_dict, f)

#### All years

In [102]:
features_dict = construct_features(paper_data)

NOT FOUNDS:  0 1 3
NOT FOUNDS:  0 6 9
NOT FOUNDS:  0 9 17
NOT FOUNDS:  0 16 35


In [107]:
with open("./17_20_new_features.pkl", "wb") as f:
    pickle.dump(features_dict, f)

In [106]:
list(features_dict.items())[-5:]

[('2018_rJGY8GbR-', {'avg_ref_mention': 395, 'num_sections': 4}),
 ('2019_SyehMhC9Y7',
  {'avg_ref_mention': 50,
   'contains_appendix': 0,
   'eqn_ref_count': 2,
   'fig_ref_count': 10,
   'num_sections': 11,
   'table_ref_count': 5}),
 ('2017_SygvTcYee',
  {'avg_ref_mention': 71,
   'contains_appendix': 0,
   'eqn_ref_count': 0,
   'fig_ref_count': 7,
   'num_sections': 8,
   'table_ref_count': 0}),
 ('2019_SkguE30ct7',
  {'avg_ref_mention': 38,
   'contains_appendix': 1,
   'eqn_ref_count': 0,
   'fig_ref_count': 10,
   'num_sections': 20,
   'table_ref_count': 6}),
 ('2020_H1ggKyrYwB',
  {'avg_ref_mention': 51,
   'contains_appendix': 1,
   'eqn_ref_count': 0,
   'fig_ref_count': 14,
   'num_sections': 12,
   'table_ref_count': 9})]

In [5]:
new_features_from_text = None
with open('./feature_dict/2017_features.pkl', "rb") as f:
    new_features_from_text = pickle.load(f)

In [7]:
new_features_from_text["rJeKjwvclx"]

{'avg_ref_mention': 27,
 'contains_appendix': 1,
 'eqn_ref_count': 6,
 'fig_ref_count': 9,
 'num_sections': 26,
 'table_ref_count': 2}

In [8]:
len(new_features_from_text)

490

In [9]:
new_features_from_text["r1Usiwcex"]

{'avg_ref_mention': 56, 'num_sections': 4}