In [1]:
import numpy as np
import pandas as pd 
import json
from collections import defaultdict
import copy
import pickle 

# Load data

In [2]:
# Parse the corpus JSON into `data`. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open("data/obj_all_abstracts_year.json") as f:
    data = json.load(f)

In [3]:
# Parse the list of processed abstracts into `abstracts_list`. The context
# manager closes the file handle once the JSON has been read.
with open("data/processed_abstracts.json") as f2:
    abstracts_list = json.load(f2)

# Streamline data 

In [4]:
# IDs listed in processed$docs.removed and out$docs.removed during
# preprocessing in R. They are 1-based positions into `abstracts_list`.
docs_removed = [4920,10229,16848,17604,24092,68898,104301,108465,109275,118546,118907,128414,132047,135300,
                157901,159166,159813,170490,184016,191411,195803,203451,204141,214446,221736,223792,241881,
                246810,88613,126377]

# Keep every abstract whose 1-based position is not among the removed ids.
updated_abstracts_list = [
    abstract
    for position, abstract in enumerate(abstracts_list, start=1)
    if position not in docs_removed
]

In [6]:
# Two-way lookup between 0-based vocabulary positions and the words themselves.
# Both are defaultdicts without a factory, so a missing key still raises KeyError.
id_to_word = defaultdict(None, enumerate(data['vocab']))
word_to_id = defaultdict(None, ((word, idx) for idx, word in enumerate(data['vocab'])))

In [7]:
# Build one info record per document by joining three positionally aligned
# sources: data['documents'] (in iteration order), data['meta'] and
# updated_abstracts_list.

# The join below is purely positional, so fail loudly if the sources have
# drifted apart instead of silently pairing the wrong rows.
assert len(data['documents']) == len(data['meta']) == len(updated_abstracts_list), (
    "documents, meta and updated_abstracts_list must have the same length"
)

doc_info_dict_list = []

# enumerate() supplies the 0-based position; the string keys of
# data['documents'] are not needed for the join.
for doc_id, doc_list in enumerate(data['documents'].values()):
    # get word count distribution: doc_list is [vocab ids, counts]
    temp_word_count_dict = defaultdict()
    for id_, count_ in zip(doc_list[0], doc_list[1]):
        temp_word_count_dict[id_to_word[id_ - 1]] = count_  # -1 to compensate for R being 1-indexed

    temp_info_dict = defaultdict()
    # The word-count dict is rebuilt from scratch on every iteration, so it can
    # be stored directly; no defensive deep copy is required.
    temp_info_dict['abstract_word_distr'] = temp_word_count_dict

    # get updated abstract
    temp_info_dict['updated_abstract'] = updated_abstracts_list[doc_id]

    # get other doc info
    doc_meta = data['meta'][doc_id]
    temp_info_dict['full_abstract'] = doc_meta['abstract']
    temp_info_dict['doi'] = doc_meta['doi']
    temp_info_dict['year'] = doc_meta['year']

    doc_info_dict_list.append(temp_info_dict)


# Collect data in a class object

In [9]:
class Paper(object):
    """Plain record holding the information kept for a single paper.

    All constructor arguments are stored unchanged as attributes of the
    same name:

    doi                 -- identifier of the paper
    authors             -- author information (may be None)
    year                -- publication year
    full_abstract       -- abstract text as originally written
    processed_abstract  -- abstract text after preprocessing
    abstract_word_distr -- mapping of word -> count for the abstract
    """

    def __init__(self, doi, authors, year, full_abstract,
                 processed_abstract, abstract_word_distr):
        # Bibliographic fields.
        self.doi, self.authors, self.year = doi, authors, year

        # The abstract in its three representations.
        self.full_abstract = full_abstract
        self.processed_abstract = processed_abstract
        self.abstract_word_distr = abstract_word_distr

In [10]:
class APSCorpus(object):
    """Corpus container: a vocabulary plus a list of Paper objects.

    Attributes
    ----------
    vocab            : the vocabulary sequence passed to the constructor
    id_to_word       : 0-based vocabulary position -> word
    word_to_id       : word -> 0-based vocabulary position
    papers           : list of Paper objects, in insertion order
    doi_to_paperidx  : doi -> index of the corresponding entry in `papers`
    """

    def __init__(self, vocab, papers_info_dict_list, process = True):
        """Store the vocabulary and, unless `process` is falsy, build the
        lookup tables and the paper list from `papers_info_dict_list`."""
        self.vocab = vocab
        self.id_to_word = defaultdict()
        self.word_to_id = defaultdict()
        self.papers = []
        self.doi_to_paperidx = defaultdict()

        if not process:
            return

        self.get_id_word_mapping()
        self.populate_papers(papers_info_dict_list)

    def get_id_word_mapping(self):
        """Fill `id_to_word` and `word_to_id` from `self.vocab`."""
        indexed_vocab = list(enumerate(self.vocab))
        self.id_to_word.update(indexed_vocab)
        # For a repeated word the last position wins, as with sequential assignment.
        self.word_to_id.update((token, position) for position, token in indexed_vocab)

    def populate_papers(self, papers_info_dict_list):
        """Append one Paper per info record and index it by its doi.

        Each record is read through the keys 'doi', 'year', 'full_abstract',
        'updated_abstract' and 'abstract_word_distr'. Authors are not
        available in the records, so every Paper gets authors=None.
        """
        for info in papers_info_dict_list:
            doi = info['doi']
            # The new paper lands at the current end of the list.
            self.doi_to_paperidx[doi] = len(self.papers)
            paper = Paper(doi=doi,
                          authors=None,
                          year=info['year'],
                          full_abstract=info['full_abstract'],
                          processed_abstract=info['updated_abstract'],
                          abstract_word_distr=info['abstract_word_distr'])
            self.papers.append(paper)

In [11]:
# Assemble the corpus object from the vocabulary and the per-document records.
corpusobj = APSCorpus(
    vocab=data['vocab'],
    papers_info_dict_list=doc_info_dict_list,
)

# Save data to disk

In [12]:
# Save data to file.
# The context manager flushes and closes the handle on exit, so the pickle is
# completely written to disk before any later cell re-opens the file.
with open('data/apscorpus.obj', 'wb') as file_aps:
    pickle.dump(corpusobj, file_aps)

In [13]:
# Load data back from disk; the context manager closes the handle afterwards.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files from a trusted source. The classes of the pickled objects must be
# defined/importable in this session for unpickling to succeed.
with open('data/apscorpus.obj', 'rb') as filehandler:
    data_obj = pickle.load(filehandler)

In [14]:
# Example use case
# Pick one paper by its position in `data_obj.papers` and display its
# processed abstract. `id_` is reused by the cells that follow.
id_ = 249000
data_obj.papers[id_].processed_abstract

'studi effect -site coulomb repuls process reson tunnel find tunnel peak result crossov high-temperatur kondo phase low-temperatur mixed-val phase system chemic potenti vari across -site localized-st energi consequ line shape non-lorentzian rather unusu temperatur depend moreov magnet field split tunnel peak line shape modifi effect coupl local state also discuss'

In [15]:
# Original abstract text of the same paper.
data_obj.papers[id_].full_abstract

'We study the effect of on-site Coulomb repulsion on the process of resonant tunneling. We find that the tunneling peak results from a crossover from the high-temperature Kondo phase to the low-temperature mixed-valence phase of the system when the chemical potential is varied across the on-site localized-state energy. Consequently, the line shape is non-Lorentzian, with rather unusual temperature dependence. Moreover, a magnetic field does not split the tunneling peak, but the line shape is modified. The effect of coupling between localized states is also discussed.'

In [16]:
# DOI of the same paper.
data_obj.papers[id_].doi

'10.1103/PhysRevLett.61.1768'

In [17]:
# Publication year of the same paper.
data_obj.papers[id_].year

1988

In [18]:
# Reverse lookup: a DOI maps back to the paper's position in `data_obj.papers`.
data_obj.doi_to_paperidx['10.1103/PhysRevLett.61.1768']

249000

# Draft

In [3]:
# Top-level keys of the loaded JSON object.
data.keys()

dict_keys(['documents', 'vocab', 'meta', 'words.removed', 'docs.removed', 'tokens.removed', 'wordcounts'])

In [10]:
# Number of metadata records.
len(data['meta'])

249281

In [7]:
# Fields available in a single metadata record.
data['meta'][0].keys()

dict_keys(['doi', 'abstract', 'year'])

In [35]:
# DOI of the second metadata record (position 1).
data['meta'][1]['doi']

'10.1103/PhysRevE.65.017102'

In [9]:
# Vocabulary size.
len(data['vocab'])

90538

In [59]:
# Spot-check a single vocabulary entry by its 0-based position.
data['vocab'][84083]

'wavelength-scan'

In [19]:
# Number of documents; expected to match the number of metadata records.
len(data['documents'])

249281

In [70]:
# Inspect one raw document entry, looked up by its string key: a pair of
# parallel lists [vocabulary ids, counts].
data['documents']['1']

[[7749,
  10810,
  12518,
  14884,
  16453,
  16667,
  18022,
  19950,
  20847,
  21031,
  21251,
  25531,
  29009,
  37051,
  46334,
  47054,
  48262,
  55083,
  55145,
  55418,
  57931,
  64248,
  65685,
  66247,
  71145,
  73168,
  80421,
  80492,
  80620,
  83529],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1]]

In [60]:
# Lookup tables between 0-based vocabulary positions and words (both directions).
id_to_word = defaultdict()
word_to_id = defaultdict()
id_to_word.update(enumerate(data['vocab']))
word_to_id.update((word, idx) for idx, word in enumerate(data['vocab']))

In [31]:
# Number of entries in the id -> word table; should equal the vocabulary size.
len(id_to_word.keys())

90538

In [32]:
# Smallest and largest id in the table (ids are 0-based here).
min(id_to_word.keys()),max(id_to_word.keys())

(0, 90537)

In [33]:
# documentwise: convert <id to count> mapping to <word to count> mapping
# Result: 0-based document position -> {word: count}.
doc_word_count_dict = defaultdict()

# enumerate() supplies the 0-based position; the string keys of
# data['documents'] are not needed.
for doc_id, doc_list in enumerate(data['documents'].values()):
    doc_dict = defaultdict()
    for id_, count_ in zip(doc_list[0], doc_list[1]):
        doc_dict[id_to_word[id_ - 1]] = count_  # -1 to compensate for R being 1-indexed
    # doc_dict is rebuilt on every iteration, so it can be stored without a deep copy.
    doc_word_count_dict[doc_id] = doc_dict
    

In [34]:
# Word -> count mapping of the document at position 1.
doc_word_count_dict[1]

defaultdict(None,
            {'addit': 1,
             'also': 1,
             'analyt': 1,
             'appli': 1,
             'argument': 1,
             'asymptot': 1,
             'attach': 1,
             'compon': 1,
             'connect': 3,
             'deviat': 1,
             'distribut': 2,
             'express': 1,
             'extra': 1,
             'group': 1,
             'grown': 1,
             'howev': 1,
             'kind': 1,
             'linear': 3,
             'link': 2,
             'logarithm': 1,
             'mean': 1,
             'necessari': 1,
             'necessarili': 1,
             'network': 1,
             'node': 3,
             'obtain': 1,
             'point': 1,
             'possess': 1,
             'preferenti': 2,
             'prove': 1,
             'rate': 2,
             'recogn': 1,
             'scale-fre': 1,
             'singl': 1,
             'term': 1,
             'time': 1,
             'time-vari': 1,
             

In [37]:
# Processed abstract at position 1, for comparison with the word counts above.
abstracts_list[1]

'recogn time network grown addit node linear preferenti attach will possess scale-fre distribut connect prove analyt argument linear necessari compon obtain kind distribut howev preferenti link rate necessarili appli singl node group node connect also point time-vari mean connect link rate will deviat linear express extra asymptot logarithm term'

In [38]:
# Number of processed abstracts before dropping the removed documents.
len(abstracts_list)

249311

In [45]:
# These are the ids in processed$docs.removed and out$docs.removed during preprocessing in R.
# They are compared against idx+1 when filtering, i.e. they are 1-based positions.
docs_removed = [4920,10229,16848,17604,24092,68898,104301,108465,109275,118546,118907,128414,132047,135300,
                157901,159166,159813,170490,184016,191411,195803,203451,204141,214446,221736,223792,241881,
                246810,88613,126377] 

In [48]:
# Processed abstract of removed id 24092 (0-based position 24091).
abstracts_list[24091]

'be'

In [43]:
# Length of the processed abstract of removed id 4920 (0-based position 4919).
len(abstracts_list[4919])

0

In [51]:
# Drop the abstracts whose 1-based position is listed in docs_removed.
updated_abstracts_list = [
    abstract
    for position, abstract in enumerate(abstracts_list, start=1)
    if position not in docs_removed
]

In [52]:
# Number of abstracts left after filtering; should match len(data['documents']).
len(updated_abstracts_list)

249281