In [1]:
from pathlib import Path
import pickle
from tqdm import tqdm
import os

In [2]:
processed_ids_file='data/student_graph/processed_ids.pickle'
papers_data_file='data/student_graph/papers_data.pickle'
s2_map_file='data/student_graph/s2_map.pickle'
paper_index = pickle.load(open(papers_data_file, "rb")) if os.path.exists(papers_data_file) else {}
processed = pickle.load(open(processed_ids_file, "rb")) if os.path.exists(processed_ids_file) else []
s2_map = pickle.load(open(s2_map_file, "rb")) if os.path.exists(s2_map_file) else {}

In [3]:
paper_records = {}
author_records = {}
institution_records = set()
field_records = set()
paper_ids = set()
author_ids = set()
ref_ids = set()
arxiv_id_to_s2_id_map = {}
it = 0
it2 = 0

# iterate over the papers in the index
for arxiv_paper_id, paper in tqdm(paper_index.items(), desc="Processing papers", unit="paper"):
    # if it == 1:
    #     pass
    if "code" in paper.keys() or "error" in paper.keys():
        continue
    record = {"arxiv_id": arxiv_paper_id, 
                "s2_paperId": paper['s2_paperId'],
                "title": paper['title'],
                "abstract": paper['abstract'],
                "venue": paper['venue'],
                "year": paper['year'],
                "referenceCount": paper['referenceCount'],
                "influentialCitationCount": paper['influentialCitationCount'],
                "citationCount": paper['citationCount'],
                "publicationType": " | ".join(paper['publicationTypes']) if paper['publicationTypes'] else None,
                'reference_ids': [i['paperId'] for i in paper['references']],
                'author_ids': [i['authorId'] for i in paper['authors']],
                'field_ids': paper['fieldsOfStudy'],
            }
    paper_records[paper['s2_paperId']] = record
    
    paper_ids.add(paper['s2_paperId'])

    field_records.update(paper['fieldsOfStudy'])

    arxiv_id_to_s2_id_map[arxiv_paper_id] = paper['s2_paperId']

    # iterate over the authors
    for author in paper['authors']:
        author_id = author['authorId']
        if author_id not in author_ids:
            author_record = {
                "authorId": author_id,
                "name": author['name'],
                "paperCount": author['paperCount'],
                "citationCount": author['citationCount'],
                "hIndex": author['hIndex'],
                "affiliations": author['affiliations'],
            }
            author_records[author_id] = author_record
            author_ids.add(author_id)

            institution_records.update(author['affiliations'])
    it2 = 0
    for ref in paper['references']:
        # if it2 == 25:
        #     pass
        # if ref['paperId'] is None:
        #     continue
        # if ref['paperId'] not in paper_records.keys():
        #     record = {
        #         "arxiv_id": None,
        #         "s2_paperId": ref['paperId'],
        #         "title": ref['title'],
        #         "abstract": ref['abstract'],
        #         "venue": ref['venue'],
        #         "year": ref['year'],
        #         "referenceCount": ref['referenceCount'],
        #         "influentialCitationCount": ref['influentialCitationCount'],
        #         "citationCount": ref['citationCount'],
        #         "publicationType": "Not available",
        #     }
            # paper_records[ref['paperId']] = record
            # paper_ids.add(ref['paperId'])
        paper_records[paper['s2_paperId']]['reference_ids'].append(ref['paperId'])
        if ref['venue'] in ['Neural Information Processing Systems', 'International Conference on Learning Representations', 'International Conference on Machine Learning', 'Conference on Computer Vision and Pattern Recognition', 'European Conference on Computer Vision', 'International Conference on Computer Vision', 'Association for the Advancement of Artificial Intelligence', 'International Joint Conference on Artificial Intelligence', 'International Conference on Data Mining', 'International Conference on Knowledge Discovery and Data Mining', 'International Conference on Web Search and Data Mining', 'International Conference on Machine Learning and Applications', 'International Conference on Pattern Recognition']:
            ref_ids.add(ref['paperId'])
        # if ref['fieldsOfStudy'] is not None:
        #     field_records.update(ref['fieldsOfStudy'])
        for author in ref['authors']:
            author_id = author['authorId']
            if author_id not in author_ids:
                author_ids.add(author_id)
        
        it2 += 1

    it += 1
        
    


    

Processing papers: 100%|██████████| 6545/6545 [00:00<00:00, 7379.86paper/s]


In [6]:
len(paper_records), len(author_records), len(institution_records), len(field_records), len(paper_ids), len(author_ids)

(6528, 7207, 268, 14, 6528, 177860)

In [7]:
pickle.dump(paper_records, open('data/teacher_graph/records/paper_records.pkl', "wb"))
pickle.dump(author_records, open('data/teacher_graph/records/author_records.pkl', "wb"))
pickle.dump(institution_records, open('data/teacher_graph/records/institution_records.pkl', "wb"))
pickle.dump(field_records, open('data/teacher_graph/records/field_records.pkl', "wb"))
pickle.dump(paper_ids, open('data/teacher_graph/paper_ids.pkl', "wb"))
pickle.dump(author_ids, open('data/teacher_graph/author_ids.pkl', "wb"))
pickle.dump(arxiv_id_to_s2_id_map, open('data/student_graph/arxiv_id_to_s2_id_map.pkl', "wb"))

In [5]:
paper_records

{'8e75864bf912b49e78aa593b4ab3f4c50fe357c3': {'arxiv_id': '1805.12573v5',
  's2_paperId': '8e75864bf912b49e78aa593b4ab3f4c50fe357c3',
  'title': 'Learning a Prior over Intent via Meta-Inverse Reinforcement Learning',
  'abstract': 'A significant challenge for the practical application of reinforcement learning in the real world is the need to specify an oracle reward function that correctly defines a task. Inverse reinforcement learning (IRL) seeks to avoid this challenge by instead inferring a reward function from expert behavior. While appealing, it can be impractically expensive to collect datasets of demonstrations that cover the variation common in the real world (e.g. opening any type of door). Thus in practice, IRL must commonly be performed with only a limited set of demonstrations where it can be exceedingly difficult to unambiguously recover a reward function. In this work, we exploit the insight that demonstrations from other tasks can be used to constrain the set of possibl

In [8]:
processed_ids_file='data/teacher_graph/processed_ids.pickle'
papers_data_file='data/teacher_graph/papers_data.pickle'
s2_map_file='data/teacher_graph/s2_map.pickle'
paper_index = pickle.load(open(papers_data_file, "rb")) if os.path.exists(papers_data_file) else {}
processed = pickle.load(open(processed_ids_file, "rb")) if os.path.exists(processed_ids_file) else []
s2_map = pickle.load(open(s2_map_file, "rb")) if os.path.exists(s2_map_file) else {}

In [9]:
student_set_ids = set()
teacher_set_ids = set()
venues = set()
special_ids = set()
for arxiv_id, paper in tqdm(paper_index.items()):
    if 's2_paperId' not in paper.keys():
        continue
    student_set_ids.add(paper['s2_paperId'])
    for j, citation in enumerate(paper['citations']):
        teacher_set_ids.add(citation['paperId'])
        if citation['venue'] in ['Neural Information Processing Systems', 'International Conference on Learning Representations', 'International Conference on Machine Learning', 'Conference on Computer Vision and Pattern Recognition', 'European Conference on Computer Vision', 'International Conference on Computer Vision', 'Association for the Advancement of Artificial Intelligence', 'International Joint Conference on Artificial Intelligence', 'International Conference on Data Mining', 'International Conference on Knowledge Discovery and Data Mining', 'International Conference on Web Search and Data Mining', 'International Conference on Machine Learning and Applications', 'International Conference on Pattern Recognition']:
            special_ids.add(citation['paperId'])
        venues.add(citation['venue'])

100%|██████████| 1764/1764 [00:00<00:00, 10447.15it/s]


In [10]:
len(student_set_ids), len(teacher_set_ids), len(venues)

(1760, 126527, 6722)

In [25]:
student_paper_ids = pickle.load(open('data/teacher_graph/paper_ids.pkl', "rb"))

In [12]:
len(special_ids.difference(student_paper_ids))

14451

In [13]:
# randomly select 5000 ids from the teacher set
import random
new_ids = special_ids.difference(student_paper_ids)
random_ids = random.sample(list(new_ids), 5000)

pickle.dump(random_ids, open('data/teacher_graph/remaining_ids.pickle', "wb"))


In [15]:
import random
remaining_ids = pickle.load(open('data/teacher_graph/remaining_ids.pickle', "rb"))
ref_ids = ref_ids.difference(remaining_ids)
ref_ids = ref_ids.difference(student_paper_ids)
print(len(ref_ids))

# randomly select 3000 ids from the ref_ids
random_ids = random.sample(list(ref_ids), 3000)
# split the random_ids into 2 lists
eval_ids = random_ids[:1000]
test_ids = random_ids[1000:]

pickle.dump(eval_ids, open('data/teacher_graph/eval_ids.pickle', "wb"))
pickle.dump(test_ids, open('data/teacher_graph/test_ids.pickle', "wb"))

15984


In [6]:
len(teacher_set_ids.difference(student_set_ids))

30133

In [9]:
new_ids = pickle.load(open('data/teacher_graph/possible_new_data.pkl', "rb"))

In [23]:
new_s2_ids = []
for i in new_ids:
    for j in new_ids[i]['citations']:
        if j['venue'] in ['ICML', 'NeurIPS', 'ICLR']:
            new_s2_ids.append(j['paperId'])

In [35]:
len(set(new_s2_ids))

20574

In [30]:
len(set(new_s2_ids).difference(student_paper_ids))

16941

In [33]:
extra_ids = set(new_s2_ids).difference(student_paper_ids)

import random
# randomly select 8000 ids from the extra_ids
random_ids = random.sample(list(extra_ids), 8000)
# split the random_ids into 3 lists of 5000, 2,000 and 1,000
eval_ids = random_ids[:1500]
test_ids = random_ids[1500:3000]
extra_train_ids = random_ids[3000:]

In [36]:
pickle.dump(eval_ids, open('data/teacher_graph/eval_ids.pickle', "wb"))
pickle.dump(test_ids, open('data/teacher_graph/test_ids.pickle', "wb"))
pickle.dump(extra_train_ids, open('data/teacher_graph/extra_train_ids.pickle', "wb"))

In [37]:
paper_records = pickle.load(open('data/teacher_graph/records/paper_records.pkl', "rb"))

In [40]:
all_author_ids = set()
for paper in paper_records.values():
    all_author_ids.update(paper['author_ids'])

In [41]:
len(all_author_ids)

35983

In [None]:
author_records = pickle.load(open('data/teacher_graph/records/author_records.pkl', "rb"))
author_ids = author_records.keys()
len(author_ids)

7207

In [6]:
author_records = pickle.load(open('data/teacher_graph/records/author_records.pkl', "rb"))
author_ids = author_records.keys()
len(author_ids)

35980

In [45]:
remaining_author_ids = all_author_ids.difference(author_ids)

# save the remaining_author_ids to a file
pickle.dump(remaining_author_ids, open('data/teacher_graph/remaining_author_ids.pickle', "wb"))