In [10]:
### Semantic scholar data
from semanticscholar import SemanticScholar
s2_api_key = '1WBrQVQGeo6ZZI2eJmWMk2eFnmgl8W1T7VEDvRyQ'
sch = SemanticScholar(api_key=s2_api_key)

ds_papers = sch.search_paper('data science', limit=100)
se_papers = sch.search_paper('software engineering', limit=100)
bi_papers = sch.search_paper('bioinformatics', limit=100)
graph_papers = sch.search_paper('graph theory', limit=100)
db_papers = sch.search_paper('database', limit=100)


ReadTimeout: HTTPSConnectionPool(host='partner.semanticscholar.org', port=443): Read timed out. (read timeout=10)

In [2]:
### Dump dataset
import json

dataset = [ds_papers, se_papers, bi_papers, graph_papers, db_papers]

result = []
for data in dataset:
    
    length = 1
    for res in data:
        if length > 500:
            break
            
        length += 1
        result.append(res)

# with open('dataset.json', 'w+') as f:
#     json.dump(result, f)

In [6]:
from faker import Faker
from faker.providers import address, internet, sbn
from faker_education import SchoolProvider
from random import choice, randint
import pandas as pd
import numpy as np
import uuid

fake = Faker()
fake.add_provider(SchoolProvider)
fake.add_provider(internet)
fake.add_provider(sbn)
fake.add_provider(address)


with open('./uni.json', 'r') as f:
    uni = json.load(f)

NameError: name 'json' is not defined

In [35]:
### Parse the authors
authors = {":ID": [], "name:STRING": [], "email:STRING": [], "department:STRING": [], "institution:STRING":[]}
author_ids = set()

author_writes_papers = {":START_ID" : [], ":END_ID": [], "corresponding_author:BOOLEAN": []}
author_papers = {}

for res in result:
    ### Assume first author is corresponding author
    corresponding_author = True
    for author in res.authors:
        author_writes_papers[":START_ID"].append(author.authorId)
        author_writes_papers[":END_ID"].append(res.paperId)
        author_writes_papers["corresponding_author:BOOLEAN"].append(corresponding_author)
        
        corresponding_author = False
        
        ### Add author papers relation
        p = author_papers.setdefault(author.authorId, set())
        p.add(res.paperId)
        author_papers[author.authorId] = p
        
        if author.authorId in author_ids:
            continue
            
        gender = np.random.choice(["M", "F"], p=[0.5, 0.5])
        first_name = fake.first_name_male() if gender =="M" else fake.first_name_female()
        last_name = fake.last_name()
        
        author_ids.add(author.authorId)
        
        authors[":ID"].append(author.authorId)
        authors["name:STRING"].append(author.name)
        authors["email:STRING"].append(f'{first_name}.{last_name}@{fake.domain_name()}')
        authors["department:STRING"].append(fake.school_type())
        authors["institution:STRING"].append(uni[randint(0, len(uni)-1)]['institution'])

authors_df = pd.DataFrame.from_dict(authors)
authors_df.to_csv('authors_node_semantic.csv')

author_wp_df =  pd.DataFrame.from_dict(author_writes_papers)
author_wp_df.to_csv('author_writes_papers.csv')

In [37]:
### Parse the paper & keywords
papers = {":ID": [], "title:STRING": [], "abstract:STRING": [], "pages:STRING": [], "DOI:STRING":[], "link:STRING":[]}
paper_has_keywords = {":START_ID" : [], ":END_ID": []}
papers_set = set()

keywords = {":ID": [], "name:STRING": [], "domain:STRING": []}
keywords_dict = {}

for res in result:
    papers_set.add(res.paperId)
    papers[":ID"].append(res.paperId)
    papers["title:STRING"].append(res.title)
    papers["abstract:STRING"].append(res.abstract)
    
    pages = f'{randint(15,100)}-{randint(101,150)}'
    if res.journal is not None:
        pages = res.journal.pages if res.journal.pages is not None else pages
    papers["pages:STRING"].append(pages)
    
    papers["DOI:STRING"].append(res.externalIds.get('DOI', fake.sbn9()))
    papers["link:STRING"].append(fake.uri())
    
    if res.fieldsOfStudy is None or len(res.fieldsOfStudy) == 0:
        paper_has_keywords[":START_ID"].append(res.paperId)
        paper_has_keywords[":END_ID"].append(choice(list(keywords_dict.values())))
    else:
        for fs in res.fieldsOfStudy:
            fs_id = str(uuid.uuid4())
            if fs not in keywords_dict:
                keywords[":ID"].append(fs_id)
                keywords["name:STRING"].append(fs)
                keywords["domain:STRING"].append(res.fieldsOfStudy[0])

            k_id = keywords_dict.setdefault(fs, fs_id)

            paper_has_keywords[":START_ID"].append(res.paperId)
            paper_has_keywords[":END_ID"].append(k_id)
    
papers_df = pd.DataFrame.from_dict(papers)
papers_df.to_csv('papers_semantic.csv')

keywords_df = pd.DataFrame.from_dict(keywords)
keywords_df.to_csv('keywords_semantic.csv')

paper_has_kw = pd.DataFrame.from_dict(paper_has_keywords)
paper_has_kw.to_csv('paper_has_keywords.csv')

In [40]:
### Create reviewers relationships\
author_review_papers = {":START_ID" : [], ":END_ID": []}

for paper in list(papers_set):
    # Pick 3 reviewers per paper
    for i in range(3):
        reviewed = False
        while not reviewed:
            
            # Make sure author not reviewing the same paper
            author = choice(list(author_papers.keys()))
            if paper in author_papers[author]:
                continue
            
            author_review_papers[":START_ID"].append(author)
            author_review_papers[":END_ID"].append(paper)
            reviewed = True

author_rp = pd.DataFrame.from_dict(author_review_papers)
author_rp.to_csv('author_review_papers.csv')

In [42]:
### Create cite relationship
### Assumption is paper is cited by [0-50] papers
paper_cites_paper = {":START_ID" : [], ":END_ID": []}

for cited_paper in list(papers_set):
    
    for i in range(randint(0,50)):
        cited = False
        while not cited:
            
            ### Cannot cite own paper
            paper = choice(list(papers_set))
            if cited_paper == paper:
                continue
            
            paper_cites_paper[':START_ID'].append(paper)
            paper_cites_paper[':END_ID'].append(cited_paper)
            cited = True

pcp = pd.DataFrame.from_dict(paper_cites_paper)
pcp.to_csv('paper_cites_paper.csv')

In [57]:
### Create journal node and relationship
journals = {":ID": [], "name:STRING": []}
paper_published_in_journal = {":START_ID" : [], ":END_ID": [], "volume:STRING": [], "year:INT": []}

default_journal = {'id': str(uuid.uuid4()), 'name': 'Unknown', 'year': 2000, 'volume': 1}
journal_dict = {}

def get_journal_data(res):
    journal_name = default_journal['name']
    journal_id = default_journal['id']
    volume = default_journal['volume']
    year = default_journal['year']
    
    ### Check if journal name exists in publication venue
    if (res.publicationVenue.name and res.publicationVenue.name != ''):
        journal_name = res.publicationVenue.name
        journal_id =  res.publicationVenue.id

    elif res.journal and res.journal.name and res.journal.name != '':
        journal_name = res.journal.name
        journal_id = str(uuid.uuid4())
    
    ### No data about the journal
    else:
        return journal_name, journal_id, volume, year
    
    ### If volume and year not found but there is journal name
    if (res.publicationVenue and res.publicationVenue.type == 'journal' and res.year and res.year != ''):
        year = res.year
    if res.journal and res.journal.volume and res.journal.volume != '':
        volume = res.journal.volume

    return journal_name, journal_id, volume, year

for res in result:
    if not (res.publicationVenue and res.publicationVenue.type == 'journal'):
        continue
        
    journal_name, journal_id, volume, year = get_journal_data(res)
    
    paper_published_in_journal[':START_ID'].append(res.paperId)
    paper_published_in_journal[':END_ID'].append(journal_id)
    paper_published_in_journal['volume:STRING'].append(volume)
    paper_published_in_journal['year:INT'].append(int(year))
    
    
    if journal_name in journal_dict:
        continue
    
    journal_dict.setdefault(journal_name, journal_id)
    journals[':ID'].append(journal_id)
    journals['name:STRING'].append(journal_name)

jdf = pd.DataFrame.from_dict(journals)
jdf.to_csv('journal_semantic.csv')

ppij = pd.DataFrame.from_dict(paper_published_in_journal)
ppij.to_csv('paper_published_in_journal.csv')

In [62]:
### Create conference proceedings node and relationship
conference = {":ID": [], "name:STRING": [], 'year:INT': []}
proceedings = {":ID": [], "name:STRING": [], 'city:STRING': []}

paper_presented_in_conference = {":START_ID" : [], ":END_ID": []}
conference_part_of_proceedings = {":START_ID" : [], ":END_ID": []}

default_conference_proceeding = {'name': 'Unknown', 'year': 2000, 'city': 'Boston', 'proc_id': str(uuid.uuid4())}
conference_dict = {}

def get_conference_data(res):
    conference_name = default_conference_proceeding['name']
    conference_id = str(uuid.uuid4())
    conference_year = default_conference_proceeding['year']
    
    if (res.publicationVenue.name and res.publicationVenue.name != ''):
        conference_name = res.publicationVenue.name
    
    if (res.publicationVenue.id and res.publicationVenue.id != ''):
        conference_id = res.publicationVenue.id
    
    if (res.year and res.year != ''):
        conference_year = res.year
    
    return conference_name, conference_id, conference_year

def get_proceeding_data(res):
    proc_name = default_conference_proceeding['name']
    proc_id = str(uuid.uuid4())
    
    if res.journal and res.journal.name and res.journal.name != '':
        proc_name = res.journal.name
    
    return proc_name, proc_id, fake.city()

for res in result:
    if not (res.publicationVenue and res.publicationVenue.type == 'conference'):
        continue
    
    conference_name, conference_id, conference_year = get_conference_data(res)
    if conference_id not in conference_dict:
        data = conference_dict.setdefault(conference_id, {})
        data['proceeding'] = {}
        proc_name, proc_id, city = get_proceeding_data(res)
        
        data['proceeding']['name'] = proc_name
        data['proceeding']['id'] = proc_id
        data['proceeding']['city'] = city
        
        data['conference'] = {}
        data['conference']['name'] = conference_name
        data['conference']['id'] = conference_id
        data['conference']['year'] = conference_year
        
        conference_dict[conference_id] = data
    
        conference[':ID'].append(conference_id)
        conference['name:STRING'].append(conference_name)
        conference['year:INT'].append(int(conference_year))
        
        proceedings[':ID'].append(proc_id)
        proceedings['name:STRING'].append(proc_name)
        proceedings['city:STRING'].append(city)
        
        conference_part_of_proceedings[':START_ID'].append(conference_id)
        conference_part_of_proceedings[':END_ID'].append(proc_id)
    
    
    paper_presented_in_conference[':START_ID'].append(res.paperId)
    paper_presented_in_conference[':END_ID'].append(conference_id)
    

cf = pd.DataFrame.from_dict(conference)
cf.to_csv('conference_semantic.csv')

pcdgs = pd.DataFrame.from_dict(proceedings)
pcdgs.to_csv('proceedings_semantic.csv')

ppic = pd.DataFrame.from_dict(paper_presented_in_conference)
ppic.to_csv('paper_presented_in_conference.csv')

cpop = pd.DataFrame.from_dict(conference_part_of_proceedings)
cpop.to_csv('conference_part_of_proceedings.csv')

In [69]:
### Transform coference with editions
cdf = pd.read_csv('conference_semantic.csv', index_col=False)
ppij = pd.read_csv('paper_published_in_journal.csv', index_col=False)
ppic = pd.read_csv('paper_presented_in_conference.csv', index_col=False)
papers = pd.read_csv('papers_semantic.csv', index_col=False)
awp = pd.read_csv('author_writes_papers.csv')
 
published_papers = set(ppij[":START_ID"].append(ppij[":START_ID"], ignore_index=True))
paper_set = set(papers[":ID"])

unpublished_papers = paper_set.difference(published_papers)

  published_papers = set(ppij[":START_ID"].append(ppij[":START_ID"], ignore_index=True))


In [70]:
### Add conference Data
conference_ids = ["7654260e-79f9-45c5-9663-d72027cf88f3",
"376732f4-ec63-4b76-bfc8-bbf77119d852",
"25eaf793-6674-4a6d-864f-6c8ae5428912",
"1123f25d-add0-4c9c-8f43-c877aab90a0b",
"b1ee6f13-7776-44aa-a2d5-b79deda2aecb",
"b83b14d5-4e97-4f22-85e2-0b30dfa042f4",
"c40f4908-60d1-42b4-8890-380119178833",
"5afb995f-87ba-455e-bd26-86ae67a10447",
"c2ff5df6-f2f4-4573-a884-8c53979d4c78",
"5042fe05-b1f6-41b6-8092-53294b52cbd6",
"bb718fdd-6d66-4f93-851b-08eeeefb28f5",
"0efa120a-36c7-45fa-b534-597651ae69d2",
"019d3f59-a115-42e3-bd7b-474dd4246499",
"bedd754b-5faf-4eff-8074-3c90be8ac9b0",
"0256ebd3-4f16-4fd0-91bc-b0e77fcd3c0d",
"f3dd946e-cb75-4502-b550-9dec04bda7f9",
"3ff00d27-28c7-4770-a1c7-855a072843fd",
"4c562775-121a-4c25-9f7a-823f54d0e93e",
"10ff739d-ef5f-48b7-9454-9cd1c6d2434d",
"3837ff2b-82e5-4165-8900-b069c31ef3d7",
"b55b50b1-aae7-47a7-b042-8aecc930073d",
"c85dfc25-bcef-4719-9997-f41ad334d998",
"d7907408-25bc-4816-a81d-4e0f2f6482c8"
]

journals = ["c6840156-ee10-4d78-8832-7f8909811576",
"d60da343-ab92-4310-b3d7-2c0860287a9d",
"27475f31-a1d2-401b-84ad-9b405c7609a8",
"961301b0-6f5a-44a6-9216-54b673cded78",
"bc30f894-9a9c-440e-8420-7bd3c5624384"]

unpublished_papers = list(unpublished_papers)
for_journals = unpublished_papers[len(unpublished_papers)//2+1:]
for_conferences = unpublished_papers[:len(unpublished_papers)//2+1]


paper_presented_in_conference = {":START_ID" : [], ":END_ID": []}
paper_published_in_journal = {":START_ID" : [], ":END_ID": [], "volume:STRING": [], "year:INT": []}

### Assign paper to conference
index = 0
for paper in for_conferences:
    paper_presented_in_conference[":START_ID"].append(paper)
    paper_presented_in_conference[":END_ID"].append(conference_ids[index%len(conference_ids)])
    index += 1

    
### Assign paper to journal
avail_paper = len(for_journals)
while avail_paper > 0:
    for journal in journals:
        if avail_paper <= 0:
            break

        year = 2017
        volume = 1
        
        for i in range(5):
            if avail_paper <= 0:
                break
                
            paper_published_in_journal[":START_ID"].append(for_journals[avail_paper-1])
            paper_published_in_journal[":END_ID"].append(journal)
            paper_published_in_journal["volume:STRING"].append(volume)
            paper_published_in_journal["year:INT"].append(year)
            
            year += 1
            volume += 1
            avail_paper -= 1


### Assign authors to review paper
author_writes_papers = {":START_ID" : [], ":END_ID": [], "corresponding_author:BOOLEAN": []}
authors = ["144110054",
"147069795",
"4376295",
"2395456",
"8504175",
"145073266"
# "21366050",
# "1404984281",
# "40281699",
# "2118624194",
# "21512561",
# "1753164"

]

avail_paper = len(for_conferences)
while avail_paper > 0:
    for author in authors:
        if avail_paper <= 0:
            break

        
        for i in range(5):
            if avail_paper <= 0:
                break
                
            author_writes_papers[":START_ID"].append(author)
            author_writes_papers[":END_ID"].append(for_conferences[avail_paper-1])
            author_writes_papers["corresponding_author:BOOLEAN"].append(False)
            
            avail_paper -= 1

In [71]:
awp = awp.append(pd.DataFrame.from_dict(author_writes_papers), ignore_index=True)
ppij = ppij.append(pd.DataFrame.from_dict(paper_published_in_journal), ignore_index=True)
ppic = ppic.append(pd.DataFrame.from_dict(paper_presented_in_conference), ignore_index=True)

awp.drop(columns=['Unnamed: 0'])
ppij.drop(columns=['Unnamed: 0'])
ppic.drop(columns=['Unnamed: 0'])

awp.to_csv("author_write_papers_1.csv", index=False)
ppij.to_csv("paper_published_in_journal_1.csv", index=False)
ppic.to_csv("paper_presented_in_conference_1.csv", index=False)

  awp = awp.append(pd.DataFrame.from_dict(author_writes_papers), ignore_index=True)
  ppij = ppij.append(pd.DataFrame.from_dict(paper_published_in_journal), ignore_index=True)
  ppic = ppic.append(pd.DataFrame.from_dict(paper_presented_in_conference), ignore_index=True)
