In [1]:
import datetime
import json
from semanticscholar import SemanticScholar
import pandas as pd
from json import JSONEncoder

In [2]:
# Load the raw dataset. Despite its name, `df` is the parsed JSON value (indexed
# positionally by the cells below), not a pandas DataFrame.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/dataset.json', encoding='utf-8') as inputfile:
    df = json.load(inputfile)

### Collecting details about journals, conferences, authors, and fields of study

In [3]:
# Collect the distinct journals, conferences, authors and fields of study that
# appear in the dataset.
#
# Each dict maps a name to the id that accompanied its FIRST occurrence
# (setdefault never overwrites). Dicts preserve insertion order, so the lists
# derived at the bottom are in first-seen order and each name list stays aligned
# index-by-index with its id list.
#
# NOTE(review): de-duplication is by name only, so two different authors (or
# venues) that share a name collapse into one entry carrying the first id seen.
journalIdByName = {}
conferenceIdByName = {}
authorIdByName = {}
fieldsSeen = {}

for record in df:
    details = record['_data']

    # Only venues explicitly typed as 'journal' or 'conference' are recorded.
    venue = details['publicationVenue']
    if venue and 'type' in venue:
        if venue['type'] == 'journal':
            journalIdByName.setdefault(venue['name'], venue['id'])
        elif venue['type'] == 'conference':
            conferenceIdByName.setdefault(venue['name'], venue['id'])

    if details['authors']:
        for author in details['authors']:
            authorIdByName.setdefault(author['name'], author['authorId'])

    if details['fieldsOfStudy']:
        for field in details['fieldsOfStudy']:
            fieldsSeen.setdefault(field, None)

# Parallel lists consumed by the CSV-writing cells below.
conferences = list(conferenceIdByName)
conferenceUrls = list(conferenceIdByName.values())

journals = list(journalIdByName)
journalUrls = list(journalIdByName.values())

authors = list(authorIdByName)
authorUrls = list(authorIdByName.values())

fieldsOfStudy = list(fieldsSeen)

In [4]:
# Conference lookup table; columns are written in the order cId, cUrlId, cTitle.
# cId is a generated key: 'c' followed by the conference's position in the list.
ids = [f'c{position}' for position in range(len(conferences))]

conferencesDF = pd.DataFrame({
    'cId': ids,
    'cUrlId': conferenceUrls,
    'cTitle': conferences,
})
conferencesDF.to_csv('data/conferences.csv', mode='w', header=True, index=False)

In [5]:
# Journal lookup table; columns are written in the order jId, jUrlId, jTitle.
# jId is a generated key: 'j' followed by the journal's position in the list.
ids = [f'j{position}' for position in range(len(journals))]

journalsDF = pd.DataFrame({
    'jId': ids,
    'jUrlId': journalUrls,
    'jTitle': journals,
})
journalsDF.to_csv('data/journals.csv', mode='w', header=True, index=False)

In [6]:
# Author lookup table; columns are written in the order aId, aUrlId, aName.
# aId is a generated key: 'a' followed by the author's position in the list.
ids = [f'a{position}' for position in range(len(authors))]

authorsDF = pd.DataFrame({
    'aId': ids,
    'aUrlId': authorUrls,
    'aName': authors,
})
authorsDF.to_csv('data/authors.csv', mode='w', header=True, index=False)

In [7]:
# Field-of-study lookup table; columns are written in the order fId, fName.
# fId is a generated key: 'f' followed by the field's position in the list.
ids = [f'f{position}' for position in range(len(fieldsOfStudy))]

fieldsOfStudyDF = pd.DataFrame({
    'fId': ids,
    'fName': fieldsOfStudy,
})
fieldsOfStudyDF.to_csv('data/fieldsOfStudy.csv', mode='w', header=True, index=False)

### Collecting papers

In [8]:
# Reload the lookup tables written above.
# By default read_csv converts strings such as 'NA', 'null' or 'NaN' to missing
# values, which would silently drop a venue or author whose name happens to be
# one of those tokens and break the title lookups in the next cell. Restrict NA
# parsing to genuinely empty fields so every stored name round-trips unchanged.
conferencesDF = pd.read_csv('data/conferences.csv', keep_default_na=False, na_values=[''])
journalsDF = pd.read_csv('data/journals.csv', keep_default_na=False, na_values=[''])
authorsDF = pd.read_csv('data/authors.csv', keep_default_na=False, na_values=[''])

In [9]:
# Build one row per paper:
#   [pId, paperUrlId, conferenceJournalUrl, conferenceJournal, pTitle, abstract]
#
# Title -> generated venue id ('j<N>' / 'c<N>') maps are built once, so each
# paper costs a dict lookup instead of a boolean-mask scan over the whole
# DataFrame. drop_duplicates keeps the first row per title, which matches the
# "first matching row" a positional .values[0] lookup would return.
journalRows = journalsDF.drop_duplicates(subset='jTitle')
conferenceRows = conferencesDF.drop_duplicates(subset='cTitle')
venueIdLookups = {
    'journal': dict(zip(journalRows['jTitle'], journalRows['jId'])),
    'conference': dict(zip(conferenceRows['cTitle'], conferenceRows['cId'])),
}

papers = []

paperUrlNum = 0

for record in df:
    details = record['_data']
    venue = details['publicationVenue']

    # A venue that is missing, has no 'type', or is neither a journal nor a
    # conference is recorded with the literal string 'None' in both columns.
    venueId, venueName = 'None', 'None'
    if venue and 'type' in venue and venue['type'] in venueIdLookups:
        venueName = venue['name']
        # Raises KeyError if the title is absent from the lookup table.
        venueId = venueIdLookups[venue['type']][venueName]

    papers.append([
        'p' + str(paperUrlNum),   # pId: generated sequential key
        details['paperId'],       # paperUrlId
        venueId,                  # conferenceJournalUrl
        venueName,                # conferenceJournal
        details['title'],         # pTitle
        details['abstract'],      # abstract
    ])
    paperUrlNum += 1

In [10]:
# Column order must match the order in which each paper row was assembled.
paperColumns = [
    'pId',
    'paperUrlId',
    'conferenceJournalUrl',
    'conferenceJournal',
    'pTitle',
    'abstract',
]

# Materialise the collected rows and persist them, overwriting any earlier export.
papersDF = pd.DataFrame(data=papers, columns=paperColumns)
papersDF.to_csv('data/papers.csv', mode='w', header=True, index=False)