In [1]:
import json
import pandas as pd
import random
# Seed the stdlib RNG once, up front: later cells draw synthetic values from
# `random`, so results only repeat when the cells are run top to bottom.
random.seed(10)

In [2]:
# Load the raw dataset. `df` holds the parsed JSON (it is indexed by position
# further down), not a pandas DataFrame.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/dataset.json', encoding='utf-8') as inputfile:
    df = json.load(inputfile)

### Collecting details about journals, conferences, authors, fieldsOfStudy

In [3]:
# Parallel lists: position k of a *Urls list holds the id recorded for the
# name at position k of the matching name list.
conferences = []
conferenceUrls = []

journals = []
journalUrls = []

authors = []
authorUrls = []

fieldsOfStudy = []


def _addUnique(name, urlId, names, urlIds, seenNames):
    """Record (name, urlId) in the parallel lists the first time `name` is seen.

    Later occurrences of the same name are ignored, so the id kept for a name
    is the one from its first appearance in the dataset.
    """
    if name not in seenNames:
        seenNames.add(name)
        names.append(name)
        urlIds.append(urlId)


# Sets shadow the name lists so the duplicate check does not rescan the list
# for every record.
seenConferences = set()
seenJournals = set()
seenAuthors = set()

for record in df:
    details = record['_data']

    venue = details['publicationVenue']
    if venue and 'type' in venue:
        # Venues of any other type are deliberately not collected.
        if venue['type'] == 'journal':
            _addUnique(venue['name'], venue['id'], journals, journalUrls, seenJournals)
        elif venue['type'] == 'conference':
            _addUnique(venue['name'], venue['id'], conferences, conferenceUrls, seenConferences)

    if details['authors']:
        for author in details['authors']:
            # De-duplication is by author name, not by authorId.
            _addUnique(author['name'], author['authorId'], authors, authorUrls, seenAuthors)

    if details['fieldsOfStudy']:
        for field in details['fieldsOfStudy']:
            # Plain list check kept here: it makes no hashability assumption
            # about the entries.
            if field not in fieldsOfStudy:
                fieldsOfStudy.append(field)

#### Conferences

In [25]:
# Conference ids and synthetic proceedings counts
# conferenceColumns = ['cId','cTitle','cUrlId','cType','cProceedings']

conferenceCount = len(conferences)

# Surrogate keys c0, c1, ... in the order the conferences were collected
ids = ['c' + str(n) for n in range(conferenceCount)]
# One random proceedings count (1..10 inclusive) per conference
proceedings = [random.randint(1, 10) for _ in range(conferenceCount)]

conferencesDF = pd.DataFrame(ids, columns=['cId'])

In [26]:
# Assigning conference subclasses: one label per conference row, drawn at
# random from cTypes with equal weights.
cTypes = ['workshop','expertGroup','symposium','regularConference']

# The explicit weights are kept so the sequence of random draws is unchanged.
conferenceTypes = random.choices(cTypes, weights=[0.25,0.25,0.25,0.25], k=len(conferencesDF))

In [27]:
# Attach the remaining columns in the order they should appear in the CSV.
conferencesDF = conferencesDF.assign(
    cUrlId=conferenceUrls,
    cTitle=conferences,
    cType=conferenceTypes,
    cProceedings=proceedings,  # synthetic proceedings data for conferences
)

conferencesDF.to_csv(
    'data/conferences.csv',
    index=False,
    header=True,
    mode='w',
)

#### Journals

In [28]:
# journalColumns = ['jId','jTitle','jUrlId','jVolumes']

journalCount = len(journals)

# Surrogate keys j0, j1, ... in collection order
ids = ['j' + str(n) for n in range(journalCount)]
# Synthetic volumes data: one random integer in 0..10 (inclusive) per journal
volumes = [random.randint(0, 10) for _ in range(journalCount)]

journalsDF = pd.DataFrame(ids, columns=['jId']).assign(
    jUrlId=journalUrls,
    jTitle=journals,
    jVolumes=volumes,
)

journalsDF.to_csv(
    'data/journals.csv',
    index=False,
    header=True,
    mode='w',
)

#### Authors

In [8]:
# authorColumns = ['aId','aName','aUrlId']

# Surrogate keys a0, a1, ... in collection order
ids = ['a' + str(n) for n in range(len(authors))]

authorsDF = pd.DataFrame(ids, columns=['aId']).assign(
    aUrlId=authorUrls,
    aName=authors,
)
authorsDF.to_csv(
    'data/authors.csv',
    index=False,
    header=True,
    mode='w',
)

#### FieldsOfStudy

In [9]:
# fieldsOfStudyColumns = ['fId','fName']

# Surrogate keys f0, f1, ... in collection order
ids = ['f' + str(n) for n in range(len(fieldsOfStudy))]

fieldsOfStudyDF = pd.DataFrame(ids, columns=['fId']).assign(fName=fieldsOfStudy)
fieldsOfStudyDF.to_csv(
    'data/fieldsOfStudy.csv',
    index=False,
    header=True,
    mode='w',
)

### Collecting information about papers

In [10]:
# Re-read the three lookup tables from disk, in this fixed order.
conferencesDF, journalsDF, authorsDF = map(
    pd.read_csv,
    ('data/conferences.csv', 'data/journals.csv', 'data/authors.csv'),
)

In [11]:
# One row per dataset record:
# [pId, paperUrlId, conferenceJournalId, conferenceJournalTitle, pTitle, abstract]
papers = []

# Title -> id lookups, built once so each paper does a dict lookup instead of
# filtering a whole DataFrame.
venueIdsByType = {
    'journal': dict(zip(journalsDF['jTitle'], journalsDF['jId'])),
    'conference': dict(zip(conferencesDF['cTitle'], conferencesDF['cId'])),
}


def _venueIdAndTitle(venue):
    """Return (venueId, venueTitle) for one publicationVenue entry.

    Gives ('None', 'None') when the venue is empty, has no 'type' key, or its
    type is neither 'journal' nor 'conference'.

    Raises:
        KeyError: if the venue's name is not present in the matching lookup
            table (or the venue has no 'name' key).
    """
    if venue and 'type' in venue:
        idByTitle = venueIdsByType.get(venue['type'])
        if idByTitle is not None:
            title = venue['name']
            return idByTitle[title], title
    # The string 'None' (not the None object) is the "no venue" sentinel.
    return 'None', 'None'


for paperUrlNum, record in enumerate(df):
    details = record['_data']
    venueId, venueTitle = _venueIdAndTitle(details['publicationVenue'])
    papers.append([
        'p' + str(paperUrlNum),
        details['paperId'],
        venueId,
        venueTitle,
        details['title'],
        details['abstract'],
    ])

# Leave the counter equal to the number of papers processed, as it was after
# the original manual-increment loop.
paperUrlNum = len(papers)

In [12]:
# Column names, in the same order as the values appended to each row of `papers`
paperColumns = [
    'pId',
    'paperUrlId',
    'conferenceJournalId',
    'conferenceJournalTitle',
    'pTitle',
    'abstract',
]

papersDF = pd.DataFrame(data=papers, columns=paperColumns)

In [13]:
# Paper subclasses:
#   - rows whose conferenceJournalId is the 'None' sentinel become 'poster'
#   - every other row gets a random pick from pTypes

pTypes = ['shortPaper','demoPaper','fullPaper']
paperTypes = [
    'poster' if venueId == 'None' else random.choice(pTypes)
    for venueId in papersDF['conferenceJournalId']
]
papersDF['pType'] = paperTypes

In [14]:
# Persist the papers table, overwriting any existing file.
papersDF.to_csv(
    'data/papers.csv',
    index=False,
    header=True,
    mode='w',
)

### Creating Reviews

In [18]:
# reviewColumns = ['rId','pId','aId','rReviewText','rDecision']
# NOTE(review): 'aId' is listed above but no such column is produced below —
# confirm whether a reviewer id is still meant to be written.

reviewCount = len(papersDF)
reviewDecisionChoices = ['accepted','rejected']

# One review per paper: r0, r1, ... paired with papersDF rows by index
ids = ['r' + str(n) for n in range(reviewCount)]

reviewsDF = pd.DataFrame(ids, columns=['rId']).assign(
    pId=papersDF['pId'],
    # Placeholder text, identical for every review
    rReviewText=['content of the reviewed text goes here....'] * reviewCount,
    # Decisions drawn with weights 0.8 / 0.2 for accepted / rejected
    rDecision=random.choices(reviewDecisionChoices, weights=[0.8,0.2], k=reviewCount),
)
reviewsDF.to_csv(
    'data/reviews.csv',
    index=False,
    header=True,
    mode='w',
)