In [1]:
import json
import pandas as pd
import datetime
import random
random.seed(10)

In [2]:
with open('data/dataset.json', encoding='utf-8') as inputfile:
    df = json.load(inputfile)
inputfile.close()

### Collecting details about journals, conferences, authors, fieldsOfStudy

In [3]:
conferences = []
conferenceUrls = []

journals = []
journalUrls = []

authors = []
authorUrls = []

subjectDomains = []

for i in range(len(df)):
    details = df[i]['_data']  
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if jName not in journals:
                    journals.append(jName)
                    journalUrls.append(myId)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if cName not in conferences:
                    conferences.append(cName)
                    conferenceUrls.append(myId)
    if details['authors']:
        for i in range(len(details['authors'])):
            aName = details['authors'][i]['name']
            myId = details['authors'][i]['authorId']
            if aName not in authors:
                authors.append(aName)
                authorUrls.append(str(myId))
    if details['fieldsOfStudy']:
        for i in details['fieldsOfStudy']:
            if i not in subjectDomains:
                subjectDomains.append(i)

#### Proceedings

It is assumed that each proceeding will be unique, i.e. proceedings will be the same amount as conferences.

In [4]:
# assigning proceedings ids
proceedingChoices = []
proceedingYears = []
ids = []
for i in range(len(conferences)):
    ids.append('cp'+str(i)) # stands for conferenceProceedings
    
    # Creating proceedings name as a random integer between (1, 30)
    proceedingChoices.append('proceeding' + str(i))
    
    # Creating proceedings year as a random year between (2001, 2022)
    proceedingYears.append(random.randint(2001,2022))
        
proceedingsDF = pd.DataFrame(ids, columns = ['proceedingId'])
proceedingsDF['proceedingName'] = proceedingChoices
proceedingsDF['proceedingYear'] = proceedingYears
proceedingsDF.to_csv('data/proceedings.csv',index = False, header = True, mode = 'w')

In [5]:
proceedingsDF.head(2)

Unnamed: 0,proceedingId,proceedingName,proceedingYear
0,cp0,proceeding0,2019
1,cp1,proceeding1,2002


#### Conferences

In [6]:
# assigning conference ids
# conferenceColumns = ['cId','cTitle','cUrlId','cType','cProceedings']

ids = []
proceedings = []
for i in range(len(conferences)):
    ids.append('c'+str(i))

conferencesDF = pd.DataFrame(ids, columns = ['conferenceId'])

In [7]:
# assigning conference subclasses
# randomly choose from cTypes
cTypes = ['workshop','expertGroup','symposium','regularConference']

conferenceTypes = []
conferenceTypes = random.choices(cTypes, weights=[0.25,0.25,0.25,0.25], k=len(conferencesDF))

In [8]:
conferencesDF['conferenceUrl'] = conferenceUrls
conferencesDF['conferenceTitle'] = conferences
conferencesDF['conferenceType'] = conferenceTypes

# Creating proceedings data for conferences
conferencesDF['conferenceProceedingIds'] = proceedingsDF['proceedingId']
conferencesDF.to_csv('data/conferences.csv',index = False, header = True, mode = 'w')

#### ConferenceProceedings

In [9]:
conferencesDF.head(2)

Unnamed: 0,conferenceId,conferenceUrl,conferenceTitle,conferenceType,conferenceProceedingIds
0,c0,b55b50b1-aae7-47a7-b042-8aecc930073d,International Conference on Human Factors in C...,regularConference,cp0
1,c1,2c9ecac6-f875-4a9b-acc2-10bd9f6782df,Technical Symposium on Computer Science Education,workshop,cp1


In [10]:
proceedingsDF.head(2)

Unnamed: 0,proceedingId,proceedingName,proceedingYear
0,cp0,proceeding0,2019
1,cp1,proceeding1,2002


In [11]:
conferenceProceedingsDF = conferencesDF.merge(
    proceedingsDF,
    how = 'inner',
    left_on = ['conferenceProceedingIds'],
    right_on = ['proceedingId']
    ).drop(columns = ['proceedingId'], axis = 1)
conferenceProceedingsDF.to_csv('data/conferenceProceedings.csv',index = False, header = True, mode = 'w')

In [12]:
conferenceProceedingsDF.head(2)

Unnamed: 0,conferenceId,conferenceUrl,conferenceTitle,conferenceType,conferenceProceedingIds,proceedingName,proceedingYear
0,c0,b55b50b1-aae7-47a7-b042-8aecc930073d,International Conference on Human Factors in C...,regularConference,cp0,proceeding0,2019
1,c1,2c9ecac6-f875-4a9b-acc2-10bd9f6782df,Technical Symposium on Computer Science Education,workshop,cp1,proceeding1,2002


#### Volumes

It is assumed that each volume will be unique, i.e. volumes will be the same amount as journals.

In [13]:
# assigning proceedings ids
volumeChoices = []
volumeYears = []
ids = []
for i in range(len(journals)):
    ids.append('jv'+str(i)) # stands for journalVolume
    
    # Creating proceedings value as a random integer between (1,30)
    volumeChoices.append('journal' + str(i))
    volumeYears.append(random.randint(2001,2022))
        
volumesDF = pd.DataFrame(ids, columns = ['volumeId'])
volumesDF['volumeName'] = volumeChoices
volumesDF['volumeYear'] = volumeYears
volumesDF.to_csv('data/volumes.csv',index = False, header = True, mode = 'w')

In [14]:
volumesDF.head(2)

Unnamed: 0,volumeId,volumeName,volumeYear
0,jv0,journal0,2006
1,jv1,journal1,2020


#### Journals

In [15]:
# journalColumns = ['jId','jTitle','jUrlId','jVolumes']

ids = []
volumes = []
for i in range(len(journals)):
    ids.append('j'+str(i))

journalsDF = pd.DataFrame(ids, columns = ['journalId'])
journalsDF['journalUrl'] = journalUrls
journalsDF['journalTitle'] = journals

# Creating volumes data for journals
journalsDF['journalVolumeIds'] = volumesDF['volumeId']

journalsDF.to_csv('data/journals.csv',index = False, header = True, mode = 'w')

In [16]:
journalsDF.head(2)

Unnamed: 0,journalId,journalUrl,journalTitle,journalVolumeIds
0,j0,458166b3-de17-4bf3-bbbb-e53782de2f0f,Nature Biotechnology,jv0
1,j1,c6840156-ee10-4d78-8832-7f8909811576,IEEE Transactions on Knowledge and Data Engine...,jv1


#### JournalVolumes

In [17]:
journalsDF.head(2)

Unnamed: 0,journalId,journalUrl,journalTitle,journalVolumeIds
0,j0,458166b3-de17-4bf3-bbbb-e53782de2f0f,Nature Biotechnology,jv0
1,j1,c6840156-ee10-4d78-8832-7f8909811576,IEEE Transactions on Knowledge and Data Engine...,jv1


In [18]:
volumesDF.head(2)

Unnamed: 0,volumeId,volumeName,volumeYear
0,jv0,journal0,2006
1,jv1,journal1,2020


In [19]:
journalVolumesDF = journalsDF.merge(
    volumesDF,
    how = 'inner',
    left_on = ['journalVolumeIds'],
    right_on = ['volumeId']
    ).drop(columns = ['volumeId'], axis = 1)
journalVolumesDF.to_csv('data/journalVolumes.csv',index = False, header = True, mode = 'w')

In [20]:
journalVolumesDF.head(2)

Unnamed: 0,journalId,journalUrl,journalTitle,journalVolumeIds,volumeName,volumeYear
0,j0,458166b3-de17-4bf3-bbbb-e53782de2f0f,Nature Biotechnology,jv0,journal0,2006
1,j1,c6840156-ee10-4d78-8832-7f8909811576,IEEE Transactions on Knowledge and Data Engine...,jv1,journal1,2020


#### Authors

In [21]:
# authorColumns = ['aId','aName','aUrlId']

ids = []
for i in range(len(authors)):
    ids.append('a'+str(i))

authorsDF = pd.DataFrame(ids, columns = ['authorId'])
authorsDF['authorUrl'] = authorUrls
authorsDF['authorName'] = authors
authorsDF.to_csv('data/authors.csv',index = False, header = True, mode = 'w')

In [22]:
authorsDF.head()

Unnamed: 0,authorId,authorUrl,authorName
0,a0,6391996,E. Bolyen
1,a1,6307534,J. Rideout
2,a2,40950692,Matthew R. Dillon
3,a3,4230536,N. Bokulich
4,a4,5538503,C. Abnet


#### SubjectDomain

In [23]:
# subjectDomainColumns = ['eId','fName']

ids = []
for i in range(len(subjectDomains)):
    ids.append('d'+str(i))

subjectDomainDF = pd.DataFrame(ids, columns = ['subjectDomainId'])
subjectDomainDF['subjectDomainName'] = subjectDomains
subjectDomainDF.to_csv('data/subjectDomain.csv',index = False, header = True, mode = 'w')

### Collecting information about papers

In [24]:
papers = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    
    currPaperUrlNum = 'p'+str(paperUrlNum)
    paperCurr.append(currPaperUrlNum)
    paperCurr.append(details['paperId'])
    
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                paperCurr.append(journalsDF[journalsDF['journalTitle'] == jName]['journalId'].values[0])
                paperCurr.append(jName)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                paperCurr.append(conferencesDF[conferencesDF['conferenceTitle'] == cName]['conferenceId'].values[0])
                paperCurr.append(cName)

            else:
                paperCurr.append('None')
                paperCurr.append('None')
        else:
            paperCurr.append('None')
            paperCurr.append('None')
    else:
        paperCurr.append('None') # conferenceJournalUrl
        paperCurr.append('None') # conferenceJournal
    
    paperCurr.append(details['title'])
    paperCurr.append(details['abstract'])
    paperUrlNum += 1
    papers.append(paperCurr)

In [25]:
paperColumns = ['paperId','paperUrl','conferenceJournalId','conferenceJournalTitle','paperTitle','paperAbstract']

papersDF = pd.DataFrame(papers, columns = paperColumns)

In [26]:
# Assigning paper subclasses
# Assign type poster if not a conference/journal
# Else, randomly choose from pTypes

paperTypes = []
pTypes = ['shortPaper','demoPaper','fullPaper']
for index, row in papersDF.iterrows():
    if row['conferenceJournalId'] == 'None':
        paperTypes.append('poster')
    else:
        paperTypes.append(random.choice(pTypes))
papersDF['paperType'] = paperTypes

In [27]:
papersDF.head(1)

Unnamed: 0,paperId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType
0,p0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",,shortPaper


In [28]:
papersDF['conferenceJournalId'] = papersDF.apply(lambda x: random.choice(conferencesDF.conferenceId) if x['paperType'] == 'poster' else x['conferenceJournalId'], axis = 1)
papersDF['conferenceJournalTitle'] = papersDF.apply(lambda x: conferencesDF[conferencesDF['conferenceId'] == x['conferenceJournalId']]['conferenceTitle'].item() if x['paperType'] == 'poster' else x['conferenceJournalTitle'], axis = 1)

In [29]:
procVols = []
for k in range(len(papersDF['paperId'])):
    currConfJourId = papersDF['conferenceJournalId'][k]
    if currConfJourId[0] == 'c':
        getProcVol = conferencesDF[conferencesDF['conferenceId'] == currConfJourId]['conferenceProceedingIds'].item()
    else:
        getProcVol = journalsDF[journalsDF['journalId'] == currConfJourId]['journalVolumeIds'].item()
    procVols.append(getProcVol)
papersDF['proceedingsVolumeIds'] = procVols

In [30]:
papersDF.head(2)

Unnamed: 0,paperId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,p0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",,shortPaper,jv0
1,p1,fa5853fdef7d2f6bb68203d187ddacbbddc63a8b,c14,International Conference on Exploring Services...,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",poster,cp14


In [31]:
# Imputing None values for abstract
papersDF['paperAbstract'] = papersDF.apply(lambda x: 'Abstract content goes here ...' if x['paperAbstract'] == None else x['paperAbstract'], axis = 1)

Imputing conferenceJournalId as randomly chosen conference from the list of conferences and corresponding conferenceJournalTitle for the paperType as poster.

In [32]:
papersDF.to_csv('data/papers.csv',index = False, header = True, mode = 'w')

### Creating authors subjectDomain relation

In [33]:
subjectDomainDF.head(2)

Unnamed: 0,subjectDomainId,subjectDomainName
0,d0,Engineering
1,d1,Medicine


In [34]:
subject_domain_ids = []
pIds = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    
    currPaperUrlNum = 'p'+str(paperUrlNum)
    
    # Adding pId to subjectDomainId
    if details['fieldsOfStudy']:
        for i in range(len(details['fieldsOfStudy'])):
            subjectDomainId = details['fieldsOfStudy'][i]
            subject_domain_ids.append(subjectDomainId)
            pIds.append(currPaperUrlNum)
    paperUrlNum += 1

subjectsPapersDF = pd.DataFrame(subject_domain_ids, columns = ['subjectDomainName'])
subjectsPapersDF['paperId'] = pIds

In [35]:
subjectsPapersDF = subjectsPapersDF.merge(
                    subjectDomainDF,
                    how = 'inner',
                    left_on = ['subjectDomainName'],
                    right_on = ['subjectDomainName']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)

In [36]:
subjectsPapersDF = subjectsPapersDF.merge(
                    papersDF,
                    how = 'inner',
                    left_on = ['paperId'],
                    right_on = ['paperId']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)
subjectsPapersDF.to_csv('data/subjectsPapers.csv',index = False, header = True, mode = 'w')

In [37]:
subjectsPapersDF.head()

Unnamed: 0,subjectDomainName,paperId,subjectDomainId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,Engineering,p0,d0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
1,Medicine,p0,d1,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
2,Engineering,p19,d0,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c70,International Conference on Intelligent Roboti...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster,cp70
3,Computer Science,p19,d3,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c70,International Conference on Intelligent Roboti...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster,cp70
4,Mathematics,p19,d5,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c70,International Conference on Intelligent Roboti...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster,cp70


### Creating proceedings - subjectDomain and volumes - subjectDomain relation

In [38]:
subjectsPapersDF.head(1)

Unnamed: 0,subjectDomainName,paperId,subjectDomainId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,Engineering,p0,d0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0


In [39]:
subjectsConferencesJournalsDF = subjectsPapersDF[['conferenceJournalId','proceedingsVolumeIds','subjectDomainId']]
# subjectsPapersDF.groupby(['proceedingsVolumeIds'], as_index = False, sort = False)['subjectDomainId'].agg(lambda x: [l for l in x])
# subjectsConferencesJournalsDF['subjectDomainName'] = subjectsConferencesJournalsDF['subjectDomainName'].apply(lambda x: list(set(x)))

In [40]:
subjectsConferencesJournalsDF.head()

Unnamed: 0,conferenceJournalId,proceedingsVolumeIds,subjectDomainId
0,j0,jv0,d0
1,j0,jv0,d1
2,c70,cp70,d0
3,c70,cp70,d3
4,c70,cp70,d5


In [41]:
subjectsConferencesJournalsDF.to_csv('data/subjectsProceedingsVolumes.csv',index = False, header = True, mode = 'w')

### Creating authors papers relation

In [42]:
aUrlIds = []
pIds = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    currPaperUrlNum = 'p'+str(paperUrlNum)
    
    # Appending the pId to authorsDF
    if details['authors']:
        for i in range(len(details['authors'])):
            aUrlId = details['authors'][i]['authorId']
            aUrlIds.append(str(aUrlId))
            pIds.append(currPaperUrlNum)
    paperUrlNum += 1

authorsPapersDF = pd.DataFrame(aUrlIds, columns = ['authorUrl'])
authorsPapersDF['paperId'] = pIds

In [43]:
authorsDF.head(2)

Unnamed: 0,authorId,authorUrl,authorName
0,a0,6391996,E. Bolyen
1,a1,6307534,J. Rideout


In [44]:
authorsPapersDF = authorsPapersDF.merge(
                    authorsDF,
                    how = 'inner',
                    left_on = ['authorUrl'],
                    right_on = ['authorUrl']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)

In [45]:
authorsPapersDF = authorsPapersDF.merge(
                    papersDF,
                    how = 'inner',
                    left_on = ['paperId'],
                    right_on = ['paperId']
                    )
authorsPapersDF.head()#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)
authorsPapersDF.to_csv('data/authorsPapers.csv',index = False, header = True, mode = 'w')

In [46]:
authorsPapersDF.head(2)

Unnamed: 0,authorUrl,paperId,authorId,authorName,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,6391996,p0,a0,E. Bolyen,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
1,6307534,p0,a1,J. Rideout,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0


### Creating Submitted Paper Related Data

In [47]:
decisionChoices = ['accepted','rejected']

ids = []
for i in range(len(papersDF)):
    ids.append('s'+str(i))

submittedPapersDF = pd.DataFrame(ids, columns = ['submissionId'])
submittedPapersDF['paperId'] = papersDF['paperId']
submittedPapersDF['paperTitle'] = papersDF['paperTitle']
submittedPapersDF['paperAbstract'] = papersDF['paperAbstract']
submittedPapersDF['conferenceJournalId'] = papersDF['conferenceJournalId']
submittedPapersDF['conferenceJournalTitle'] = papersDF['conferenceJournalTitle']
submittedPapersDF['proceedingsVolume'] = papersDF['proceedingsVolumeIds']
submittedPapersDF['decision'] = random.choices(decisionChoices, weights = [0.8,0.2], k = len(submittedPapersDF))
submittedPapersDF.to_csv('data/submittedPapers.csv',index = False, header = True, mode = 'w')

In [48]:
submittedPapersDF.head()

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted
2,s2,p2,CRISP-DM Twenty Years Later: From Data Mining ...,CRISP-DM(CRoss-Industry Standard Process for D...,j1,IEEE Transactions on Knowledge and Data Engine...,jv1,rejected
3,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,cp0,accepted
4,s4,p4,Data Science and Analytics: An Overview from D...,Abstract content goes here ...,j2,SN Computer Science,jv2,accepted


### Creating Reviews Data

In [49]:
# reviewColumns = ['rId','pId','aId','rReviewText','rDecision']

ids = []
for i in range(len(submittedPapersDF)):
    ids.append('r'+str(i))

reviewsDF = pd.DataFrame(ids, columns = ['reviewId'])
reviewsDF['submittedPaperId'] = submittedPapersDF['submissionId']
reviewsDF['reviewText'] = ['content of the reviewed text goes here....'] * len(reviewsDF)
reviewsDF['reviewDecision'] = submittedPapersDF['decision']
reviewsDF['reviewDecisionBoolean'] = reviewsDF.apply(lambda x: 1 if x['reviewDecision'] == 'accepted' else 0, axis = 1)
reviewsDF['submittedPaperTitle'] = submittedPapersDF['paperTitle']
reviewsDF.to_csv('data/reviews.csv',index = False, header = True, mode = 'w')

In [50]:
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


### Creating Final Paper Related Data

In [51]:
finalPapersDF = submittedPapersDF[(submittedPapersDF['decision'] == 'accepted')].reset_index().drop(['index'],axis = 1)

ids = []
for i in range(len(finalPapersDF)):
    ids.append('f'+str(i))
    
finalPapersDF['finalPaperId'] = ids
finalPapersDF['proceedingVolumeYear'] = finalPapersDF.apply(lambda x: proceedingsDF[proceedingsDF['proceedingId'] == x['proceedingsVolume']]['proceedingYear'].item() if x['proceedingsVolume'][0] == 'c' else volumesDF[volumesDF['volumeId'] == x['proceedingsVolume']]['volumeYear'].item(),axis = 1)

In [52]:
# It is assumed that all the proceedings/volumes are between 2001 and 2022
# And all the publications are accepted on a random date in their publicationYear

acceptanceDates = []
for i in range(len(finalPapersDF['paperId'])):  
    year = finalPapersDF['proceedingVolumeYear'][i]
    
    # Generate a random acceptance date in the year of publication
    start_date = datetime.date(year, 1, 1)
    end_date = datetime.date(year, 12, 31)
    
    num_days = (end_date - start_date).days
    rand_days = random.randint(1, num_days)
    random_date = start_date + datetime.timedelta(days=rand_days)
    acceptanceDates.append(random_date)

finalPapersDF['acceptanceDate'] = acceptanceDates 

In [53]:
finalPapersDF.to_csv('data/finalPapers.csv',index = False, header = True, mode = 'w')

In [54]:
finalPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision,finalPaperId,proceedingVolumeYear,acceptanceDate
0,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted,f0,2016,2016-05-08
1,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,cp0,accepted,f1,2019,2019-10-29


### Creating Reviewer Data

In [55]:
all_authors = list(authorsPapersDF['authorId'].unique())
paperAuthors = authorsPapersDF.groupby('paperId', as_index = False, sort = False)['authorId'].agg(lambda x: [l for l in x])

In [56]:
authorsPapersDF.head(2)

Unnamed: 0,authorUrl,paperId,authorId,authorName,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,6391996,p0,a0,E. Bolyen,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
1,6307534,p0,a1,J. Rideout,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0


In [57]:
paperAuthors

Unnamed: 0,paperId,authorId
0,p0,"[a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, ..."
1,p13,"[a0, a1, a2, a3, a4, a5, a6, a6, a7, a8, a9, a..."
2,p2039,"[a15, a8363, a8365, a8366, a8369, a8372, a8402..."
3,p1191,"[a39, a5600, a5601, a5602]"
4,p2009,"[a76, a8168, a8169, a8170, a8171, a8172, a8173..."
...,...,...
2479,p2490,"[a10642, a10643]"
2480,p2491,"[a10644, a10645, a10646, a10647]"
2481,p2493,"[a10652, a10653, a10654, a10655, a10656, a1065..."
2482,p2494,"[a10670, a10671, a10672, a10673, a10674]"


In [58]:
totalPapers = len(submittedPapersDF)
set_authors = set(all_authors)

ids = []
submissions = []
reviewers = []

for i in range(totalPapers * 2):
    ids.append('r'+str(i))
    submissions.append(submittedPapersDF['submissionId'][i//2])
    
    curr_pId = submittedPapersDF['paperId'][i//2]
    curr_authors = paperAuthors[paperAuthors['paperId'] == curr_pId]['authorId'].tolist()
    
    availableReviewers = [x for x in set_authors if not x in curr_authors]
    reviewers.append(random.choice(availableReviewers))

In [59]:
# Reviewer (author id: aId) reviews the submission (with id: sId)
reviewersDF = pd.DataFrame(ids, columns = ['rId'])
reviewersDF['submissionId'] = submissions
reviewersDF['authorId'] = reviewers

In [60]:
reviewersDF = reviewersDF.merge(
            authorsDF,
            how = 'inner',
            left_on = ['authorId'],
            right_on = ['authorId']
            )

In [61]:
reviewersDF = reviewersDF.merge(
            submittedPapersDF,
            how = 'inner',
            left_on = ['submissionId'],
            right_on = ['submissionId']
            )

In [62]:
reviewersDF.to_csv('data/reviewers.csv',index = False, header = True, mode = 'w')

In [63]:
reviewersDF.head(3)

Unnamed: 0,rId,submissionId,authorId,authorUrl,authorName,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,r0,s0,a10520,50222659,Jonathan Logan,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,r1,s0,a4075,1746021,E. Yu,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
2,r2,s1,a1025,3033014,N. Zumel,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


### Creating chairs data

In [64]:
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


In [65]:
confPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'c']['conferenceJournalId'].tolist()

confPapersDF = pd.DataFrame(confPapers, columns = ['conferenceId'])

ids = []
for i in range(len(confPapersDF)):
    ids.append('chair'+str(i))
confPapersDF['chairId'] = ids

confPapersDF['authorId'] = random.choices(authorsDF['authorId'], k = len(confPapersDF))

In [66]:
confPapersDF.head(2)

Unnamed: 0,conferenceId,chairId,authorId
0,c14,chair0,a8258
1,c0,chair1,a9757


It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [67]:
confPapersDF.to_csv('data/chairs.csv',index = False, header = True, mode = 'w')

This implies that for author with x id chairs the conference y with c chairId or author x handles the conference y.

### Creating editors data

In [68]:
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


In [69]:
jourPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'j']['conferenceJournalId'].tolist()

jourPapersDF = pd.DataFrame(jourPapers, columns = ['journalId'])

ids = []
for i in range(len(jourPapersDF)):
    ids.append('editor'+str(i))
jourPapersDF['editorId'] = ids

jourPapersDF['authorId'] = random.choices(authorsDF['authorId'], k = len(jourPapersDF))
jourPapersDF.to_csv('data/editors.csv',index = False, header = True, mode = 'w')

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [70]:
jourPapersDF.head()

Unnamed: 0,journalId,editorId,authorId
0,j0,editor0,a6023
1,j1,editor1,a6532
2,j2,editor2,a6892
3,j3,editor3,a2485
4,j4,editor4,a10028


This implies that for author with x id is the editor of the journal y with e editorId or author x handles the journal y.

In [71]:
jourPapersDF.to_csv('data/editors.csv',index = False, header = True, mode = 'w')

### HandlerReviewer Data

In [72]:
jourPapersDF.tail(2)

Unnamed: 0,journalId,editorId,authorId
821,j383,editor821,a6247
822,j175,editor822,a5962


In [73]:
reviewersDF.head(2)

Unnamed: 0,rId,submissionId,authorId,authorUrl,authorName,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,r0,s0,a10520,50222659,Jonathan Logan,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,r1,s0,a4075,1746021,E. Yu,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected


In [107]:
handlerReviewersDF1 = pd.DataFrame(confPapersDF[['chairId']].values, columns = ['temp'])
handlerReviewersDF1['confJour'] = confPapersDF['conferenceId']

handlerReviewersDF2 = pd.DataFrame(jourPapersDF[['editorId']].values, columns = ['temp'])
handlerReviewersDF2['confJour'] = jourPapersDF['journalId']

handlerReviewersDF = pd.concat([handlerReviewersDF1,handlerReviewersDF2]).rename({'temp':'handlerId'}, axis = 1)

In [108]:
handlerReviewersDF.tail(2)

Unnamed: 0,handlerId,confJour
821,editor821,j383
822,editor822,j175


In [109]:
reviewersDF.head(2)

Unnamed: 0,rId,submissionId,authorId,authorUrl,authorName,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,r0,s0,a10520,50222659,Jonathan Logan,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,r1,s0,a4075,1746021,E. Yu,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected


In [110]:
handlerReviewersDF = handlerReviewersDF.merge(
    right = reviewersDF,
    how = 'inner',
    left_on = 'confJour',
    right_on = 'conferenceJournalId'
)[['handlerId','conferenceJournalId','authorId']]

In [111]:
handlerReviewersDF.head()

Unnamed: 0,handlerId,conferenceJournalId,authorId
0,chair0,c14,a1025
1,chair0,c14,a352
2,chair0,c14,a8417
3,chair0,c14,a7172
4,chair0,c14,a1777


In [112]:
handlerReviewersDF.to_csv('data/handlerReviewers.csv',index = False, header = True, mode = 'w')