In [1]:
import json
import pandas as pd
import random
random.seed(10)

In [2]:
with open('data/dataset.json', encoding='utf-8') as inputfile:
    df = json.load(inputfile)
inputfile.close()

### Collecting details about journals, conferences, authors, fieldsOfStudy

In [3]:
conferences = []
conferenceUrls = []

journals = []
journalUrls = []

authors = []
authorUrls = []

subjectDomains = []

for i in range(len(df)):
    details = df[i]['_data']  
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if jName not in journals:
                    journals.append(jName)
                    journalUrls.append(myId)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if cName not in conferences:
                    conferences.append(cName)
                    conferenceUrls.append(myId)
    if details['authors']:
        for i in range(len(details['authors'])):
            aName = details['authors'][i]['name']
            myId = details['authors'][i]['authorId']
            if aName not in authors:
                authors.append(aName)
                authorUrls.append(str(myId))
    if details['fieldsOfStudy']:
        for i in details['fieldsOfStudy']:
            if i not in subjectDomains:
                subjectDomains.append(i)

#### Conferences

In [4]:
# assigning conference ids
# conferenceColumns = ['cId','cTitle','cUrlId','cType','cProceedings']

ids = []
proceedings = []
for i in range(len(conferences)):
    ids.append('c'+str(i))
    
    # Creating proceedings value as a random integer between (1,10)
    proceedings.append(random.randint(1,10))
        
conferencesDF = pd.DataFrame(ids, columns = ['conferenceId'])

In [5]:
# assigning conference subclasses
# randomly choose from cTypes
cTypes = ['workshop','expertGroup','symposium','regularConference']

conferenceTypes = []
conferenceTypes = random.choices(cTypes, weights=[0.25,0.25,0.25,0.25], k=len(conferencesDF))

In [6]:
conferencesDF['conferenceUrl'] = conferenceUrls
conferencesDF['conferenceTitle'] = conferences
conferencesDF['conferenceType'] = conferenceTypes

# Creating proceedings data for conferences
conferencesDF['conferenceProceedings'] = proceedings

conferencesDF.to_csv('data/conferences.csv',index = False, header = True, mode = 'w')

#### Journals

In [7]:
# journalColumns = ['jId','jTitle','jUrlId','jVolumes']

ids = []
volumes = []
for i in range(len(journals)):
    ids.append('j'+str(i))
    
    # Creating volumes value as a random integer between (1,10)
    volumes.append(random.randint(0,10))

journalsDF = pd.DataFrame(ids, columns = ['journalId'])
journalsDF['journalUrl'] = journalUrls
journalsDF['journalTitle'] = journals

# Creating volumes data for journals
journalsDF['journalVolumes'] = volumes

journalsDF.to_csv('data/journals.csv',index = False, header = True, mode = 'w')

#### Authors

In [8]:
# authorColumns = ['aId','aName','aUrlId']

ids = []
for i in range(len(authors)):
    ids.append('a'+str(i))

authorsDF = pd.DataFrame(ids, columns = ['authorId'])
authorsDF['authorUrl'] = authorUrls
authorsDF['authorName'] = authors
authorsDF.to_csv('data/authors.csv',index = False, header = True, mode = 'w')

In [9]:
authorsDF.head()

Unnamed: 0,authorId,authorUrl,authorName
0,a0,6391996,E. Bolyen
1,a1,6307534,J. Rideout
2,a2,40950692,Matthew R. Dillon
3,a3,4230536,N. Bokulich
4,a4,5538503,C. Abnet


#### SubjectDomain

In [10]:
# subjectDomainColumns = ['eId','fName']

ids = []
for i in range(len(subjectDomains)):
    ids.append('d'+str(i))

subjectDomainDF = pd.DataFrame(ids, columns = ['subjectDomainId'])
subjectDomainDF['subjectDomainName'] = subjectDomains
subjectDomainDF.to_csv('data/subjectDomain.csv',index = False, header = True, mode = 'w')

### Collecting information about papers

In [11]:
conferencesDF = pd.read_csv('data/conferences.csv')
journalsDF = pd.read_csv('data/journals.csv')
authorsDF = pd.read_csv('data/authors.csv')

In [12]:
papers = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    
    currPaperUrlNum = 'p'+str(paperUrlNum)
    paperCurr.append(currPaperUrlNum)
    paperCurr.append(details['paperId'])
    
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                paperCurr.append(journalsDF[journalsDF['journalTitle'] == jName]['journalId'].values[0])
                paperCurr.append(jName)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                paperCurr.append(conferencesDF[conferencesDF['conferenceTitle'] == cName]['conferenceId'].values[0])
                paperCurr.append(cName)

            else:
                paperCurr.append('None')
                paperCurr.append('None')
        else:
            paperCurr.append('None')
            paperCurr.append('None')
    else:
        paperCurr.append('None') # conferenceJournalUrl
        paperCurr.append('None') # conferenceJournal
    
    paperCurr.append(details['title'])
    paperCurr.append(details['abstract'])
    paperUrlNum += 1
    papers.append(paperCurr)

In [13]:
paperColumns = ['paperId','paperUrl','conferenceJournalId','conferenceJournalTitle','paperTitle','paperAbstract']

papersDF = pd.DataFrame(papers, columns = paperColumns)

In [14]:
# Assigning paper subclasses
# Assign type poster if not a conference/journal
# Else, randomly choose from pTypes

paperTypes = []
pTypes = ['shortPaper','demoPaper','fullPaper']
for index, row in papersDF.iterrows():
    if row['conferenceJournalId'] == 'None':
        paperTypes.append('poster')
    else:
        paperTypes.append(random.choice(pTypes))
papersDF['paperType'] = paperTypes

In [15]:
papersDF['conferenceJournalId'] = papersDF.apply(lambda x: random.choice(conferencesDF.conferenceId) if x['paperType'] == 'poster' else x['conferenceJournalId'], axis = 1)
papersDF['conferenceJournalTitle'] = papersDF.apply(lambda x: conferencesDF[conferencesDF['conferenceId'] == x['conferenceJournalId']]['conferenceTitle'].item() if x['paperType'] == 'poster' else x['conferenceJournalTitle'], axis = 1)

In [16]:
# Imputing None values for abstract
papersDF['paperAbstract'] = papersDF.apply(lambda x: 'Abstract content goes here ...' if x['paperAbstract'] == None else x['paperAbstract'], axis = 1)

In [17]:
papersDF.head()

Unnamed: 0,paperId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType
0,p0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper
1,p1,fa5853fdef7d2f6bb68203d187ddacbbddc63a8b,c90,Computer Vision and Pattern Recognition,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",poster
2,p2,4c6e31458b0b44c1e8bd6e58f7d7e0767f7fde44,j1,IEEE Transactions on Knowledge and Data Engine...,CRISP-DM Twenty Years Later: From Data Mining ...,CRISP-DM(CRoss-Industry Standard Process for D...,demoPaper
3,p3,7282f5c9d84cd47c516a6a66c5a6b8f1e2cf44b6,c0,International Conference on Human Factors in C...,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,shortPaper
4,p4,3569c79cf90b203325dd7b8f6c30bacc60f5d30e,j2,SN Computer Science,Data Science and Analytics: An Overview from D...,Abstract content goes here ...,fullPaper


Imputing conferenceJournalId as randomly chosen conference from the list of conferences and corresponding conferenceJournalTitle for the paperType as poster.

In [18]:
papersDF.to_csv('data/papers.csv',index = False, header = True, mode = 'w')

### Creating authors subjectDomain relation

In [19]:
subjectDomainDF.head(2)

Unnamed: 0,subjectDomainId,subjectDomainName
0,d0,Engineering
1,d1,Medicine


In [20]:
subject_domain_ids = []
pIds = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    
    currPaperUrlNum = 'p'+str(paperUrlNum)
    
    # Adding pId to subjectDomainId
    if details['fieldsOfStudy']:
        for i in range(len(details['fieldsOfStudy'])):
            subjectDomainId = details['fieldsOfStudy'][i]
            subject_domain_ids.append(subjectDomainId)
            pIds.append(currPaperUrlNum)
    paperUrlNum += 1

subjectsPapersDF = pd.DataFrame(subject_domain_ids, columns = ['subjectDomainName'])
subjectsPapersDF['paperId'] = pIds

In [21]:
subjectsPapersDF = subjectsPapersDF.merge(
                    subjectDomainDF,
                    how = 'inner',
                    left_on = ['subjectDomainName'],
                    right_on = ['subjectDomainName']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)

In [22]:
subjectsPapersDF = subjectsPapersDF.merge(
                    papersDF,
                    how = 'inner',
                    left_on = ['paperId'],
                    right_on = ['paperId']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)
subjectsPapersDF.to_csv('data/subjectsPapers.csv',index = False, header = True, mode = 'w')

In [23]:
subjectsPapersDF.head()

Unnamed: 0,subjectDomainName,paperId,subjectDomainId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType
0,Engineering,p0,d0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper
1,Medicine,p0,d1,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper
2,Engineering,p19,d0,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c52,Workshop on Learning from Authoritative Securi...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster
3,Computer Science,p19,d3,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c52,Workshop on Learning from Authoritative Securi...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster
4,Mathematics,p19,d5,2d6adb9636df5a8a5dbcbfaecd0c4d34d7c85034,c52,Workshop on Learning from Authoritative Securi...,Spectral Methods for Data Science: A Statistic...,Spectral methods have emerged as a simple yet ...,poster


### Creating authors papers relation

In [24]:
aUrlIds = []
pIds = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    currPaperUrlNum = 'p'+str(paperUrlNum)
    
    # Appending the pId to authorsDF
    if details['authors']:
        for i in range(len(details['authors'])):
            aUrlId = details['authors'][i]['authorId']
            aUrlIds.append(str(aUrlId))
            pIds.append(currPaperUrlNum)
    paperUrlNum += 1

authorsPapersDF = pd.DataFrame(aUrlIds, columns = ['authorUrl'])
authorsPapersDF['paperId'] = pIds

In [25]:
authorsDF.head(2)

Unnamed: 0,authorId,authorUrl,authorName
0,a0,6391996,E. Bolyen
1,a1,6307534,J. Rideout


In [26]:
authorsPapersDF = authorsPapersDF.merge(
                    authorsDF,
                    how = 'inner',
                    left_on = ['authorUrl'],
                    right_on = ['authorUrl']
                    )#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)

In [27]:
authorsPapersDF = authorsPapersDF.merge(
                    papersDF,
                    how = 'inner',
                    left_on = ['paperId'],
                    right_on = ['paperId']
                    )
authorsPapersDF.head()#.drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)
authorsPapersDF.to_csv('data/authorsPapers.csv',index = False, header = True, mode = 'w')

In [28]:
authorsPapersDF.head(2)

Unnamed: 0,authorUrl,paperId,authorId,authorName,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType
0,6391996,p0,a0,E. Bolyen,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper
1,6307534,p0,a1,J. Rideout,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper


### Creating Submitted Paper Related Data

In [29]:
decisionChoices = ['accepted','rejected']

ids = []
for i in range(len(papersDF)):
    ids.append('s'+str(i))

submittedPapersDF = pd.DataFrame(ids, columns = ['submissionId'])
submittedPapersDF['paperId'] = papersDF['paperId']
submittedPapersDF['paperTitle'] = papersDF['paperTitle']
submittedPapersDF['paperAbstract'] = papersDF['paperAbstract']
submittedPapersDF['conferenceJournalId'] = papersDF['conferenceJournalId']
submittedPapersDF['conferenceJournalTitle'] = papersDF['conferenceJournalTitle']
submittedPapersDF['decision'] = random.choices(decisionChoices, weights = [0.8,0.2], k = len(submittedPapersDF))
submittedPapersDF.to_csv('data/submittedPapers.csv',index = False, header = True, mode = 'w')

In [30]:
submittedPapersDF.head()

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted
2,s2,p2,CRISP-DM Twenty Years Later: From Data Mining ...,CRISP-DM(CRoss-Industry Standard Process for D...,j1,IEEE Transactions on Knowledge and Data Engine...,accepted
3,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,rejected
4,s4,p4,Data Science and Analytics: An Overview from D...,Abstract content goes here ...,j2,SN Computer Science,accepted


### Creating Reviews Data

In [31]:
# reviewColumns = ['rId','pId','aId','rReviewText','rDecision']

ids = []
for i in range(len(submittedPapersDF)):
    ids.append('r'+str(i))

reviewsDF = pd.DataFrame(ids, columns = ['reviewId'])
reviewsDF['paperId'] = submittedPapersDF['paperId']
reviewsDF['reviewText'] = ['content of the reviewed text goes here....'] * len(reviewsDF)
reviewsDF['reviewDecision'] = submittedPapersDF['decision']
reviewsDF.to_csv('data/reviews.csv',index = False, header = True, mode = 'w')

### Creating Final Paper Related Data

In [32]:
finalPapersDF = submittedPapersDF[(submittedPapersDF['decision'] == 'accepted')]

ids = []
for i in range(len(finalPapersDF)):
    ids.append('f'+str(i))
    
finalPapersDF['finalPaperId'] = ids
# finalPapersDF.to_csv('data/finalPapers.csv',index = False, header = True, mode = 'w')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finalPapersDF['finalPaperId'] = ids


In [33]:
finalPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,decision,finalPaperId
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted,f0
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted,f1


### Creating Reviewer Data

In [34]:
all_authors = list(authorsPapersDF['authorId'].unique())
paperAuthors = authorsPapersDF.groupby('paperId', as_index = False, sort = False)['authorId'].agg(lambda x: [l for l in x])

In [35]:
authorsPapersDF.head(2)

Unnamed: 0,authorUrl,paperId,authorId,authorName,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType
0,6391996,p0,a0,E. Bolyen,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper
1,6307534,p0,a1,J. Rideout,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper


In [36]:
paperAuthors

Unnamed: 0,paperId,authorId
0,p0,"[a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, ..."
1,p13,"[a0, a1, a2, a3, a4, a5, a6, a6, a7, a8, a9, a..."
2,p2039,"[a15, a8363, a8365, a8366, a8369, a8372, a8402..."
3,p1191,"[a39, a5600, a5601, a5602]"
4,p2009,"[a76, a8168, a8169, a8170, a8171, a8172, a8173..."
...,...,...
2479,p2490,"[a10642, a10643]"
2480,p2491,"[a10644, a10645, a10646, a10647]"
2481,p2493,"[a10652, a10653, a10654, a10655, a10656, a1065..."
2482,p2494,"[a10670, a10671, a10672, a10673, a10674]"


In [37]:
totalPapers = len(submittedPapersDF)
set_authors = set(all_authors)

ids = []
submissions = []
reviewers = []

for i in range(totalPapers * 2):
    ids.append('r'+str(i))
    submissions.append(submittedPapersDF['submissionId'][i//2])
    
    curr_pId = submittedPapersDF['paperId'][i//2]
    curr_authors = paperAuthors[paperAuthors['paperId'] == curr_pId]['authorId'].tolist()
    
    availableReviewers = [x for x in set_authors if not x in curr_authors]
    reviewers.append(random.choice(availableReviewers))

In [38]:
# Reviewer (author id: aId) reviews the submission (with id: sId)
reviewersDF = pd.DataFrame(ids, columns = ['rId'])
reviewersDF['submissionId'] = submissions
reviewersDF['authorId'] = reviewers
reviewersDF.to_csv('data/reviewers.csv',index = False, header = True, mode = 'w')

In [39]:
reviewersDF.head()

Unnamed: 0,rId,submissionId,authorId
0,r0,s0,a8563
1,r1,s0,a6515
2,r2,s1,a2884
3,r3,s1,a7455
4,r4,s2,a4081


### Creating chairs data

In [40]:
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted


In [41]:
confPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'c']['conferenceJournalId'].tolist()

confPapersDF = pd.DataFrame(confPapers, columns = ['conferenceId'])

ids = []
for i in range(len(confPapersDF)):
    ids.append('chair'+str(i))
confPapersDF['chairId'] = ids

confPapersDF['authorId'] = random.choices(authorsDF['authorId'], k = len(confPapersDF))
confPapersDF.to_csv('data/chairs.csv',index = False, header = True, mode = 'w')

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [42]:
confPapersDF.head()

Unnamed: 0,conferenceId,chairId,authorId
0,c90,chair0,a5876
1,c0,chair1,a5540
2,c100,chair2,a1396
3,c51,chair3,a5545
4,c87,chair4,a2392


### Creating editors data

In [43]:
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted


In [44]:
jourPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'j']['conferenceJournalId'].tolist()

jourPapersDF = pd.DataFrame(jourPapers, columns = ['journalId'])

ids = []
for i in range(len(jourPapersDF)):
    ids.append('editor'+str(i))
jourPapersDF['editorId'] = ids

jourPapersDF['authorId'] = random.choices(authorsDF['authorId'], k = len(jourPapersDF))
jourPapersDF.to_csv('data/editors.csv',index = False, header = True, mode = 'w')

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [45]:
jourPapersDF.head()

Unnamed: 0,journalId,editorId,authorId
0,j0,editor0,a9128
1,j1,editor1,a1135
2,j2,editor2,a3505
3,j3,editor3,a10661
4,j4,editor4,a5527
