In [2]:
import json
import pandas as pd
import random
random.seed(10)

In [3]:
with open('data/dataset.json', encoding='utf-8') as inputfile:
    df = json.load(inputfile)
inputfile.close()

### Collecting details about journals, conferences, authors, fieldsOfStudy

In [29]:
conferences = []
conferenceUrls = []

journals = []
journalUrls = []

authors = []
authorUrls = []

fieldsOfStudy = []

for i in range(len(df)):
    details = df[i]['_data']  
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if jName not in journals:
                    journals.append(jName)
                    journalUrls.append(myId)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                myId = details['publicationVenue']['id']
                if cName not in conferences:
                    conferences.append(cName)
                    conferenceUrls.append(myId)
    if details['authors']:
        for i in range(len(details['authors'])):
            aName = details['authors'][i]['name']
            myId = details['authors'][i]['authorId']
            if aName not in authors:
                authors.append(aName)
                authorUrls.append(myId)
    if details['fieldsOfStudy']:
        for i in details['fieldsOfStudy']:
            if i not in fieldsOfStudy:
                fieldsOfStudy.append(i)

#### Conferences

In [5]:
# assigning conference ids
# conferenceColumns = ['cId','cTitle','cUrlId','cType','cProceedings']

ids = []
proceedings = []
for i in range(len(conferences)):
    ids.append('c'+str(i))
    
    # Creating proceedings value as a random integer between (1,10)
    proceedings.append(random.randint(1,10))
        
conferencesDF = pd.DataFrame(ids, columns = ['cId'])

In [6]:
# assigning conference subclasses
# randomly choose from cTypes
cTypes = ['workshop','expertGroup','symposium','regularConference']

conferenceTypes = []
conferenceTypes = random.choices(cTypes, weights=[0.25,0.25,0.25,0.25], k=len(conferencesDF))

In [7]:
conferencesDF['cUrlId'] = conferenceUrls
conferencesDF['cTitle'] = conferences
conferencesDF['cType'] = conferenceTypes

# Creating proceedings data for conferences
conferencesDF['cProceedings'] = proceedings

conferencesDF.to_csv('data/conferences.csv',index = False, header = True, mode = 'w')

#### Journals

In [8]:
# journalColumns = ['jId','jTitle','jUrlId','jVolumes']

ids = []
volumes = []
for i in range(len(journals)):
    ids.append('j'+str(i))
    
    # Creating volumes value as a random integer between (1,10)
    volumes.append(random.randint(0,10))

journalsDF = pd.DataFrame(ids, columns = ['jId'])
journalsDF['jUrlId'] = journalUrls
journalsDF['jTitle'] = journals

# Creating volumes data for journals
journalsDF['jVolumes'] = volumes

journalsDF.to_csv('data/journals.csv',index = False, header = True, mode = 'w')

#### Authors

In [39]:
# authorColumns = ['aId','aName','aUrlId']

ids = []
for i in range(len(authors)):
    ids.append('a'+str(i))

authorsDF = pd.DataFrame(ids, columns = ['aId'])
authorsDF['aUrlId'] = authorUrls
authorsDF['aName'] = authors
authorsDF.to_csv('data/authors.csv',index = False, header = True, mode = 'w')

In [40]:
authorsDF.head()

Unnamed: 0,aId,aUrlId,aName
0,a0,6391996,E. Bolyen
1,a1,6307534,J. Rideout
2,a2,40950692,Matthew R. Dillon
3,a3,4230536,N. Bokulich
4,a4,5538503,C. Abnet


#### FieldsOfStudy

In [10]:
# fieldsOfStudyColumns = ['fId','fName']

ids = []
for i in range(len(fieldsOfStudy)):
    ids.append('f'+str(i))

fieldsOfStudyDF = pd.DataFrame(ids, columns = ['fId'])
fieldsOfStudyDF['fName'] = fieldsOfStudy
fieldsOfStudyDF.to_csv('data/fieldsOfStudy.csv',index = False, header = True, mode = 'w')

### Collecting information about papers

In [11]:
conferencesDF = pd.read_csv('data/conferences.csv')
journalsDF = pd.read_csv('data/journals.csv')
authorsDF = pd.read_csv('data/authors.csv')

In [67]:
papers = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    
    currPaperUrlNum = 'p'+str(paperUrlNum)
    paperCurr.append(currPaperUrlNum)
    paperCurr.append(details['paperId'])
    
    if details['publicationVenue']:
        if 'type' in details['publicationVenue']:
            if details['publicationVenue']['type'] == 'journal':
                jName = details['publicationVenue']['name']
                paperCurr.append(journalsDF[journalsDF['jTitle'] == jName]['jId'].values[0])
                paperCurr.append(jName)
            elif details['publicationVenue']['type'] == 'conference':
                cName = details['publicationVenue']['name']
                paperCurr.append(conferencesDF[conferencesDF['cTitle'] == cName]['cId'].values[0])
                paperCurr.append(cName)

            else:
                paperCurr.append('None')
                paperCurr.append('None')
        else:
            paperCurr.append('None')
            paperCurr.append('None')
    else:
        paperCurr.append('None') # conferenceJournalUrl
        paperCurr.append('None') # conferenceJournal
    
    paperCurr.append(details['title'])
    paperCurr.append(details['abstract'])
    paperUrlNum += 1
    papers.append(paperCurr)

In [68]:
paperColumns = ['pId','paperUrlId','conferenceJournalId','conferenceJournalTitle','pTitle','abstract']

papersDF = pd.DataFrame(papers, columns = paperColumns)

In [69]:
# Assigning paper subclasses
# Assign type poster if not a conference/journal
# Else, randomly choose from pTypes

paperTypes = []
pTypes = ['shortPaper','demoPaper','fullPaper']
for index, row in papersDF.iterrows():
    if row['conferenceJournalId'] == 'None':
        paperTypes.append('poster')
    else:
        paperTypes.append(random.choice(pTypes))
papersDF['pType'] = paperTypes

In [70]:
papersDF['conferenceJournalId'] = papersDF.apply(lambda x: random.choice(conferencesDF.cId) if x['pType'] == 'poster' else x['conferenceJournalId'], axis = 1)
papersDF['conferenceJournalTitle'] = papersDF.apply(lambda x: conferencesDF[conferencesDF['cId'] == x['conferenceJournalId']]['cTitle'].item() if x['pType'] == 'poster' else x['conferenceJournalTitle'], axis = 1)

In [71]:
# Imputing None values for abstract
papersDF['abstract'] = papersDF.apply(lambda x: 'Abstract content goes here ...' if x['abstract'] == None else x['abstract'], axis = 1)

In [72]:
papersDF.head()

Unnamed: 0,pId,paperUrlId,conferenceJournalId,conferenceJournalTitle,pTitle,abstract,pType
0,p0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,fullPaper
1,p1,fa5853fdef7d2f6bb68203d187ddacbbddc63a8b,c54,International Workshop on Agent-Oriented Softw...,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",poster
2,p2,4c6e31458b0b44c1e8bd6e58f7d7e0767f7fde44,j1,IEEE Transactions on Knowledge and Data Engine...,CRISP-DM Twenty Years Later: From Data Mining ...,CRISP-DM(CRoss-Industry Standard Process for D...,fullPaper
3,p3,7282f5c9d84cd47c516a6a66c5a6b8f1e2cf44b6,c0,International Conference on Human Factors in C...,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,fullPaper
4,p4,3569c79cf90b203325dd7b8f6c30bacc60f5d30e,j2,SN Computer Science,Data Science and Analytics: An Overview from D...,Abstract content goes here ...,fullPaper


Imputing conferenceJournalId as randomly chosen conference from the list of conferences and corresponding conferenceJournalTitle for the paperType as poster.

In [18]:
papersDF.to_csv('data/papers.csv',index = False, header = True, mode = 'w')

### Creating authors papers relation

In [109]:
aUrlIds = []
pIds = []

paperUrlNum = 0

for i in range(len(df)):
    paperCurr = [] 
    details = df[i]['_data']
    currPaperUrlNum = 'p'+str(paperUrlNum)
    
    # Appending the pId to authorsDF
    if details['authors']:
        for i in range(len(details['authors'])):
            aUrlId = details['authors'][i]['authorId']
            aUrlIds.append(aUrlId)
            pIds.append(currPaperUrlNum)
    paperUrlNum += 1

authorsPapersDF = pd.DataFrame(aUrlIds, columns = ['aUrlId'])
authorsPapersDF['pId'] = pIds

In [110]:
authorsPapersDF = authorsPapersDF.merge(
                    authorsDF,
                    how = 'inner',
                    left_on = ['aUrlId'],
                    right_on = ['aUrlId']
                    ).drop(columns = ['pId_y'], axis = 1).rename({'pId_x':'pId'},axis = 1)
authorsPapersDF.to_csv('data/authorsPapers.csv',index = False, header = True, mode = 'w')

In [111]:
authorsPapersDF.head()

Unnamed: 0,aUrlId,pId,aId,aName
0,6391996,p0,a0,E. Bolyen
1,6391996,p13,a0,E. Bolyen
2,6307534,p0,a1,J. Rideout
3,6307534,p13,a1,J. Rideout
4,40950692,p0,a2,Matthew R. Dillon


### Creating Submitted Paper Related Data

In [19]:
decisionChoices = ['accepted','rejected']

ids = []
for i in range(len(papersDF)):
    ids.append('s'+str(i))

submittedPapersDF = pd.DataFrame(ids, columns = ['sId'])
submittedPapersDF['pId'] = papersDF['pId']
submittedPapersDF['pTitle'] = papersDF['pTitle']
submittedPapersDF['abstract'] = papersDF['abstract']
submittedPapersDF['conferenceJournalId'] = papersDF['conferenceJournalId']
submittedPapersDF['conferenceJournalTitle'] = papersDF['conferenceJournalTitle']
submittedPapersDF['decision'] = random.choices(decisionChoices, weights = [0.8,0.2], k = len(submittedPapersDF))
submittedPapersDF.to_csv('data/submittedPapers.csv',index = False, header = True, mode = 'w')

In [20]:
submittedPapersDF.head()

Unnamed: 0,sId,pId,pTitle,abstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted
2,s2,p2,CRISP-DM Twenty Years Later: From Data Mining ...,CRISP-DM(CRoss-Industry Standard Process for D...,j1,IEEE Transactions on Knowledge and Data Engine...,accepted
3,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,rejected
4,s4,p4,Data Science and Analytics: An Overview from D...,Abstract content goes here ...,j2,SN Computer Science,accepted


### Creating Reviews Data

In [21]:
# reviewColumns = ['rId','pId','aId','rReviewText','rDecision']

ids = []
for i in range(len(submittedPapersDF)):
    ids.append('r'+str(i))

reviewsDF = pd.DataFrame(ids, columns = ['rId'])
reviewsDF['pId'] = submittedPapersDF['pId']
reviewsDF['rReviewText'] = ['content of the reviewed text goes here....'] * len(reviewsDF)
reviewsDF['rDecision'] = submittedPapersDF['decision']
reviewsDF.to_csv('data/reviews.csv',index = False, header = True, mode = 'w')

### Creating Final Paper Related Data

In [22]:
finalPapersDF = submittedPapersDF[(submittedPapersDF['decision'] == 'accepted')]

ids = []
for i in range(len(finalPapersDF)):
    ids.append('f'+str(i))
    
finalPapersDF['fId'] = ids
finalPapersDF.to_csv('data/finalPapers.csv',index = False, header = True, mode = 'w')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finalPapersDF['fId'] = ids


### Creating Reviewer Data

In [150]:
all_authors = list(authorsPapersDF['aId'].unique())
paperAuthors = authorsPapersDF.groupby('pId', as_index = False, sort = False)['aId'].agg(lambda x: [l for l in x])

In [151]:
paperAuthors

Unnamed: 0,pId,aId
0,p0,"[a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, ..."
1,p13,"[a0, a1, a2, a3, a4, a5, a6, a6, a7, a8, a9, a..."
2,p2039,"[a15, a8363, a8365, a8366, a8369, a8372, a8402..."
3,p1191,"[a39, a5600, a5601, a5602]"
4,p2009,"[a76, a8168, a8169, a8170, a8171, a8172, a8173..."
...,...,...
2479,p2490,"[a10642, a10643]"
2480,p2491,"[a10644, a10645, a10646, a10647]"
2481,p2493,"[a10652, a10653, a10654, a10655, a10656, a1065..."
2482,p2494,"[a10670, a10671, a10672, a10673, a10674]"


In [175]:
totalPapers = len(submittedPapersDF)
set_authors = set(all_authors)

ids = []
submissions = []
reviewers = []

for i in range(totalPapers * 2):
    ids.append('r'+str(i))
    submissions.append(submittedPapersDF['sId'][i//2])
    
    curr_pId = submittedPapersDF['pId'][i//2]
    curr_authors = paperAuthors[paperAuthors['pId'] == curr_pId]['aId'].tolist()
    
    availableReviewers = [x for x in set_authors if not x in curr_authors]
    reviewers.append(random.choice(availableReviewers))

In [176]:
# Reviewer (author id: aId) reviews the submission (with id: sId)
reviewersDF = pd.DataFrame(ids, columns = ['rId'])
reviewersDF['sId'] = submissions
reviewersDF['aId'] = reviewers
reviewersDF.to_csv('data/reviewers.csv',index = False, header = True, mode = 'w')

In [177]:
reviewersDF.head()

Unnamed: 0,rId,sId,aId
0,r0,s0,a6427
1,r1,s0,a8163
2,r2,s1,a9920
3,r3,s1,a10597
4,r4,s2,a2067


### Creating chairs data

In [185]:
submittedPapersDF.head(2)

Unnamed: 0,sId,pId,pTitle,abstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted


In [242]:
confPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'c']['conferenceJournalId'].tolist()

confPapersDF = pd.DataFrame(confPapers, columns = ['cId'])

ids = []
for i in range(len(confPapersDF)):
    ids.append('chair'+str(i))
confPapersDF['chairId'] = ids

confPapersDF['aId'] = random.choices(authorsDF['aId'], k = len(confPapersDF))
confPapersDF.to_csv('data/chairs.csv',index = False, header = True, mode = 'w')

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [243]:
confPapersDF.head()

Unnamed: 0,cId,chairId,aId
0,c90,chair0,a4577
1,c0,chair1,a13
2,c100,chair2,a575
3,c51,chair3,a4262
4,c87,chair4,a9943


### Creating editors data

In [244]:
submittedPapersDF.head(2)

Unnamed: 0,sId,pId,pTitle,abstract,conferenceJournalId,conferenceJournalTitle,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,accepted
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c90,Computer Vision and Pattern Recognition,accepted


In [247]:
jourPapers = submittedPapersDF[submittedPapersDF['conferenceJournalId'].str[:1] == 'j']['conferenceJournalId'].tolist()

jourPapersDF = pd.DataFrame(jourPapers, columns = ['jId'])

ids = []
for i in range(len(jourPapersDF)):
    ids.append('editor'+str(i))
jourPapersDF['editorId'] = ids

jourPapersDF['aId'] = random.choices(authorsDF['aId'], k = len(jourPapersDF))
jourPapersDF.to_csv('data/editors.csv',index = False, header = True, mode = 'w')

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [248]:
jourPapersDF.head()

Unnamed: 0,jId,editorId,aId
0,j0,editor0,a4208
1,j1,editor1,a9095
2,j2,editor2,a4718
3,j3,editor3,a8799
4,j4,editor4,a3447
