In [1]:
# Importing necessary libraries and modules
import pandas as pd
import random
from rdflib.namespace import RDF, RDFS, FOAF, XSD, URIRef
from rdflib import Graph
import pandas as pd
from rdflib import Namespace
from rdflib import Literal

In [2]:
# Load every CSV extract used by this notebook, grouped by the entity it describes.

# People and their roles
authorsDF = pd.read_csv('data/authors.csv')
chairsDF = pd.read_csv('data/chairs.csv')
editorsDF = pd.read_csv('data/editors.csv')
reviewersDF = pd.read_csv('data/reviewers.csv')
handlerReviewersDF = pd.read_csv('data/handlerReviewers.csv')

# Papers and their life cycle (authored -> submitted -> final)
papersDF = pd.read_csv('data/papers.csv')
authorsPapersDF = pd.read_csv('data/authorsPapers.csv')
submittedPapersDF = pd.read_csv('data/submittedPapers.csv')
finalPapersDF = pd.read_csv('data/finalPapers.csv')
reviewsDF = pd.read_csv('data/reviews.csv')

# Venues and their publications
conferencesDF = pd.read_csv('data/conferences.csv')
proceedingsDF = pd.read_csv('data/proceedings.csv')
conferenceProceedingsDF = pd.read_csv('data/conferenceProceedings.csv')
journalsDF = pd.read_csv('data/journals.csv')
volumesDF = pd.read_csv('data/volumes.csv')
journalVolumesDF = pd.read_csv('data/journalVolumes.csv')

# Subject domains and their links to papers / proceedings / volumes
subjectDomainDF = pd.read_csv('data/subjectDomain.csv')
subjectsPapersDF = pd.read_csv('data/subjectsPapers.csv')
subjectsProceedingsVolumesDF = pd.read_csv('data/subjectsProceedingsVolumes.csv')

In [3]:
# Creating the empty rdflib graph; every later cell adds its triples to `g`,
# and the final cell serializes it.
g = Graph()

In [4]:
# Create many URIRefs in the same namespace, i.e. URIs with the same prefix.
# Later cells use it both as LAB.<ClassName> and as LAB + <id string>.
LAB = Namespace("http://SDM_LAB3.org/")

In [5]:
# Register the 'lab' prefix for the LAB namespace on the graph.
g.bind(prefix='lab', namespace=LAB)

#### Author -- [writes] --> Paper

In [6]:
# Preview the first two rows; the next cell reads authorId, paperId and paperType.
authorsPapersDF.head(2)

Unnamed: 0,authorUrl,paperId,authorId,authorName,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,6391996,p0,a0,E. Bolyen,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
1,6307534,p0,a1,J. Rideout,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0


In [7]:
# Typing authors and the papers they wrote with their TBox classes.
# Schema-level triple, kept disabled:
# g.add((LAB.Author, LAB.writes, LAB.Paper))

# paperType value -> TBox class; rows with any other value get no paper type triple.
paper_class_by_type = {
    "shortPaper": LAB.ShortPaper,
    "demoPaper": LAB.DemoPaper,
    "fullPaper": LAB.FullPaper,
    "poster": LAB.Poster,
}

for author_id, paper_id, paper_type in zip(
    authorsPapersDF['authorId'],
    authorsPapersDF['paperId'],
    authorsPapersDF['paperType'],
):
    g.add((URIRef(LAB + author_id), RDF.type, LAB.Author))
    paper_class = paper_class_by_type.get(paper_type)
    if paper_class is not None:
        g.add((URIRef(LAB + paper_id), RDF.type, paper_class))

#### Paper -- [submitted] --> SubmittedPaper

In [8]:
# Preview the first two rows; the next cell reads the paperId column.
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


In [9]:
# Typing every submitted paper id as both SubmittedPaper and Paper.
# Schema-level triple, kept disabled:
# g.add((LAB.Paper, LAB.submitted, LAB.SubmittedPaper))
for paper_id in submittedPapersDF['paperId']:
    paper_uri = URIRef(LAB + paper_id)
    for paper_class in (LAB.SubmittedPaper, LAB.Paper):
        g.add((paper_uri, RDF.type, paper_class))

#### Paper -- [paperRelatedTo] --> SubjectDomain

In [10]:
# Preview the first two rows; the next cell reads the subjectDomainId column.
subjectsPapersDF.head(2)

Unnamed: 0,subjectDomainName,paperId,subjectDomainId,paperUrl,conferenceJournalId,conferenceJournalTitle,paperTitle,paperAbstract,paperType,proceedingsVolumeIds
0,Engineering,p0,d0,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0
1,Medicine,p0,d1,0c2d3b28d48426b8b72f7214a7708ba8b4efa9d6,j0,Nature Biotechnology,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,shortPaper,jv0


In [11]:
# Typing the subject domains referenced by papers as SubjectDomain.
# Schema-level triple, kept disabled:
# g.add((LAB.Paper, LAB.paperRelatedTo, LAB.SubjectDomain))
#
# The id must be prefixed with LAB like every other instance in this notebook;
# without it URIRef('d0') is a relative IRI outside the lab namespace and does
# not match the LAB + subjectDomainId nodes typed from subjectDomainDF.
# Iterating over unique ids avoids re-adding the same triple for every paper row.
for domain_id in subjectsPapersDF['subjectDomainId'].unique():
    g.add((URIRef(LAB + domain_id), RDF.type, LAB.SubjectDomain))

#### Conference -- [isIn] --> Proceedings

In [12]:
# Preview the first two rows; the next cell reads conferenceId, conferenceType
# and conferenceProceedingIds.
conferenceProceedingsDF.head(2)

Unnamed: 0,conferenceId,conferenceUrl,conferenceTitle,conferenceType,conferenceProceedingIds,proceedingName,proceedingYear
0,c0,b55b50b1-aae7-47a7-b042-8aecc930073d,International Conference on Human Factors in C...,regularConference,cp0,proceeding0,2019
1,c1,2c9ecac6-f875-4a9b-acc2-10bd9f6782df,Technical Symposium on Computer Science Education,workshop,cp1,proceeding1,2002


In [13]:
# Typing each conference with its specific conference class and each of its
# proceedings as Proceedings.
# Schema-level triple, kept disabled:
# g.add((LAB.Conference, LAB.isIn, LAB.Proceedings))

# conferenceType value -> TBox class; any other value adds no conference type triple.
conference_class_by_type = {
    'regularConference': LAB.RegularConference,
    'workshop': LAB.Workshop,
    'expertGroup': LAB.ExpertGroup,
    'symposium': LAB.Symposium,
}

for conference_id, conference_type, proceedings_id in zip(
    conferenceProceedingsDF['conferenceId'],
    conferenceProceedingsDF['conferenceType'],
    conferenceProceedingsDF['conferenceProceedingIds'],
):
    conference_class = conference_class_by_type.get(conference_type)
    if conference_class is not None:
        g.add((URIRef(LAB + conference_id), RDF.type, conference_class))
    # The proceedings triple is added for every row, whatever the conference type.
    g.add((URIRef(LAB + proceedings_id), RDF.type, LAB.Proceedings))

#### Journal -- [isOf] --> Volume

In [14]:
# Preview the first two rows; the next cell reads journalId and journalVolumeIds.
journalVolumesDF.head(2)

Unnamed: 0,journalId,journalUrl,journalTitle,journalVolumeIds,volumeName,volumeYear
0,j0,458166b3-de17-4bf3-bbbb-e53782de2f0f,Nature Biotechnology,jv0,journal0,2006
1,j1,c6840156-ee10-4d78-8832-7f8909811576,IEEE Transactions on Knowledge and Data Engine...,jv1,journal1,2020


In [15]:
# Typing each journal as Journal and each of its volumes as Volume.
# Schema-level triple, kept disabled:
# g.add((LAB.Journal, LAB.isOf, LAB.Volume))
for journal_id, volume_id in zip(
    journalVolumesDF['journalId'], journalVolumesDF['journalVolumeIds']
):
    g.add((URIRef(LAB + journal_id), RDF.type, LAB.Journal))
    g.add((URIRef(LAB + volume_id), RDF.type, LAB.Volume))

#### SubmittedPaper -- [isSubmittedTo] --> Venue

In [16]:
# Preview the submissions table again; the next cell reads the paperId column.
submittedPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,s0,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted


In [17]:
# Typing every submitted paper id as SubmittedPaper (the venue ids are not
# touched by this cell).
# Schema-level triple, kept disabled:
# g.add((LAB.SubmittedPaper, LAB.isSubmittedTo, LAB.Venue))
for paper_id in submittedPapersDF['paperId']:
    g.add((URIRef(LAB + paper_id), RDF.type, LAB.SubmittedPaper))

#### SubmittedPaper -- [isAs] --> FinalPaper

In [18]:
# Preview the first two rows; the next cell reads the finalPaperId column.
finalPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision,finalPaperId,proceedingVolumeYear,acceptanceDate
0,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted,f0,2016,2016-05-08
1,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,cp0,accepted,f1,2019,2019-10-29


In [19]:
# Typing every final paper id as FinalPaper.
# Schema-level triple, kept disabled:
# g.add((LAB.SubmittedPaper, LAB.isAs, LAB.FinalPaper))
for final_paper_id in finalPapersDF['finalPaperId']:
    g.add((URIRef(LAB + final_paper_id), RDF.type, LAB.FinalPaper))

### FinalPaper -- [isPublishedInConference] --> Proceedings & FinalPaper -- [isPublishedInJournal] --> Volume

In [20]:
# Preview the final papers again; the typing cell below reads conferenceJournalId
# and proceedingsVolume.
finalPapersDF.head(2)

Unnamed: 0,submissionId,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision,finalPaperId,proceedingVolumeYear,acceptanceDate
0,s1,p1,High-Dimensional Probability: An Introduction ...,"© 2018, Cambridge University Press Let us summ...",c14,International Conference on Exploring Services...,cp14,accepted,f0,2016,2016-05-08
1,s3,p3,AutoDS: Towards Human-Centered Automation of D...,Data science (DS) projects often follow a life...,c0,International Conference on Human Factors in C...,cp0,accepted,f1,2019,2019-10-29


In [21]:
# Preview the first two rows of the conferences table.
conferencesDF.head(2)

Unnamed: 0,conferenceId,conferenceUrl,conferenceTitle,conferenceType,conferenceProceedingIds
0,c0,b55b50b1-aae7-47a7-b042-8aecc930073d,International Conference on Human Factors in C...,regularConference,cp0
1,c1,2c9ecac6-f875-4a9b-acc2-10bd9f6782df,Technical Symposium on Computer Science Education,workshop,cp1


In [22]:
# Typing the container each final paper was published in: Proceedings when the
# venue id starts with 'c', Volume otherwise.
# Schema-level triples, kept disabled:
# g.add((LAB.FinalPaper, LAB.isPublishedInConference, LAB.Proceedings))
# g.add((LAB.FinalPaper, LAB.isPublishedInJournal, LAB.Volume))
for venue_id, container_id in zip(
    finalPapersDF['conferenceJournalId'], finalPapersDF['proceedingsVolume']
):
    container_class = LAB.Proceedings if venue_id[0] == 'c' else LAB.Volume
    g.add((URIRef(LAB + container_id), RDF.type, container_class))

#### Proceedings -- [proceedingsRelatedTo] --> SubjectDomain & Volumes -- [volumeRelatedTo] --> SubjectDomain

In [23]:
# Preview the first two rows; the next cell reads the proceedingsVolumeIds column.
subjectsProceedingsVolumesDF.head(2)

Unnamed: 0,conferenceJournalId,proceedingsVolumeIds,subjectDomainId
0,j0,jv0,d0
1,j0,jv0,d1


In [24]:
# Typing every proceedings/volume that is linked to a subject domain:
# ids starting with 'c' become Proceedings, all others become Volume.
# Schema-level triples, kept disabled:
# g.add((LAB.Proceedings, LAB.proceedingRelatedTo, LAB.SubjectDomain))
# g.add((LAB.Volume, LAB.volumeRelatedTo, LAB.SubjectDomain))
#
# The subjectDomainId column is not needed here, so the per-row variable that
# used to hold it (and was never read) has been dropped.
for container_id in subjectsProceedingsVolumesDF['proceedingsVolumeIds']:
    container_class = LAB.Proceedings if container_id[0] == 'c' else LAB.Volume
    g.add((URIRef(LAB + container_id), RDF.type, container_class))

#### Creating Chair -- [handleConference] --> Conference & Editor -- [handleJournal] --> Journal

In [25]:
# Preview the first two rows; the next cell reads authorId and conferenceId.
chairsDF.head(2)

Unnamed: 0,conferenceId,chairId,authorId
0,c14,chair0,a8258
1,c0,chair1,a9757


In [26]:
# Typing each chair (identified by its authorId) as Chair and the conference
# it handles as Conference.
# Schema-level triple, kept disabled:
# g.add((LAB.Chair, LAB.handleConference, LAB.Conference))
for chair_author_id, conference_id in zip(
    chairsDF['authorId'], chairsDF['conferenceId']
):
    g.add((URIRef(LAB + chair_author_id), RDF.type, LAB.Chair))
    g.add((URIRef(LAB + conference_id), RDF.type, LAB.Conference))

In [27]:
# Preview the first two rows; the next cell reads authorId and journalId.
editorsDF.head(2)

Unnamed: 0,journalId,editorId,authorId
0,j0,editor0,a6023
1,j1,editor1,a6532


In [28]:
# Typing each editor (identified by its authorId) as Editor and the journal
# it handles as Journal.
# Schema-level triple, kept disabled like in every other cell of this notebook;
# leaving it active would put a class-to-class statement into the instance graph:
# g.add((LAB.Editor, LAB.handleJournal, LAB.Journal))
for editor_author_id, journal_id in zip(
    editorsDF['authorId'], editorsDF['journalId']
):
    g.add((URIRef(LAB + editor_author_id), RDF.type, LAB.Editor))
    g.add((URIRef(LAB + journal_id), RDF.type, LAB.Journal))

#### Creating Handler -- [assigns] --> Reviewer: #TODO:

In [29]:
# Preview the first two rows; the next cell reads the authorId column.
handlerReviewersDF.head(2)

Unnamed: 0,handlerId,conferenceJournalId,authorId
0,chair0,c14,a3224
1,chair0,c14,a7848


In [30]:
# Typing every assigned reviewer (identified by its authorId) as Reviewer.
# The handlerId column is not used by this cell.
# Schema-level triple, kept disabled:
# g.add((LAB.Handler,LAB.assigns,LAB.Reviewer))
for reviewer_author_id in handlerReviewersDF['authorId']:
    g.add((URIRef(LAB + reviewer_author_id), RDF.type, LAB.Reviewer))

### Creating Reviewer -- [reviewed] --> ReviewText

In [31]:
# Preview the first two rows of the reviewers table to see its id columns.
reviewersDF.head(2)

Unnamed: 0,rId,submissionId,authorId,authorUrl,authorName,paperId,paperTitle,paperAbstract,conferenceJournalId,conferenceJournalTitle,proceedingsVolume,decision
0,r0,s0,a1576,49051967,Gabriella M. Harari,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected
1,r1,s0,a1989,74025114,S. Managau,p0,"Reproducible, interactive, scalable and extens...",Abstract content goes here ...,j0,Nature Biotechnology,jv0,rejected


In [32]:
# Typing every review as ReviewText.
# Schema-level triple, kept disabled:
# g.add((LAB.Reviewer, LAB.reviewed, LAB.ReviewText))
#
# The review is identified by rId. Using conferenceJournalId here would type the
# journals/conferences themselves (e.g. 'j0', 'c14') as ReviewText.
for review_id in reviewersDF['rId']:
    g.add((URIRef(LAB + review_id), RDF.type, LAB.ReviewText))

#### Creating ReviewText -- [isReviewOf] --> SubmittedPaper

In [33]:
# Preview the first two rows; the next cell reads the submittedPaperId column.
reviewsDF.head(2)

Unnamed: 0,reviewId,submittedPaperId,reviewText,reviewDecision,reviewDecisionBoolean,submittedPaperTitle
0,r0,s0,content of the reviewed text goes here....,rejected,0,"Reproducible, interactive, scalable and extens..."
1,r1,s1,content of the reviewed text goes here....,accepted,1,High-Dimensional Probability: An Introduction ...


In [34]:
# Typing the submission referenced by each review as SubmittedPaper.
# NOTE(review): the previewed submittedPaperId values look like 's0', while other
# cells type 'p…' paper ids as SubmittedPaper — confirm which id is intended.
# Schema-level triple, kept disabled:
# g.add((LAB.ReviewText, LAB.isReviewOf, LAB.SubmittedPaper))
for submission_id in reviewsDF['submittedPaperId']:
    g.add((URIRef(LAB + submission_id), RDF.type, LAB.SubmittedPaper))

#### Paper -- [paperTitle] --> String 

In [35]:
# Connecting Paper -- [paperTitle] --> String instances to the tbox
# g.add((LAB.Paper, LAB.paperTitle, XSD.string))
# for k in range(len(papersDF['paperId'])):
#     g.add((URIRef(LAB+papersDF['paperId'][k]), LAB.paperTitle, Literal(papersDF['paperTitle'][k])))

#### Paper -- [paperAbstract] --> String

In [36]:
# Connecting Paper -- [paperAbstract] --> String instances to the tbox
# g.add((LAB.Paper, LAB.paperAbstract, XSD.string))
# for k in range(len(papersDF['paperId'])):
#     g.add((URIRef(LAB+papersDF['paperId'][k]), LAB.paperAbstract, Literal(papersDF['paperAbstract'][k])))

#### Author -- [authorName] --> String 

In [37]:
# Connecting Author -- [authorName] --> String instances to the tbox
# g.add((LAB.Author, LAB.authorName, XSD.string))
# for k in range(len(authorsDF['authorId'])):
#     g.add((URIRef(LAB+authorsDF['authorId'][k]), LAB.authorName, Literal(authorsDF['authorName'][k])))

#### Journal -- [journalTitle] --> String

In [38]:
# Connecting Journal -- [journalTitle] --> String instances to the tbox
# g.add((LAB.Journal, LAB.journalTitle, XSD.string))
# for k in range(len(journalsDF['journalId'])):
#     g.add((URIRef(LAB+journalsDF['journalId'][k]), LAB.journalTitle, Literal(journalsDF['journalTitle'][k])))
    # g.add((Literal(journalsDF['journalTitle'][k]), RDF.type, LAB.journalTitle))

#### Conference -- [conferenceTitle] --> String 

In [39]:
# Connecting Conference -- [conferenceTitle] --> String instances to the tbox
# g.add((LAB.Conference, LAB.conferenceTitle, XSD.string))
# for k in range(len(conferencesDF['conferenceId'])):
#     g.add((URIRef(LAB+conferencesDF['conferenceId'][k]), LAB.conferenceTitle, Literal(conferencesDF['conferenceTitle'][k])))

#### Proceedings -- [proceedingName] --> String 

In [40]:
# Connecting Proceedings -- [proceedingName] --> String instances to the tbox
# g.add((LAB.Proceedings, LAB.proceedingName, XSD.string))
# for k in range(len(proceedingsDF['proceedingId'])):
#     g.add((URIRef(LAB+proceedingsDF['proceedingId'][k]), LAB.proceedingName, Literal(proceedingsDF['proceedingName'][k])))

#### Proceedings -- [proceedingYear] --> int

In [41]:
# Connecting Proceedings -- [proceedingYear] --> int instances to the tbox
# g.add((LAB.Proceedings, LAB.proceedingYear, XSD.int))
# for k in range(len(proceedingsDF['proceedingId'])):
#     g.add((URIRef(LAB+proceedingsDF['proceedingId'][k]), LAB.proceedingYear, Literal(proceedingsDF['proceedingYear'][k])))

#### Volume -- [volumeName] --> String 

In [42]:
# Connecting Volume -- [volumeName] --> String instances to the tbox
# g.add((LAB.Volume, LAB.volumeName, XSD.string))
# for k in range(len(volumesDF['volumeId'])):
#     g.add((URIRef(LAB+volumesDF['volumeId'][k]), LAB.volumeName, Literal(volumesDF['volumeName'][k])))

#### Volume -- [volumeYear] --> int

In [43]:
# Connecting Volume -- [volumeYear] --> int instances to the tbox
# g.add((LAB.Volume, LAB.volumeYear, XSD.int))
# for k in range(len(volumesDF['volumeId'])):
#     g.add((URIRef(LAB+volumesDF['volumeId'][k]), LAB.volumeYear, Literal(volumesDF['volumeYear'][k])))

#### SubjectDomain -- [keywords] --> String

In [44]:
# Typing every subject domain id as SubjectDomain.
# Schema-level triple, kept disabled:
# g.add((LAB.SubjectDomain, LAB.keywords, XSD.string))
for domain_id in subjectDomainDF['subjectDomainId']:
    g.add((URIRef(LAB + domain_id), RDF.type, LAB.SubjectDomain))

#### ReviewText -- [comment] --> String

In [45]:
# Connecting ReviewText -- [comment] --> String instances to the tbox
# g.add((LAB.ReviewText, LAB.comment, XSD.string))
# for k in range(len(reviewsDF['reviewId'])):
#   g.add((URIRef(LAB+reviewsDF['reviewId'][k]), LAB.comment, Literal(reviewsDF['reviewText'][k])))

#### ReviewText -- [decision] --> boolean 

In [46]:
# Connecting ReviewText -- [decision] --> boolean instances to the tbox, wherein 1 - Accepted, 0 - Rejected
# g.add((LAB.ReviewText, LAB.decision, XSD.boolean))
# for k in range(len(reviewsDF['reviewId'])):
#     g.add((URIRef(LAB+reviewsDF['reviewId'][k]), LAB.decision, Literal(reviewsDF['reviewDecisionBoolean'][k])))

#### FinalPaper -- [paperAcceptanceDate] --> date

In [47]:
# Connecting FinalPaper -- [paperAcceptanceDate] --> date instances to the tbox
# g.add((LAB.FinalPaper, LAB.paperAcceptanceDate, XSD.date))
# for k in range(len(finalPapersDF['paperId'])):
#     g.add((URIRef(LAB+finalPapersDF['paperId'][k]), LAB.paperAcceptanceDate, Literal(finalPapersDF['acceptanceDate'][k])))

#### Creating output

In [48]:
# Write the accumulated type triples to disk in the 'ttl' format.
g.serialize(destination='data/tbox_abox_connection.ttl', format='ttl')

<Graph identifier=N39af188f08504a988ecb275d35c34fab (<class 'rdflib.graph.Graph'>)>