In [69]:
from rdflib import Graph, Namespace, URIRef #import already in RDFlib integrated namespaces 
import pandas as pd #for handling csv and csv contents       
from owlrl import DeductiveClosure, RDFS_Semantics
import hashlib


In [70]:
# Start from the TBox (ontology) so the instance triples added later share one graph.
g = Graph()
g.parse("output_files/ontology.ttl", format="turtle")

# Namespaces used for every predicate (DBO) and every resource (DBR) in this notebook.
DBO = Namespace("http://dbpedia.org/ontology/")
DBR = Namespace("http://dbpedia.org/resource/")

# Register short prefixes so serializations use dbo:/dbr: instead of full IRIs.
for prefix, namespace in (("dbo", DBO), ("dbr", DBR)):
    g.bind(prefix, namespace)

print(f"Total triples: {len(g)}")


Total triples: 80


In [71]:
def consistent_hash(value):
    """Return a deterministic integer identifier for ``value``.

    The value is converted with ``str()``, encoded with the default codec of
    ``str.encode`` (UTF-8) and hashed with SHA-256; the 256-bit digest is
    returned as a non-negative Python ``int``. Unlike the builtin ``hash``,
    the result does not change between interpreter runs.
    """
    digest = hashlib.sha256(str(value).encode()).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big")

In [72]:

# ConferenceEdition -is_edition_of_conf-> Conference
# WorkshopEdition -is_edition_of_workshop-> Workshop
path_ed = 'data/is_edition_of_relationships.csv'
path2 = 'data/conferences.csv'
df_ed = pd.read_csv(path_ed, sep=",")
df2 = pd.read_csv(path2, sep=",")
# Join on confID so each edition row also carries its event's 'type'.
df = pd.merge(df_ed, df2, on='confID')

# The event type decides which predicate links an edition to its event.
edition_predicates = {
    'conference': DBO.is_edition_of_conf,
    'workshop': DBO.is_edition_of_workshop,
}
for _, edition in df.iterrows():
    predicate = edition_predicates.get(edition['type'])
    if predicate is None:
        # Rows with any other type are reported and produce no triple.
        print("Unknown type:", edition['type'])
        continue
    edition_uri = URIRef(DBR + str(edition['editionID']))
    event_uri = URIRef(DBR + str(edition['confID']))
    g.add((edition_uri, predicate, event_uri))

# ConferenceChair -organizes_conference-> Conference
# ConferenceChair -organizes_workshop-> Workshop
# JournalEditor -edits-> Journal
# Coordinators are paired with venues purely by row position: coordinator row i
# organizes event row i of conferences.csv (df2); once the events are used up,
# the remaining coordinators edit the journals of journals.csv in file order.
coordinators = pd.read_csv('data/coordinators.csv', sep=",")
journals = pd.read_csv('data/journals.csv', sep=",")
event_count = len(df2)
# Venue key (confID, or hashed journalID) -> coordinator id, both as strings.
# Deliberately not named `dict`: that would shadow the builtin for every cell
# executed afterwards in the same kernel.
coordinator_by_venue = {}
for index, row in coordinators.iterrows():
    coordinator_id = str(row['id'])
    if index < event_count:
        event = df2.loc[index]
        venue_key = str(event['confID'])
        # Any event type other than 'conference' is treated as a workshop here.
        if event['type'] == 'conference':
            predicate = DBO.organizes_conference
        else:
            predicate = DBO.organizes_workshop
    else:
        journal = journals.loc[index - event_count]
        venue_key = str(consistent_hash(journal['journalID']))
        predicate = DBO.edits
    g.add((URIRef(DBR + coordinator_id), predicate, URIRef(DBR + venue_key)))
    coordinator_by_venue[venue_key] = coordinator_id


# Paper -published_in-> JournalVolume
path_pub = 'data/published_in_relationships.csv'
df_pub = pd.read_csv(path_pub, sep=",")
for index, row in df_pub.iterrows():
    # Spaces are stripped from the volume id so it can be used inside a URI.
    volume_uri = URIRef(DBR + str(row['volumeID']).replace(" ", ""))
    g.add((URIRef(DBR + str(row['paperID'])), DBO.published_in, volume_uri))


# JournalVolume -is_volume_of-> Journal
path_vol = 'data/is_volume_of_relationships.csv'
df_vol = pd.read_csv(path_vol, sep=",")
for index, row in df_vol.iterrows():
    volume_uri = URIRef(DBR + str(row['volumeID']).replace(" ", ""))
    journal_uri = URIRef(DBR + str(consistent_hash(row['journalID'])))
    g.add((volume_uri, DBO.is_volume_of, journal_uri))


# Paper -presentedAt-> EventEdition
path_pres = 'data/presented_in_relationships.csv'
df_pres = pd.read_csv(path_pres, sep=",")
for index, row in df_pres.iterrows():
    g.add((URIRef(DBR + str(row['paperID'])), DBO.presentedAt, URIRef(DBR + str(row['editionID']))))

# Review -review_written_by-> Reviewer
# Review -corresponds_to_paper-> Paper
# Review -is_assigned_by-> Coordinator
# TODO: check whether is_assigned_by could be inferred by reasoning instead of
# being materialised here.
reviews = pd.read_csv("data/evolved_reviews_relationships.csv", sep=",")

# Resolve every paper to its venue: journal papers via volume -> journal,
# event papers via edition -> conference/workshop.
df_merged1 = pd.merge(df_pub, df_vol, on='volumeID')
df_merged2 = pd.merge(df_pres, df_ed, on='editionID')
union_df = pd.concat([df_merged1, df_merged2], ignore_index=True)

review_venues = pd.merge(reviews, union_df, on='paperID')
review_venues['hashed_journalID'] = review_venues['journalID'].apply(consistent_hash)
# Prefer confID; fall back to the hashed journal id where confID is missing.
# NOTE(review): the resulting keys only match coordinator_by_venue if confID /
# journalID keep the same string form they had when the mapping was built
# (e.g. not turned into floats by the concat) - confirm against the CSVs.
review_venues['combined'] = review_venues['confID'].combine_first(review_venues['hashed_journalID'])

# Build the review-content -> venue lookup once instead of filtering the whole
# merged frame for every review. setdefault keeps the first occurrence, which
# is the row the previous per-review filter selected.
venue_by_content = {}
for content, venue in zip(review_venues['content'], review_venues['combined']):
    venue_by_content.setdefault(content, venue)

for index, row in reviews.iterrows():
    # Raises KeyError if the review's paper has no known venue or the venue has
    # no coordinator; such rows would otherwise yield an incomplete review.
    coordinator = coordinator_by_venue[str(venue_by_content[row['content']])]
    # A review is identified by the hash of its text, computed once per row.
    review_uri = URIRef(DBR + str(consistent_hash(row['content'])))
    g.add((review_uri, DBO.review_written_by, URIRef(DBR + str(row['authorID']))))
    g.add((review_uri, DBO.corresponds_to_paper, URIRef(DBR + str(row['paperID']))))
    g.add((review_uri, DBO.is_assigned_by, URIRef(DBR + coordinator)))
     

##############################################################

# Paper -is_written-> Author
# Paper -is_corresponding_author-> Author
path = 'data/wrote_relationships.csv'
df = pd.read_csv(path, sep=",")
for _, authorship in df.iterrows():
    is_corresponding = authorship['corresponding']
    # Explicit comparisons are intentional: a value that is neither True nor
    # False (e.g. a missing flag) produces no triple at all.
    if is_corresponding == True:
        predicate = DBO.is_corresponding_author
    elif is_corresponding == False:
        predicate = DBO.is_written
    else:
        continue
    paper_uri = URIRef(DBR + str(authorship['paperID']))
    author_uri = URIRef(DBR + str(authorship['authorID']))
    g.add((paper_uri, predicate, author_uri))

# Paper -cites-> Paper
path = 'data/citation_relationships.csv'
df = pd.read_csv(path, sep=",")
for _, citation in df.iterrows():
    citing_uri = URIRef(DBR + str(citation['citingPaperID']))
    cited_uri = URIRef(DBR + str(citation['citedPaperID']))
    g.add((citing_uri, DBO.cites, cited_uri))

# Paper -is_about-> Keyword
path = 'data/keyword_relationships.csv'
df = pd.read_csv(path, sep=",")
for _, tagging in df.iterrows():
    paper_uri = URIRef(DBR + str(tagging['paperID']))
    # Spaces are removed from the keyword before it becomes part of the URI.
    keyword_uri = URIRef(DBR + str(tagging['keywords']).replace(" ", ""))
    g.add((paper_uri, DBO.is_about, keyword_uri))

# EventEdition -held_in_city-> City
# EventEdition -held_in_year-> Year
path = 'data/editions.csv'
df = pd.read_csv(path, sep=",")
for _, edition in df.iterrows():
    edition_uri = URIRef(DBR + str(edition['editionID']))
    # NOTE(review): unlike keywords and volume ids, city names are used as-is;
    # confirm they never contain spaces.
    g.add((edition_uri, DBO.held_in_city, URIRef(DBR + str(edition['city']))))
    g.add((edition_uri, DBO.held_in_year, URIRef(DBR + str(edition['year']))))

# JournalVolume -of_year-> Year
path = 'data/volumes.csv'
df = pd.read_csv(path, sep=",")
for _, volume in df.iterrows():
    volume_uri = URIRef(DBR + str(volume['volumeID']).replace(" ", ""))
    g.add((volume_uri, DBO.of_year, URIRef(DBR + str(volume['year']))))


In [13]:
# Write the ABox to disk once, then show only a short preview. Printing the
# full serialization floods the cell output and serializes the graph twice.
ABOX_PATH = "output_files/abox.ttl"
ABOX_PREVIEW_LINES = 40  # number of Turtle lines echoed into the notebook

g.serialize(destination=ABOX_PATH, format="turtle")

# Read the preview back from the file just written instead of re-serializing.
with open(ABOX_PATH, encoding="utf-8") as abox_file:
    for line_number, line in enumerate(abox_file):
        if line_number >= ABOX_PREVIEW_LINES:
            print("...")
            break
        print(line, end="")

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

dbo:Author a rdfs:Class ;
    rdfs:subClassOf dbo:Person .

dbo:City a rdfs:Class .

dbo:Conference a rdfs:Class .

dbo:ConferenceChair a rdfs:Class ;
    rdfs:subClassOf dbo:Coordinator .

dbo:ConferenceEdition a rdfs:Class ;
    rdfs:subClassOf dbo:EventEdition .

dbo:Coordinator a rdfs:Class ;
    rdfs:subClassOf dbo:Person .

dbo:EventEdition a rdfs:Class .

dbo:Journal a rdfs:Class .

dbo:JournalEditor a rdfs:Class ;
    rdfs:subClassOf dbo:Coordinator .

dbo:JournalVolume a rdfs:Class .

dbo:Keyword a rdfs:Class .

dbo:Paper a rdfs:Class .

dbo:Person a rdfs:Class .

dbo:Review a rdfs:Class .

dbo:Reviewer a rdfs:Class ;
    rdfs:subClassOf dbo:Author .

dbo:Workshop a rdfs:Class .

dbo:WorkshopEdition a rdfs:Class ;
    rdfs:subClassOf dbo:EventEdition .

dbo:Year a rdfs:C

In [73]:
# Report the graph size, materialise the RDFS closure in place, report again.
triples_before = len(g)
print(f"Total triples before inference: {triples_before}")

# expand() adds the inferred triples directly to g.
rdfs_closure = DeductiveClosure(RDFS_Semantics)
rdfs_closure.expand(g)

triples_after = len(g)
print(f"Total triples after inference: {triples_after}")

Total triples before inference: 129275
Total triples after inference: 240270


In [74]:
import networkx as nx
import re

def extract_nodes(s):
    """Return the trailing segment of ``s`` after its last ``/`` or ``#``.

    Returns ``None`` when there is no such segment, i.e. when ``s`` is empty
    or ends with ``/`` or ``#``.
    """
    tail = re.search(r'[^/#]+$', s)
    if tail is None:
        return None
    return str(tail.group(0))

def extract_edge(s):
    """Return a display label for the predicate URI ``s``.

    The label is the segment after the last ``/`` or ``#``. The local name
    ``type`` is rendered as ``rdf:type``; ``range``, ``domain``,
    ``subClassOf`` and ``subPropertyOf`` get an ``rdfs:`` prefix. The mapping
    looks only at the local name, not at the namespace it came from.

    Returns ``None`` when ``s`` has no trailing segment (it is empty or ends
    with ``/`` or ``#``).
    """
    match = re.search(r'[^/#]+$', s)
    # Check the match before using it: calling .group() on a failed match
    # raises AttributeError instead of yielding the documented None.
    if match is None:
        return None
    label = str(match.group(0))
    if label == 'type':
        return 'rdf:type'
    if label in ('range', 'domain', 'subClassOf', 'subPropertyOf'):
        return 'rdfs:' + label
    return label
    

# Mirror the RDF graph as a networkx graph whose nodes are the local names of
# subjects/objects and whose edge label is the shortened predicate name.
# NOTE(review): a DiGraph stores a single edge per (source, target) pair, so
# when two triples share subject and object the later label replaces the
# earlier one - confirm that this is acceptable for the exported view.
G = nx.DiGraph()

for subject, predicate, obj in g:
    source = extract_nodes(subject)
    target = extract_nodes(obj)
    G.add_edge(source, target, label=extract_edge(predicate))

nx.write_graphml(G, "output_files/inference.graphml")