In [None]:
from rdflib import Graph, Namespace, URIRef,  Literal#import already in RDFlib integrated namespaces 
import pandas as pd #for handling csv and csv contents       
from owlrl import DeductiveClosure, RDFS_Semantics
import hashlib
from rdflib.namespace import RDF, RDFS, XSD

In [None]:
g = Graph()
g.parse("output_files/ontology.ttl", format="turtle")
DBO = Namespace("http://sdm_upc.org/ontology/")
DBR = Namespace("http://sdm_upc.org/resource/")


g.bind("dbo", DBO)
g.bind("dbr", DBR)

print(f"Total triples before inference: {len(g)}")
# Apply RDFS reasoning
DeductiveClosure(RDFS_Semantics).expand(g)

print(f"Total triples after inference: {len(g)}")
nodes = set(g.subjects()) | set(g.predicates()) | set(g.objects())
print("Number of unique nodes (including predicates):", len(nodes))


Total triples before inference: 80
Total triples after inference: 185
Number of unique nodes (including predicates): 44


In [75]:
def consistent_hash(value):
    return int(hashlib.sha256(str(value).encode()).hexdigest(), 16)

In [None]:
# ConferenceEdition -is_edition_of_conf-> Conference
path='data/is_edition_of_relationships.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['editionID'])), DBO.is_edition_of_conf, URIRef(DBR+str(row['confID']))))



# ConferenceChair -organizes_conference-> Conference
# JournalEditor -edits-> Journal
path='data/coordinators.csv'
df=pd.read_csv(path,sep=",")
path1='data/journals.csv'
df1=pd.read_csv(path1,sep=",")
path2='data/conferences.csv'
df2=pd.read_csv(path2,sep=",")
dict = {}
for index, row in df.iterrows():
    if index < len(df2):
        g.add((URIRef(DBR+str(row['id'])), DBO.organizes_conference, URIRef(DBR+str(df2.loc[index]['confID']))))
        dict[str(df2.loc[index]['confID'])] = str(row['id'])
    else:
        g.add((URIRef(DBR+str(row['id'])), DBO.edits, URIRef(DBR+str(consistent_hash(df1.loc[index-len(df2)]['journalID'])))))
        dict[str(consistent_hash(df1.loc[index-len(df2)]['journalID']))] = str(row['id'])


# Paper -published_in-> JournalVolume
path_pub='data/published_in_relationships.csv'
df_pub=pd.read_csv(path_pub,sep=",")
for index, row in df_pub.iterrows():
    g.add((URIRef(DBR+str(row['paperID'])), DBO.published_in, URIRef(DBR+str(row['volumeID']).replace(" ", ""))))


# JournalVolume -is_volume_of-> Journal
path_vol='data/is_volume_of_relationships.csv'
df_vol=pd.read_csv(path_vol,sep=",")
for index, row in df_vol.iterrows():
    g.add((URIRef(DBR+str(row['volumeID']).replace(" ", "")), DBO.is_volume_of, URIRef(DBR+str(consistent_hash(row['journalID'])))))


# Paper -presentedAt-> ConferenceEdition
path_pres='data/presented_in_relationships.csv'
df_pres=pd.read_csv(path_pres,sep=",")
for index, row in df_pres.iterrows():
    g.add((URIRef(DBR+str(row['paperID'])), DBO.presentedAt, URIRef(DBR+str(row['editionID']))))

# Review -review_written_by-> Reviewer
# Review -corresponds_to_paper-> Paper
#I WONDER IF THIS LINK COULD BE INFERRED BY REASONING
#Review -is_assigned_by-> Coordinator   
path="data/evolved_reviews_relationships.csv"
df=pd.read_csv(path,sep=",")
df_merged1 = pd.merge(df_pub, df_vol, on='volumeID')
df_merged2 = pd.merge(df_pres, df_ed, on='editionID')

union_df = pd.concat([df_merged1, df_merged2], ignore_index=True)
df1= pd.merge(df, union_df, on='paperID')
df1['hashed_journalID'] =df1['journalID'].apply(consistent_hash)
df1['combined'] = df1['confID'].combine_first(df1['hashed_journalID'])

for index, row in df.iterrows():
     coordinator=str(dict[str(df1[df1['content'] == row['content']]['combined'].values[0])])
     g.add((URIRef(DBR+str(consistent_hash(row['content']))), DBO.review_written_by, URIRef(DBR+str(row['authorID']))))
     g.add((URIRef(DBR+str(consistent_hash(row['content']))), DBO.corresponds_to_paper, URIRef(DBR+str(row['paperID']))))
     g.add((URIRef(DBR+str(consistent_hash(row['content']))), DBO.is_assigned_by, URIRef(DBR+coordinator)))


# Paper -is_written-> Author
# Paper -is_corresponding_author-> Author
path='data/wrote_relationships.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.head(5).iterrows():
    if row['corresponding'] == True:
        g.add((URIRef(DBR+str(row['paperID'])), DBO.is_corresponding_author, URIRef(DBR+str(row['authorID']))))
    elif row['corresponding'] == False:
        g.add((URIRef(DBR+str(row['paperID'])), DBO.is_written, URIRef(DBR+str(row['authorID']))))

# Paper -cites-> Paper
path='data/citation_relationships.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['citingPaperID'])), DBO.cites, URIRef(DBR+str(row['citedPaperID']))))

# Paper -is_about-> Keyword
path='data/keyword_relationships.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['paperID'])), DBO.is_about, URIRef(DBR+str(row['keywords']).replace(" ", ""))))

unique_values = df['keywors'].unique().tolist()
for kw in unique_values:
    g.add((URIRef(DBR+str(row['keywords']).replace(" ", "")), DBO.keyword_name, Literal(str(row['keywords']).replace(" ", ""),datatype=XSD.string )))

# EventEdition -held_in_city-> City
# EventEdition -held_in_year-> Year
path='data/editions.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.iterrows(): 
    g.add((URIRef(DBR+str(row['editionID'])), DBO.conference_city, Literal(str(row['city']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['editionID'])), DBO.conference_year, Literal(int(row['year']), datatype=XSD.integer)))

# JournalVolume -of_year-> Year
path='data/volumes.csv'
df=pd.read_csv(path,sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['volumeID']).replace(" ", "")), DBO.volume_year, Literal(int(row['year']),datatype=XSD.integer )))

# Every paper has a title
# Every paper has an abstract
# Every paper has a year that it was written
path= 'data/papers.csv'
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['paperID'])), DBO.title, Literal(str(row['title']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['paperID'])), DBO.abstract, Literal(str(row['abstract']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['paperID'])), DBO.year_written, Literal(int(row['year']),datatype=XSD.integer )))

# Every review has a description
# Every review has a boolean value for if the paper was accepted
path= 'data/evolved_reviews_relationships.csv'
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(consistent_hash(row['content']))),DBO.review_description, Literal(str(row['content']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(consistent_hash(row['content']))), DBO.is_accepted, Literal(bool(row['decision']=="positive"),datatype=XSD.boolean )))
   
# Every journal has a name (we assume it is unique)
path= 'data/journals.csv'
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(consistent_hash(row['journalID']))),DBO.journal_name, Literal(str(row['name']),datatype=XSD.string )))
 
# Every conference has a name (we assume it is unique)
# Every conference has a boolean value if it is a workshop or not
path= 'data/conferences.csv'
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['confID'])),DBO.conference_name, Literal(str(row['name']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['confID'])),DBO.is_workshop, Literal(bool(row['conference']=="workshop"), datatype=XSD.boolean )))
    
    
# Every person has a name
# Every person has a unique id
path= 'data/authors.csv'
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['authorID'])),DBO.name, Literal(str(row['name']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['authorID'])),DBO.person_id, Literal(str(row['authorID']), datatype=XSD.string )))
path= 'data/coordinators.csv'  
df=pd.read_csv(path, sep=",")
for index, row in df.iterrows():
    g.add((URIRef(DBR+str(row['id'])),DBO.name, Literal(str(row['first_name']),datatype=XSD.string )))
    g.add((URIRef(DBR+str(row['id'])),DBO.person_id, Literal(str(row['id']), datatype=XSD.string )))




In [77]:
print(f"Total triples before inference: {len(g)}")
# Apply RDFS reasoning
DeductiveClosure(RDFS_Semantics).expand(g)

print(f"Total triples after inference: {len(g)}")
nodes = set(g.subjects()) | set(g.predicates()) | set(g.objects())
print("Number of unique nodes (including predicates):", len(nodes))

Total triples before inference: 96036
Total triples after inference: 182503
Number of unique nodes (including predicates): 27919


In [78]:
g.serialize("output_files/abox.ttl", format="turtle")
print(g.serialize(format="turtle"))

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

dbo:Author a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:Author,
        dbo:Person,
        rdfs:Resource .

dbo:City a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:City,
        rdfs:Resource .

dbo:Conference a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:Conference,
        rdfs:Resource .

dbo:ConferenceChair a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:ConferenceChair,
        dbo:Coordinator,
        dbo:Person,
        rdfs:Resource .

dbo:ConferenceEdition a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:ConferenceEdition,
        dbo:EventEdition,
        rdfs:Resource .

dbo:Coordinator a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbo:Coordinator,
        dbo:Person,
        rdfs:R

In [79]:
import networkx as nx
import re

def extract_nodes(s):
     # keep the string after the last hashtag or frontslash
    match = re.search(r'[^/#]+$', s)
    return str(match.group(0)) if match else None

def extract_edge(s):
    # keep the string after the last hashtag or frontslash
    match = re.search(r'[^/#]+$', s)
    label = str(match.group(0)) 
    if label == 'type':
        label = 'rdf:type'
    elif label in ['range', 'domain', 'subClassOf', 'subPropertyOf']:
        label = 'rdfs:' + label
    elif label in ['of_year','held_in_year','held_in_city','cites', 'is_about', 'is_corresponding_author','is_written','is_edition_of_conf', 'is_edition_of_workshop', 'organizes_conference', 'organizes_workshop', 'edits', 'published_in', 'is_volume_of', 'presentedAt', 'review_written_by', 'corresponds_to_paper', 'is_assigned_by']:
        label = 'dbo:' + label
    else:
        print(f"Unknown edge label: {label}")
    return label if match else None
    

G = nx.MultiDiGraph()
edge_counter = 0

for s, p, o in g:
    source = extract_nodes(s)
    target = extract_nodes(o)
    edge_label = extract_edge(p)

    if source is None or target is None or edge_label is None:
        continue

    # Add edge with a unique key/ID
    G.add_edge(source, target, key=edge_counter, id=str(edge_counter), label=edge_label)
    edge_counter += 1
print(len(g))
print(len(G.edges))

nx.write_graphml(G, "output_files/inference.graphml")

182503
182503
