In [None]:
!pip install rdflib
!pip install owlrl

In [2]:
import re

import owlrl
import pandas as pd
import rdflib
from rdflib import Graph, Literal, Namespace, RDF, RDFS, OWL, URIRef
from rdflib.namespace import XSD, FOAF

In [None]:
# Load Data
# Read the three source tables from CSV files located in the current working directory.
books = pd.read_csv("Books.csv")
ratings = pd.read_csv("Ratings.csv")
users = pd.read_csv("Users.csv")

In [None]:
# Preview the first rows of the raw books table.
books.head()

In [None]:
# Column dtypes and non-null counts of the raw books table.
books.info()

In [None]:
# Preview the first rows of the raw ratings table.
ratings.head()

In [None]:
# Column dtypes and non-null counts of the raw ratings table.
ratings.info()

In [None]:
# Preview the first rows of the raw users table.
users.head()

In [None]:
# Column dtypes and non-null counts of the raw users table.
users.info()

In [None]:
# Size of datasets
# Prints the (rows, columns) shape of each table in one multi-line message.

print(f'''\t  Size of books data is {books.shape}
          Size of ratings data is {ratings.shape}
          Size of users data is {users.shape}''')

In [None]:
# Count fully duplicated rows in each table with DataFrame.duplicated().
# duplicated() flags every repeat of an earlier row, so the sum is the number of rows
# that drop_duplicates() would remove.

print(f'''\t  Duplicates in books data is {books.duplicated().sum()}
          Duplicates in ratings data is {ratings.duplicated().sum()}
          Duplicates in users data is {users.duplicated().sum()}''')

In [12]:
# Helper that reports, per column, the number of missing values and their percentage.
def missing_values(df):
    """Summarise missing data for every column of a DataFrame.

    Argument:
        df: the DataFrame to inspect.

    Return:
        A DataFrame indexed by the column names of ``df`` with two columns:
        ``Missing_Values`` (count of nulls) and ``Percentage`` (share of nulls
        in that column, on a 0-100 scale).
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # The mean of a boolean mask is the fraction of True entries; scale it to a percentage.
    null_share = null_mask.mean() * 100
    return pd.concat(
        [null_counts, null_share],
        keys=['Missing_Values', 'Percentage'],
        axis=1,
    )

In [None]:
''' STEP 1: CLEANING OF BOOKS'''
# Summary statistics of the books table before any cleaning is applied.
books.describe()

In [None]:
# Missing-value count and percentage for each books column.
missing_values(books)

In [None]:
# Clean the books table.
# The cell rebinds `books` instead of mutating it in place and tolerates already-removed
# columns, so it can be re-run on an already-cleaned frame without raising.

# Remove exact duplicate rows.
books = books.drop_duplicates()

# Drop rows whose ISBN or title is the placeholder 'unknown' (case-insensitive).
# Missing values compare as False here, so rows with a null ISBN/title are kept.
placeholder_rows = (
    books['ISBN'].str.lower().eq('unknown')
    | books['Book_Title'].str.lower().eq('unknown')
)
books = books.loc[~placeholder_rows].copy()

# Clean ISBN column: keep only ASCII letters and digits (this also removes all whitespace).
books['ISBN'] = books['ISBN'].astype(str).str.replace(r'[^a-zA-Z0-9]', '', regex=True)

# Ensure Year_Of_Publication is an integer column; anything unparseable or outside the
# accepted range is stored as the sentinel 0.
# The range check is done with `where` on the integer series: assigning None into an
# int64 column through .loc forces a silent upcast, which pandas has deprecated.
MIN_VALID_YEAR, MAX_VALID_YEAR = 1000, 2024
publication_year = (
    pd.to_numeric(books['Year_Of_Publication'], errors='coerce').fillna(0).astype(int)
)
books['Year_Of_Publication'] = publication_year.where(
    publication_year.between(MIN_VALID_YEAR, MAX_VALID_YEAR), 0
)

# Fill null values for critical fields
books['Publisher'] = books['Publisher'].fillna('Unknown')
books['Book_Author'] = books['Book_Author'].fillna('Unknown')

# Drop the image URL columns, which are not relevant for the recommendation system.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
books = books.drop(columns=['Image_URL_S', 'Image_URL_M', 'Image_URL_L'], errors='ignore')

books.info()

In [None]:
# Distinct publication years left after cleaning; 0 is the sentinel for an invalid or missing year.
books['Year_Of_Publication'].unique()

In [None]:
''' STEP 2: CLEANING OF RATINGS'''
# Summary statistics of the ratings table before cleaning.
ratings.describe()

In [None]:
# Preview the first rows of the ratings table before cleaning.
ratings.head()

In [None]:
# Column dtypes and non-null counts of the ratings table before cleaning.
ratings.info()

In [None]:
# Summary restricted to object-dtype columns (include='O').
ratings.describe(include = 'O')

In [21]:
# Same clean-up as for books: drop duplicate rows, drop placeholder ISBNs,
# then reduce every ISBN to ASCII letters and digits.
ratings.drop_duplicates(inplace=True)
ratings.drop(
    index=ratings.loc[ratings['ISBN'].str.lower().eq('unknown')].index,
    inplace=True,
)
# Removing every non-alphanumeric character also removes surrounding whitespace.
ratings['ISBN'] = (
    ratings['ISBN']
    .astype(str)
    .str.replace(r'[^a-zA-Z0-9]', '', regex=True)
)

In [None]:
''' STEP 3: CLEANING OF USERS'''
# Summary statistics of the users table before cleaning.
users.describe()

In [None]:
# Column dtypes and non-null counts of the users table before cleaning.
users.info()

In [24]:
# Make Age an integer column; values that are missing or cannot be parsed become the sentinel -1.
users['Age'] = (
    pd.to_numeric(users['Age'], errors='coerce')
    .fillna(-1)
    .astype(int)
)

In [None]:
# Ontology IRI: base under which every class, property and individual is minted.
ONTOLOGY_IRI = "http://www.semanticweb.org/alexanderanderson/ontologies/bookOntology/"

# Define Namespaces
EX = Namespace(ONTOLOGY_IRI)
OWL_NS = OWL
RDFS_NS = RDFS
FOAF_NS = FOAF

# Initialize RDF Graph and register the prefixes used when serializing.
g = Graph()
g.bind("ex", EX)
g.bind("owl", OWL_NS)
g.bind("rdfs", RDFS_NS)
g.bind("foaf", FOAF_NS)

# Define Classes (Namespace attribute access already yields a URIRef).
BOOK_CLASS = EX.Novel
USER_CLASS = EX.User
RATING_CLASS = EX.Rating

# Add Classes to the Graph
for owl_class in (BOOK_CLASS, USER_CLASS, RATING_CLASS):
    g.add((owl_class, RDF.type, OWL.Class))

# Subclass Example
g.add((RATING_CLASS, RDFS.subClassOf, OWL_NS.Thing))  # Rating is a subclass of Thing

# Define Object and Datatype Properties with Axioms
RATED_PROPERTY = EX.rated
RATING_PROPERTY = EX.rating
AUTHOR_PROPERTY = EX.author
PUBLISHER_PROPERTY = EX.publisher
YEAR_PROPERTY = EX.publicationYear
LOCATION_PROPERTY = EX.location
SELF_RATED_PROPERTY = EX.selfRated

# Object Properties
# `rated` links a User to a Novel. It is deliberately NOT declared symmetric or transitive:
# with domain User and range Novel, a symmetric axiom makes a reasoner infer
# "book rated user" (typing every rated book as a User and every rater as a Novel),
# and a transitive axiom then chains those statements into user-to-user links.
g.add((RATED_PROPERTY, RDF.type, OWL.ObjectProperty))
g.add((RATED_PROPERTY, RDFS.domain, USER_CLASS))
g.add((RATED_PROPERTY, RDFS.range, BOOK_CLASS))

# Reflexive Property
# NOTE(review): in OWL 2 a reflexive property holds for every individual, which together
# with the User domain is a very strong statement; confirm this is the intended modelling.
g.add((SELF_RATED_PROPERTY, RDF.type, OWL.ReflexiveProperty))
g.add((SELF_RATED_PROPERTY, RDFS.domain, USER_CLASS))
g.add((SELF_RATED_PROPERTY, RDFS.range, USER_CLASS))

# Datatype Properties: (property, domain class, literal datatype)
for datatype_property, domain_class, literal_type in (
    (RATING_PROPERTY, USER_CLASS, XSD.integer),
    (AUTHOR_PROPERTY, BOOK_CLASS, XSD.string),
    (PUBLISHER_PROPERTY, BOOK_CLASS, XSD.string),
    (YEAR_PROPERTY, BOOK_CLASS, XSD.gYear),
    (LOCATION_PROPERTY, USER_CLASS, XSD.string),
):
    g.add((datatype_property, RDF.type, OWL.DatatypeProperty))
    g.add((datatype_property, RDFS.domain, domain_class))
    g.add((datatype_property, RDFS.range, literal_type))




In [26]:
# Add Books to the Graph
# Every row becomes an instance of BOOK_CLASS whose IRI is derived from the cleaned ISBN.
for _, row in books.iterrows():
    book_uri = EX[f"Book_{row['ISBN']}"]
    g.add((book_uri, RDF.type, BOOK_CLASS))

    # A missing title is a float NaN, which must not be turned into a language-tagged literal.
    title = row['Book_Title']
    if pd.notna(title):
        g.add((book_uri, RDFS.label, Literal(title, lang="en")))

    g.add((book_uri, AUTHOR_PROPERTY, Literal(row['Book_Author'])))
    g.add((book_uri, PUBLISHER_PROPERTY, Literal(row['Publisher'])))

    # The cleaning step stores 0 for an invalid/missing year. "0" is not a valid xsd:gYear
    # lexical form, so the sentinel is skipped (same approach as the Age sentinel for users)
    # and real years are written as four digits.
    year = row['Year_Of_Publication']
    if year > 0:
        g.add((book_uri, YEAR_PROPERTY, Literal(f"{int(year):04d}", datatype=XSD.gYear)))



In [27]:
# Add Users to the Graph
# Each row becomes an instance of USER_CLASS named after its User-ID.
for _, user_row in users.iterrows():
    user_id = user_row['User-ID']
    age = user_row['Age']
    user_uri = URIRef(EX[f"User_{user_id}"])

    user_triples = [
        (user_uri, RDF.type, USER_CLASS),
        (user_uri, FOAF.name, Literal(f"User_{user_id}")),
        (user_uri, LOCATION_PROPERTY, Literal(user_row['Location'])),
    ]
    # Age uses -1 as the "unknown" sentinel, so only positive ages are recorded.
    if age > 0:
        user_triples.append((user_uri, FOAF.age, Literal(age, datatype=XSD.integer)))
    # Explicit self-link for the reflexive property.
    user_triples.append((user_uri, SELF_RATED_PROPERTY, user_uri))

    for triple in user_triples:
        g.add(triple)


In [None]:
# Add Ratings to the Graph
# A score attached directly to the user (user --ex:rating--> value) does not say which
# book it belongs to, and equal scores from one user collapse into a single triple.
# Each rating is therefore also stored as its own RATING_CLASS individual that links
# the user, the book and the score. The two original triples are still written so
# existing queries keep working.
RATED_BY_PROPERTY = EX.ratedBy
RATED_BOOK_PROPERTY = EX.ratedBook
RATING_VALUE_PROPERTY = EX.ratingValue

g.add((RATED_BY_PROPERTY, RDF.type, OWL.ObjectProperty))
g.add((RATED_BY_PROPERTY, RDFS.domain, RATING_CLASS))
g.add((RATED_BY_PROPERTY, RDFS.range, USER_CLASS))

g.add((RATED_BOOK_PROPERTY, RDF.type, OWL.ObjectProperty))
g.add((RATED_BOOK_PROPERTY, RDFS.domain, RATING_CLASS))
g.add((RATED_BOOK_PROPERTY, RDFS.range, BOOK_CLASS))

g.add((RATING_VALUE_PROPERTY, RDF.type, OWL.DatatypeProperty))
g.add((RATING_VALUE_PROPERTY, RDFS.domain, RATING_CLASS))
g.add((RATING_VALUE_PROPERTY, RDFS.range, XSD.integer))

# Iterate over plain Python values column-wise; this avoids building one Series per row.
rating_rows = zip(
    ratings['User-ID'].tolist(),
    ratings['ISBN'].tolist(),
    ratings['Book-Rating'].tolist(),
)
for user_id, isbn, score in rating_rows:
    user_uri = EX[f"User_{user_id}"]
    book_uri = EX[f"Book_{isbn}"]
    rating_uri = EX[f"Rating_{user_id}_{isbn}"]
    score_literal = Literal(score, datatype=XSD.integer)

    # Original triples (kept for backward compatibility).
    g.add((user_uri, RATED_PROPERTY, book_uri))
    g.add((user_uri, RATING_PROPERTY, score_literal))

    # Book-specific rating individual.
    g.add((rating_uri, RDF.type, RATING_CLASS))
    g.add((rating_uri, RATED_BY_PROPERTY, user_uri))
    g.add((rating_uri, RATED_BOOK_PROPERTY, book_uri))
    g.add((rating_uri, RATING_VALUE_PROPERTY, score_literal))

# Apply OWL Reasoning: materialise the OWL 2 RL closure into `g` in place.
# NOTE(review): this can be very slow and memory-hungry on a large graph — consider
# running it on a sample first.
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics).expand(g)

In [None]:
# Serialize the RDF graph to Turtle format
# The destination is a relative path, so the file is written to the current working directory.
ttl_file = 'bookData2.ttl'
g.serialize(destination=ttl_file, format='turtle')

# Confirmation message; only reached if serialize() returned without raising.
print(f"RDF data has been successfully converted and stored in {ttl_file}")