In [3]:

%pip install rdflib
%pip install --upgrade --force-reinstall owlrl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting owlrl
  Downloading owlrl-7.1.2-py3-none-any.whl (51 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.7/51.7 kB[0m [31m425.2 kB/s[0m eta [36m0:00:00[0m31m?[0m eta [36m-:--:--[0m
[?25hCollecting rdflib>=7.1.1
  Using cached rdflib-7.1.1-py3-none-any.whl (562 kB)
Collecting pyparsing<4,>=2.1.0
  Downloading pyparsing-3.2.0-py3-none-any.whl (106 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9/106.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m[31m4.3 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting isodate<1.0.0,>=0.7.2
  Using cached isodate-0.7.2-py3-none-any.whl (22 kB)
Install

In [1]:
import pandas as pd
import re
import rdflib
from rdflib import Graph, Literal, Namespace, RDF, RDFS, OWL, URIRef
from rdflib.namespace import XSD
import owlrl  # For reasoning

ModuleNotFoundError: No module named 'owlrl'

In [None]:
# Read the three raw CSV tables (paths are relative to the notebook's working directory).
books, ratings, users = (
    pd.read_csv(csv_name) for csv_name in ("Books.csv", "Ratings.csv", "Users.csv")
)

In [None]:
# Preview the first rows of the raw books table.
books.head()

In [None]:
# Column dtypes and non-null counts of the raw books table.
books.info()

In [None]:
# Preview the first rows of the raw ratings table.
ratings.head()

In [None]:
# Column dtypes and non-null counts of the raw ratings table.
ratings.info()

In [None]:
# Preview the first rows of the raw users table.
users.head()

In [None]:
# Column dtypes and non-null counts of the raw users table.
users.info()

In [None]:
# Report the (rows, columns) shape of each of the three tables.

print(f'''\t  Size of books data is {books.shape}
          Size of ratings data is {ratings.shape}
          Size of users data is {users.shape}''')

In [None]:
# Count fully duplicated rows (every column equal) in each table.
# DataFrame.duplicated() flags each repeat after the first occurrence, so the sum
# is the number of rows that drop_duplicates() would remove.

print(f'''\t  Duplicates in books data is {books.duplicated().sum()}
          Duplicates in ratings data is {ratings.duplicated().sum()}
          Duplicates in users data is {users.duplicated().sum()}''')

In [12]:
# Helper that reports the missing-value count and its percentage for each column
def missing_values(df):
    """Summarise missing data per column.

    Args:
        df: DataFrame to inspect. It is only read, never modified.

    Returns:
        A DataFrame indexed by the columns of ``df`` with two columns:
        ``Missing_Values`` (number of nulls in that column) and
        ``Percentage`` (share of nulls in that column, scaled to 0-100).
    """
    null_mask = df.isnull()

    null_counts = null_mask.sum()
    # The mean of a boolean mask is the fraction of True values, i.e. the null share.
    null_share = null_mask.mean() * 100

    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Missing_Values', 'Percentage'],
    )
    return summary

In [None]:
''' STEP 1: CLEANING OF BOOKS'''
# Summary statistics of the books table before any cleaning is applied.
books.describe()

In [None]:
# Null count and null percentage for every column of the books table.
missing_values(books)

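The number of missing values for Book_Author and Publisher is negligible, so those rows could simply be dropped; the cleaning cell below instead keeps them and fills the gaps with 'Unknown'.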

In [None]:
# Clean the books table: de-duplicate, drop placeholder rows, normalise ISBNs,
# sanitise the publication year, fill missing author/publisher and drop image URLs.
# Every step builds a new frame from the current one, so the cell can be re-run safely.
MIN_VALID_YEAR, MAX_VALID_YEAR = 1000, 2024  # years outside this range count as unknown
UNKNOWN_YEAR = 0  # sentinel stored for missing or implausible publication years

# Remove exact duplicate rows
books = books.drop_duplicates()

# Drop rows whose ISBN or title is the literal placeholder 'unknown' (case-insensitive).
# Missing values never compare equal to 'unknown', so rows with NaN are kept here.
is_placeholder = (
    books['ISBN'].str.lower().eq('unknown')
    | books['Book_Title'].str.lower().eq('unknown')
)
books = books.loc[~is_placeholder].copy()

# Keep only ASCII letters and digits in the ISBN (removes spaces, dashes, quotes, ...).
# Nothing is left to strip after this, so no extra whitespace handling is needed.
books['ISBN'] = books['ISBN'].astype(str).str.replace(r'[^a-zA-Z0-9]', '', regex=True)

# Coerce the year to int once, then replace out-of-range values with the sentinel.
# Series.where keeps the integer dtype; assigning None into an int column would force
# an upcast to float, which pandas deprecates.
year_numeric = (
    pd.to_numeric(books['Year_Of_Publication'], errors='coerce')
    .fillna(UNKNOWN_YEAR)
    .astype(int)
)
books['Year_Of_Publication'] = year_numeric.where(
    year_numeric.between(MIN_VALID_YEAR, MAX_VALID_YEAR), UNKNOWN_YEAR
)

# Fill null values for critical fields
books = books.fillna({'Publisher': 'Unknown', 'Book_Author': 'Unknown'})

# Drop the image URL columns: they are not relevant for the recommendation system.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
books = books.drop(columns=['Image_URL_S', 'Image_URL_M', 'Image_URL_L'], errors='ignore')

books.info()


In [None]:
# Distinct publication years present in the books table, as a sanity check on the year cleaning.
books['Year_Of_Publication'].unique()

In [None]:
''' STEP 2: CLEANING OF RATINGS'''
# Summary statistics of the ratings table before any cleaning is applied.
ratings.describe()

In [None]:
# Look at the first ratings rows again before cleaning them.
ratings.head()

In [None]:
# Column dtypes and non-null counts of the ratings table before cleaning.
ratings.info()

In [None]:
# Summary restricted to object-dtype columns (include='O').
ratings.describe(include = 'O')

In [21]:
# Clean the ratings table the same way as books: remove exact duplicate rows, drop rows
# whose ISBN is the placeholder 'unknown' (case-insensitive; missing ISBNs are kept),
# then keep only ASCII letters and digits in the ISBN.
# Written as one chain that returns a new frame, so re-running the cell is safe.
ratings = (
    ratings
    .drop_duplicates()
    .loc[lambda frame: frame['ISBN'].str.lower() != 'unknown']
    .assign(
        ISBN=lambda frame: frame['ISBN']
        .astype(str)
        .str.replace(r'[^a-zA-Z0-9]', '', regex=True)
    )
)

In [None]:
''' STEP 3: CLEANING OF USERS'''
# Summary statistics of the users table before any cleaning is applied.
users.describe()

In [None]:
# Column dtypes and non-null counts of the users table before cleaning.
users.info()

In [24]:
# !pip install pycountry
# import pycountry
# countries = [country.name.strip().lower() for country in pycountry.countries]

In [25]:
# import re

# # Example: Your Country Column
# users['Country'] = users['Location'].str.split(',').str[-1].str.strip().str.lower()

# # Define a mapping for misspelled countries and replacements
# country_mapping = {
#     # Common corrections
#     'u.s.a.': 'usa', 'u.s.a': 'usa', 'good old usa !': 'usa', 'usa"': 'usa',
#     'u.s. of a.': 'usa', 'america': 'usa', 'united states': 'usa',
#     'united stated': 'usa', 'united staes': 'usa', 'unite states': 'usa',
#     'england': 'united kingdom', 'u.k.': 'united kingdom', 'uk': 'united kingdom',
#     'united kindgdom': 'united kingdom', 'united kindgonm': 'united kingdom',
#     'russia': 'russian federation', 'russian federation': 'russian federation',
#     'deutschland': 'germany', 'germay': 'germany', 'geermany': 'germany',
#     'españa': 'spain', 'espaã±a': 'spain', 'la france': 'france',
#     'méxico': 'mexico', 'mã?â©xico': 'mexico',
#     'italia': 'italy', 'itlay': 'italy', 'italy"': 'italy',
#     'catalunya spain': 'spain', 'brasil': 'brazil', 'brazil"': 'brazil',
#     'suisse': 'switzerland', 'la suisse': 'switzerland', 'switzerland"': 'switzerland',
#     'netherlands"': 'netherlands', 'holland': 'netherlands',
#     'u.a.e': 'united arab emirates', 'u.a.e"': 'united arab emirates',
#     'uae': 'united arab emirates',
#     'phillipines': 'philippines', 'phippines': 'philippines',
#     'india"': 'india',
#     # Handle special regions and nonsense
#     'somewherein space': 'unknown', 'space': 'unknown', 'n/a': 'unknown',
#     'n/a - on the road': 'unknown', 'nowhere': 'unknown',
#     'in your heart': 'unknown', 'home of the van!!': 'unknown',
#     'everywhere and anywhere': 'unknown', 'strongbadia': 'unknown',
#     'mordor': 'unknown', 'evil empire': 'unknown', 'fairyland': 'unknown',
#     'unknown': 'unknown', 'aaa': 'unknown', '-': 'unknown', '.': 'unknown', 
#     '...': 'unknown', '????': 'unknown', '*': 'unknown'
# }

# # Function to clean country names
# def clean_country(country):
#     # Check if the country is in the mapping dictionary
#     if country in country_mapping:
#         return country_mapping[country]
#     # Remove numeric and special characters except letters, spaces, and hyphens
#     cleaned = re.sub(r'[^a-zA-Z\s\-]', '', country).strip()
#     # If still invalid, return 'unknown'
#     return cleaned if cleaned else 'unknown'

# # Apply the cleaning function
# users['Country'] = users['Country'].apply(clean_country)

# # Check unique cleaned countries
# print(users['Country'].unique())

In [34]:
# Coerce Age to integers; values that are missing or not numeric become the sentinel -1.
age_numeric = pd.to_numeric(users['Age'], errors='coerce')
users['Age'] = age_numeric.fillna(-1).astype(int)

In [27]:
# Vocabulary namespaces: a project-local one plus schema.org and FOAF.
EX = Namespace("http://example.org/bookRec#")
SCHEMA = Namespace("http://schema.org/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

# Start from an empty graph and register the prefixes used when serializing.
g = Graph()
for prefix, namespace in (("ex", EX), ("schema", SCHEMA), ("foaf", FOAF)):
    g.bind(prefix, namespace)

In [None]:
# Add one ex:Book resource per row of the books table.
# Iterating over the columns directly avoids building a Series per row as iterrows() does.
for isbn, title, author, publisher, year in zip(
    books['ISBN'],
    books['Book_Title'],
    books['Book_Author'],
    books['Publisher'],
    books['Year_Of_Publication'],
):
    book_uri = EX[f"Book_{isbn}"]  # unique identifier for each book; already a URIRef
    g.add((book_uri, RDF.type, EX.Book))
    g.add((book_uri, RDFS.label, Literal(title, lang="en")))
    g.add((book_uri, EX.author, Literal(author)))
    g.add((book_uri, EX.publisher, Literal(publisher)))
    # The books cleaning step stores 0 for missing or implausible years. That sentinel
    # is not a real publication year (and "0" is not a valid xsd:gYear), so the triple
    # is omitted for those books, mirroring how an unknown Age is skipped for users.
    if year != 0:
        g.add((book_uri, EX.year, Literal(year, datatype=XSD.gYear)))

# Report the graph size rather than printing the full Turtle serialization, which for
# a table of this kind would flood the notebook output.
print(f"Books added; the graph now holds {len(g)} triples.")

In [None]:
# Add one foaf:Person resource per row of the users table.
# Iterating over the columns directly avoids building a Series per row as iterrows() does.
for user_id, location, age in zip(users['User-ID'], users['Location'], users['Age']):
    user_uri = EX[f"User_{user_id}"]  # Namespace indexing already returns a URIRef
    g.add((user_uri, RDF.type, FOAF.Person))
    # NOTE(review): 'id' does not appear to be a term of the FOAF vocabulary; the
    # predicate is kept unchanged so existing queries keep working. Confirm, and
    # consider a standard identifier property instead.
    g.add((user_uri, FOAF.id, Literal(f"User_{user_id}")))
    g.add((user_uri, EX.location, Literal(location)))
    # -1 is the sentinel the Age clean-up stores for unknown ages: emit no triple for it.
    if age != -1:
        g.add((user_uri, FOAF.age, Literal(age, datatype=XSD.integer)))

# Report the graph size rather than printing the full Turtle serialization, which at
# this point would contain every book and every user.
print(f"Users added; the graph now holds {len(g)} triples.")

In [36]:
# Add the ratings to the graph.
#
# A score belongs to a (user, book) pair, so each rating gets its own ex:Rating node
# that links the user, the book and the value. Attaching the value to the user alone
# cannot say which book a score refers to, and equal scores from the same user collapse
# into a single triple because an RDF graph is a set.
for user_id, isbn, score in zip(ratings['User-ID'], ratings['ISBN'], ratings['Book-Rating']):
    user_uri = EX[f"User_{user_id}"]
    book_uri = EX[f"Book_{isbn}"]
    rating_uri = EX[f"Rating_{user_id}_{isbn}"]  # one node per (user, book) pair
    score_literal = Literal(score, datatype=XSD.integer)

    # Direct shortcut "user rated book", unchanged.
    g.add((user_uri, EX.rated, book_uri))

    # Rating node carrying the score for this specific user and book.
    g.add((rating_uri, RDF.type, EX.Rating))
    g.add((rating_uri, EX.ratedBy, user_uri))
    g.add((rating_uri, EX.ratedBook, book_uri))
    g.add((rating_uri, EX.ratingValue, score_literal))

    # Legacy triple, kept only so existing queries on ex:rating keep working. It does
    # not identify the rated book; prefer the ex:Rating nodes above.
    g.add((user_uri, EX.rating, score_literal))

In [None]:
# Serialize the RDF graph to Turtle format.
# The file name is relative, so the file is written to the current working directory.
ttl_file = 'bookData.ttl'
g.serialize(destination=ttl_file, format='turtle')

# Only reached if serialize() did not raise.
print(f"RDF data has been successfully converted and stored in {ttl_file}")