In [1]:
# Imports
import numpy as np
import pandas as pd
import os

In [2]:
# Read the books CSV (rows pandas cannot parse are skipped) and trim it
# down to the columns that end up in the Prolog facts.
path = "./books.csv"
unused_columns = ["ratings_count", "text_reviews_count", "publication_date", "average_rating", "num_pages"]

df = (
    pd.read_csv(path, delimiter=',', on_bad_lines="skip")
    # Normalise the space-prefixed header so the column can be dropped by its clean name
    .rename(columns={"  num_pages": "num_pages"})
    .drop(columns=unused_columns)
)

In [3]:
import chardet

# Detect the character encoding of books.csv from its raw bytes and print it.
with open("books.csv", "rb") as file:
    raw_bytes = file.read()

result = chardet.detect(raw_bytes)
print(result['encoding'])

utf-8


In [4]:
# Preview the first five rows of the frame after the column drop
df.head()

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,0439785960,9780439785969,eng,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,0439358078,9780439358071,eng,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,0439554896,9780439554893,eng,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,043965548X,9780439655484,eng,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,0439682584,9780439682589,eng,Scholastic


In [5]:
# The titles and authors' names contain special characters
# We need to escape them to properly convert to facts in Prolog
def filter_token(token):
    """Escape a title/publisher string for embedding in a single-quoted Prolog atom.

    Transformations, in order:
      * backslashes are doubled (a backslash starts an escape sequence
        inside a quoted atom),
      * single quotes are doubled (Prolog's way to write a literal quote),
      * double quotes and square brackets are removed,
      * every '/' is replaced by "'/'".

    Args:
        token: The raw string to escape.

    Returns:
        The escaped string, WITHOUT the surrounding single quotes; the
        caller is expected to wrap it in '...'.
    """
    # Escape backslashes before anything else so "\'" or "\n" in the data
    # cannot be read by Prolog as an escape sequence.
    token = token.replace("\\", "\\\\")
    token = token.replace("'", "''")
    token = token.replace('"', "")
    # NOTE(review): once the caller adds the outer quotes this closes and
    # reopens the atom around each slash, so 'a/b' is written as the term
    # 'a'/'b' rather than a single atom. A slash needs no escaping inside
    # a quoted atom -- confirm this shape is what the Prolog side expects.
    token = token.replace('/', "'/'")
    token = token.replace('[', '').replace(']', '')

    return token

def filter_authors(authors):
    """Convert a '/'-separated author string into Prolog list syntax.

    Each name is stripped of surrounding whitespace, escaped for use in a
    single-quoted atom (backslashes and single quotes are doubled) and
    wrapped in single quotes.

    Args:
        authors: Author names joined by '/', e.g. "J.K. Rowling/Mary GrandPré".

    Returns:
        A string such as "['J.K. Rowling', 'Mary GrandPré']".
    """
    names = [name.strip() for name in authors.split('/')]
    # Backslash starts an escape sequence inside a quoted atom and a single
    # quote would end the atom, so both are doubled.
    escaped = [name.replace("\\", "\\\\").replace("'", "''") for name in names]
    # Plain concatenation instead of a nested f-string: a backslash or a
    # reused quote inside an f-string expression only parses on Python 3.12+.
    quoted = ["'" + name + "'" for name in escaped]
    return "[" + ", ".join(quoted) + "]"


# Title and publisher become escaped atom text; authors become a Prolog list,
# e.g. "J.K. Rowling/Mary GrandPré" --> ['J.K. Rowling', 'Mary GrandPré']
for column in ("title", "publisher"):
    df[column] = df[column].apply(filter_token)
df["authors"] = df["authors"].apply(filter_authors)

In [6]:
# Facts are of the form: book(BookID, 'Title', ['Author', ...], 'Publisher').
# Title, authors and publisher are written as-is, so they must already be
# escaped for Prolog (see filter_token / filter_authors).
with open("books.pl", "w", encoding="utf-8") as facts:
    # Iterate the needed columns in lockstep instead of iterrows(), which
    # builds a full Series object for every row.
    fact_columns = (df["bookID"], df["title"], df["authors"], df["publisher"])
    for index, book_id, title, authors, publisher in zip(df.index, *fact_columns):
        try:
            # No quote reuse inside the f-string, so this also parses on
            # Python versions older than 3.12.
            facts.write(f"book({book_id},'{title}',{authors},'{publisher}').\n")
        except Exception as err:
            # Stay best-effort (skip the row and continue) but report the
            # cause, and let KeyboardInterrupt/SystemExit propagate.
            print(f"Error at index {index}: {err!r}")