In [1]:
import numpy as np
import pandas as pd
import nltk
import os
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

for dirname, _, filenames in os.walk('../Data-Files/Raw-Files'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/content/EODP2/BX-NewBooksRatings.csv
/content/EODP2/BX-Books.csv
/content/EODP2/BX-NewBooks.csv
/content/EODP2/BX-Users.csv
/content/EODP2/BX-NewBooksUsers.csv
/content/EODP2/BX-Ratings.csv


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def load_data(input_path):
    # Load and return all datasets
    ratings = pd.read_csv(os.path.join(input_path, 'BX-Ratings.csv'))
    new_ratings = pd.read_csv(os.path.join(input_path, 'BX-NewBooksRatings.csv'))
    books_old = pd.read_csv(os.path.join(input_path, 'BX-Books.csv'))
    new_books = pd.read_csv(os.path.join(input_path, 'BX-NewBooks.csv'))
    books = pd.concat([books_old, new_books]).drop_duplicates(subset=["Book-Title"], keep='first')
    ratings = pd.concat([ratings, new_ratings])
    return ratings, books

"""Function for age string preprocessing"""
def onlyNumber(age):
  try:
      string_age = str(age)
      cleaned_age = int(re.sub(r'[^0-9]', '', string_age))
      if cleaned_age < 2 or cleaned_age> 123:
        #oldest person in the world is 122 years old and the youngest is 2 years old
          return np.nan
      return cleaned_age
  except:
      return np.nan

def has_special_characters(s):
  s = str(s)
  return any(ord(char) > 127 for char in s)

def abbreviate(s):
  text = s.split()
  new_string = []
  for i in range(len(text)):
      if i < (len(text) - 1):
          new_string.append(text[i][0])
      else:
          new_string.append(text[i])
  return ' '.join(new_string)

# Joins together consecutive single characters in a string
def process_strings(s):
  text = s.split()
  curr = ''
  new_words = []
  # joining together single letters
  for word in text:
      if len(word) == 1:
          curr += word
      else:
          if curr:
              new_words.append(curr)
              curr = ''
          new_words.append(word)

  if curr:
      new_words.append(curr)

  return ' '.join(new_words)

# Preprocesses titles
def title_preprocess(doc, stop_words, lemmatizer):
  doc = str(doc)
  doc = doc.lower()
  # Fixing special cases (periods with no spaces)
  doc_special = re.sub(r'\.(?=\w)', '. ', doc)

  # Removing all punctuation
  doc_punct = re.sub(r'[^A-Za-z0-9\s]', '', doc_special)

  # Remove all instances of 'paperback' in titles
  processed_doc = re.sub(r'paperback\s*', '', doc_punct)

  # Tokenising, removing stop words, then lemmatizing the tokens
  tokens = word_tokenize(processed_doc)
  tokens = [w for w in tokens if not w in stop_words]
  lemmatized = [lemmatizer.lemmatize(w) for w in tokens]

  return ' '.join(lemmatized)

# Preprocesses publishers
def publisher_preprocess(doc, stop_words, lemmatizer):
  doc = doc.lower()
  # Fixing special cases (periods with no spaces)
  doc_special = re.sub(r'\.(?=\w)', '. ', doc)

  # Removing all punctuation
  doc_punct = re.sub(r'[^A-Za-z0-9\s]', '', doc_special)

  # Join single characters (initials)
  doc_joined = process_strings(doc_punct)

  # Removing irrelevant words in publishers
  processed_doc = re.sub(r'\b(?:paperback|books|press|publishing|paperbacks)\b', '', doc_joined)

  # Tokenising, removing stop words, then lemmatizing the tokens
  tokens = word_tokenize(processed_doc)
  tokens = [w for w in tokens if not w in stop_words]
  lemmatized = [lemmatizer.lemmatize(w) for w in tokens]
  return ' '.join(lemmatized)

def prepare_data(books):

    # Filter out all non ASCII characters
    books = books[~books['Book-Title'].apply(has_special_characters)]
    books = books[~books['Book-Author'].apply(has_special_characters)]
    books = books[~books['Book-Publisher'].apply(has_special_characters)]

    # Book Title preprocessing steps
    # Initialising lemmatizer, stopwords, and tfidf
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    processed_titles = []
    for i, title in enumerate(books['Book-Title']):
        processed_titles.append(title_preprocess(title, stop_words,lemmatizer))
    books['Title-Tokens'] = [word_tokenize(t) for t in processed_titles]

    # Author preprocessing steps
    # Setting all author names to lowercase
    books['Book-Author'] = books['Book-Author'].astype(str)
    books['Book-Author'] = books['Book-Author'].apply(lambda x: x.lower())

    # Fixing special cases (periods with no spaces)
    author_special = books['Book-Author'].apply(lambda x: re.sub(r'\.(?=\w)', '. ', x))

    # Removing All punctuations
    author_punct = author_special.apply(lambda x: re.sub(r'[^A-Za-z\s]', '', x))

    # Abbreviating then joining together single letters into one word (for name initials)
    books['Book-Author'] = author_punct.apply(abbreviate)
    books['Book-Author'] = books['Book-Author'].apply(process_strings)
    books['Author-Tokens'] = books['Book-Author'].apply(word_tokenize)

    # Publishing Year Preprocessing Steps
    # Convert all years outside plausible range to 0
    books.loc[~books['Year-Of-Publication'].between(1920, 2005), 'Year-Of-Publication'] = 0
    filtered_books = books[books['Year-Of-Publication'] != 0]

    # Get average year of publication for each author
    publish_years = filtered_books.groupby('Book-Author')['Year-Of-Publication'].mean().reset_index()

    # Replace 0 values with corresponding author mean
    books = books.merge(publish_years, on='Book-Author', suffixes=('', '_mean'))
    books['Year-Of-Publication'] = books.apply(lambda row: row['Year-Of-Publication_mean']
        if row['Year-Of-Publication'] == 0 else row['Year-Of-Publication'], axis=1)

    books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)
    books.drop(columns='Year-Of-Publication_mean', inplace=True)

    # Publisher Preprocessing Steps
    processed_publishers = []
    for i, title in enumerate(books['Book-Publisher']):
        processed_publishers.append(publisher_preprocess(title, stop_words,lemmatizer))

    books['Publisher-Tokens'] = books['Book-Publisher'].apply(word_tokenize)
    return books.drop_duplicates(subset=["Book-Title"], keep='first').dropna()

def combine_tokens(books):

    tokens = ["Year-Of-Publication-Tokens","Title-Tokens", "Author-Tokens", "Publisher-Tokens"]
    books["Year-Of-Publication-Tokens"] = books["Year-Of-Publication"].astype(str).apply(lambda x: [x])

    # Combine the token arrays from different columns into a single array
    books['Combined-Tokens'] = books[tokens].apply(lambda x: np.concatenate(x), axis=1)

    # Create a new DataFrame with 'ISBN' and 'Combined-Tokens' columns
    df_tokens = books[['ISBN', 'Combined-Tokens']]
    return df_tokens

def prepare_tfidf(books):

    df_tokens = combine_tokens(books)

    # Apply TF-IDF vectorization to the combined token column
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_tokens['Combined-Tokens'].apply(lambda x: ' '.join(x)))

    # Create a DataFrame with TF-IDF values and 'ISBN' as the index
    df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df_tokens['ISBN'])
    return df_tfidf

def prepare_vector(books):

    df_tokens = combine_tokens(books)
    cv = CountVectorizer(max_features=500, stop_words="english")
    vectors_matrix = cv.fit_transform(df_tokens["Combined-Tokens"].apply(lambda x: ' '.join(x)))
    df_vector = pd.DataFrame(vectors_matrix.toarray(), columns=cv.get_feature_names_out(), index=df_tokens['ISBN'])

    return df_vector

def find_similar_books(book_id, books, vector, recommend_number = 5):
    # Calculate similarity and find similar books

    similarity = cosine_similarity(vector)
    try:
        book_idx = books.index[books['ISBN'] == book_id].tolist()[0]
        #default is set at recommending 10 books this value will also affect predicted rating
        similar_books = sorted(list(enumerate(similarity[book_idx])), key=lambda x: x[1], reverse=True)[1:recommend_number]

        return similar_books

    except IndexError:
        print("Book ID not found in dataset.")

        return []

def recommend_books_vector(ratings, books, similar_books):
    # predicting books rating based on the cosine similarity scores of the vector based word processing

    rating_sims = []
    for idx, sim_score in similar_books:
        similar_isbn = books.iloc[idx]['ISBN']
        if similar_isbn in books['ISBN'].values:
            book_title = books.iloc[idx]['Book-Title']
            book_author = books.iloc[idx]['Book-Author']
            isbn_ratings = ratings[ratings["ISBN"] == similar_isbn]
            book_year = books.iloc[idx]['Year-Of-Publication']
            book_publisher = books.iloc[idx]['Book-Publisher']
            if not isbn_ratings.empty:
                mean_rating = isbn_ratings['Book-Rating'].mean()
                adjusted_score = mean_rating * sim_score
                rating_sims.append(float(adjusted_score))
    if rating_sims:
        average_adjusted_score = sum(rating_sims) / len(rating_sims)
        print(f"\nPredicted Rating: {average_adjusted_score:.2f}")

        return average_adjusted_score

def recommend_books_tfidf(ratings, books, similar_books):
    # Recommend books based on the cosine similarity scores of the tfidf based word processing

    for idx, sim_score in similar_books:
        similar_isbn = books.iloc[idx]['ISBN']
        if similar_isbn in books['ISBN'].values:
            book_title = books.iloc[idx]['Book-Title']
            book_author = books.iloc[idx]['Book-Author']
            isbn_ratings = ratings[ratings["ISBN"] == similar_isbn]
            book_year = books.iloc[idx]['Year-Of-Publication']
            book_publisher = books.iloc[idx]['Book-Publisher']
            if not isbn_ratings.empty:
                print(f"{book_title} by {book_author} published in {book_year} by {book_publisher}")

def recommend( wanted_book,books,ratings, recommend_state = False):
    print("\nChoosen book: ",wanted_book)
    added_books = pd.concat([books,wanted_book])
    books = prepare_data(added_books)

    print("\nPreprocessed data finished\n")
    wanted_ISBN = wanted_book.iloc[0]['ISBN']

    #These section is for the rating prediction
    vector = prepare_vector(books)
    print("Vectorized data finished\n")

    similar_books_vector = find_similar_books(wanted_ISBN, books, vector)

    # These section is for recommendation section
    if recommend_state:
      tfidf_books = prepare_data(added_books)
      tfidf = prepare_tfidf(tfidf_books)
      similar_books_tfidf = find_similar_books(wanted_ISBN, tfidf_books, tfidf)
    print("Find Similarity finished\n")

    if similar_books_vector:
        #this section will calculate and return the recommend books

        if recommend_state:
          print("\nRecommend Book\n")
          recommend_books_tfidf(ratings, tfidf_books, similar_books_tfidf)

        #this section will calculate and return the predicted value
        return recommend_books_vector(ratings, books, similar_books_vector)

"""This is the recommendation model that use cosine similarity to recommend
    book based on the user input book which it will give the predicted rating
    and give recommending books based on the tfidf vector. The recommend functionality
    is very computing intensive so it is set as False by default. However you can
    try and run it with the recommend_state = True like the following cell.
    """
def RecommendationSystem(wanted_book,books,ratings,recommend_state =False):

    return recommend(wanted_book,books,ratings,recommend_state)

In [3]:
#Please don't forget to provide the file path to the load_data
ratings, books = load_data('../../Data-Files/Raw-Files')
#It also recommend to run this program in cloud based ipynb as it is very computing intensive
#Our group use the kaggle and Google Colab to run these and the setting is cpu runtime type with high-ram
#if using the free version it will not run as it exceed googlecolab ram limit
RecommendationSystem((pd.DataFrame({'ISBN': ['0425163091'],
                                    'Book-Title': ['Chocolate Jesus'],
                                    'Book-Author': ['s jaramillo'],
                                    'Year-Of-Publication': [1998],
                                    'Book-Publisher': ['Berkley Publishing Group']})),
                     books,ratings,recommend_state =True)


Choosen book:           ISBN       Book-Title  Book-Author  Year-Of-Publication  \
0  0425163091  Chocolate Jesus  s jaramillo                 1998   

             Book-Publisher  
0  Berkley Publishing Group  

Preprocessed data finished

Vectorized data finished

Find Similarity finished


Recommend Book

Chocolate Quake by n fairbanks published in 2003 by Berkley Publishing Group
Jesus Freaks: DC Talk and The Voice of the Martyrs - Stories of Those Who Stood For Jesus, the Ultimate Jesus Freaks by d talk published in 1999 by Bethany House Publishers
Here on Earth by a hoffman published in 1998 by Berkley Publishing Group
N or M? by a christie published in 1996 by Berkley Publishing Group

Predicted Rating: 6.36


6.360266135096248

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
def delete_records_by_isbn(df, isbn):
    #Function for making train data set by delete the test data set out off the dataframe

    isbn_set = set(isbn['ISBN'])
    mask = (df['ISBN'].apply(lambda x: x not in isbn_set))
    updated_df = df[mask]
    return updated_df

def select_random_data(df, percentage):
    # Function for sampling a percentage of the dataframe using for test dataset

    return df.sample(n=int(len(df) * percentage))

def evaluate_model(percentage = 0.001): # this is fractional percentage 0.001 mean 0.1%
  #This function will evaluate the model which use mse, rmse, mae and r2 for evaluation
  ratings, books = load_data('../../Data-Files/Raw-Files')
  books = prepare_data(books)
  test_books = select_random_data(books,percentage)
  train_books = delete_records_by_isbn(books,test_books)
  train_ratings = delete_records_by_isbn(ratings,test_books)
  test_rating = ratings.groupby('ISBN')['Book-Rating'].mean().reset_index()
  test_rating.rename(columns={'Book-Rating': 'Avg-Book-Rating'}, inplace=True)
  test_rating = pd.merge(test_books,test_rating,on ="ISBN")
  pred_rate = []

  # Print the recommendations for each book
  for _, row in test_books.iterrows():
    row = (pd.DataFrame(row)).transpose()
    pred_rate.append(RecommendationSystem(row, train_books, train_ratings))
  #linear_rating = ratings.groupby('ISBN')['Book-Rating'].mean().reset_index()
  mse = mean_squared_error(test_rating['Avg-Book-Rating'], pred_rate)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(test_rating['Avg-Book-Rating'], pred_rate)
  r2 = r2_score(test_rating['Avg-Book-Rating'], pred_rate)
  print(f"R-squared: {r2:.2f}")
  print(f"Mean Squared Error: {mse:.2f}")
  print(f"Root Mean Squared Error: {rmse:.2f}")
  print(f"Mean Absolute Error: {mae:.2f}")

In [5]:
evaluate_model()


Choosen book:               ISBN                      Book-Title Book-Author  \
14739  0671769944  Sanctuary (Star Trek, Book 61)  j vornholt   

      Year-Of-Publication Book-Publisher                       Title-Tokens  \
14739                1992      Star Trek  [sanctuary, star, trek, book, 61]   

       Author-Tokens Publisher-Tokens  
14739  [j, vornholt]     [Star, Trek]  

Preprocessed data finished

Vectorized data finished

Find Similarity finished


Predicted Rating: 6.52

Choosen book:              ISBN                       Book-Title  Book-Author  \
1104  0061000035  The Dark Wind (Jim Chee Novels)  t hillerman   

     Year-Of-Publication Book-Publisher                    Title-Tokens  \
1104                1990    HarperTorch  [dark, wind, jim, chee, novel]   

       Author-Tokens Publisher-Tokens  
1104  [t, hillerman]    [HarperTorch]  

Preprocessed data finished

Vectorized data finished

Find Similarity finished


Predicted Rating: 5.91

Choosen book:          

In [9]:
evaluate_model(0.01)


Choosen book:               ISBN                                         Book-Title  \
10734  0373223188  Till Death Us Do Part (43 Light St.) (Harlequi...   

      Book-Author Year-Of-Publication Book-Publisher  \
10734      r york                1995      Harlequin   

                                            Title-Tokens Author-Tokens  \
10734  [till, death, u, part, 43, light, st, harlequi...     [r, york]   

      Publisher-Tokens  
10734      [Harlequin]  

Preprocessed data finished

Vectorized data finished

Find Similarity finished


Predicted Rating: 5.13

Choosen book:               ISBN                                 Book-Title Book-Author  \
14279  0486266842  Turn of the Screw (Dover Thrift Editions)     h james   

      Year-Of-Publication      Book-Publisher  \
14279                1991  Dover Publications   

                                Title-Tokens Author-Tokens  \
14279  [turn, screw, dover, thrift, edition]    [h, james]   

            Publisher-Tokens 