In [1]:
import os
import sys
import re
import html
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Reader, Dataset, SVD, dump
from surprise.model_selection import cross_validate  # used in place of evaluate
import surprise.accuracy as accuracy                 # keep accuracy

from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features, get_mapper, get_tags
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse
import xml_to_dict

In [2]:
# Base directory of the goodbooks-10k dataset (relative path); get_books() below
# looks for books.csv and the books_xml/ folder underneath it.
data_path = '../goodbooks-10k/'

In [3]:
def clean_string(s):
    """Remove HTML markup from a text field.

    Falsy input (``None`` or an empty string) is handed back untouched,
    because a book is frequently missing a given feature. For any other
    input, HTML entities are decoded first and every ``<...>`` tag-like
    span is then deleted.
    """
    if s:
        decoded = html.unescape(s)
        # drop anything that looks like an HTML tag
        return re.sub(r'<[^>]+>', '', decoded)
    return s

In [4]:
def get_books(data_path):
    """Build one record per book from the Goodreads XML metadata files.

    NOTE: this definition shadows the ``get_books`` imported from ``loader``
    in the import cell; once this cell has run, the name refers to this
    version.

    Parameters
    ----------
    data_path : str
        Dataset root directory. It must contain ``books.csv`` and the
        ``books_xml/books_xml`` folder. A trailing path separator is
        optional.

    Returns
    -------
    list of dict
        One dict per file found in the metadata folder, with the keys
        ``id``, ``title``, ``image_url``, ``url``, ``author`` and
        ``description``. Tag data is not loaded because no tag field is
        written into the records.

    Raises
    ------
    KeyError
        If a book's Goodreads id is missing from the id mapper or an
        expected field is absent from the parsed XML.
    """
    metadata_directory = os.path.join(data_path, 'books_xml', 'books_xml')
    goodreads_to_bookid = get_mapper(os.path.join(data_path, 'books.csv'))
    books = []

    # os.listdir returns entries in arbitrary order; sorting keeps runs reproducible
    for file in sorted(os.listdir(metadata_directory)):
        filename = os.path.join(metadata_directory, os.fsdecode(file))
        # the second return value (popular shelves) is not used for these records
        raw_book, _ = xml_to_dict.dict_from_xml_file(filename)
        raw = raw_book['book']

        # translate the Goodreads id into the dataset's own book id
        book_id = goodreads_to_bookid[raw['id']]

        # a single author appears to be parsed as a dict and several authors
        # as a list; with multiple authors only the first (main) one is kept
        author = raw['authors']['author']
        if not isinstance(author, dict):
            author = author[0]

        books.append({
            'id': book_id,
            'title': raw['title'],
            'image_url': raw['image_url'],
            'url': raw['url'],
            'author': author['name'],
            # description may be missing (None); clean_string passes that through
            'description': clean_string(raw['description']),
        })
    return books

In [5]:
# Parse every XML metadata file into a list of per-book dicts.
# This calls the get_books defined in this notebook, which shadows the
# get_books imported from loader in the import cell.
books = get_books(data_path)

In [8]:
# Turn the parsed records into a frame keyed by the integer book id, ordered by that id.
df = (
    pd.DataFrame(books)
    .astype({'id': int})
    .sort_values(by=['id'])
    .set_index('id')
)

# Books without a description get an empty string instead of NaN
df['description'] = df['description'].fillna('')

In [9]:
# Inspect which columns the frame ended up with.
# NOTE(review): the recorded output lists 'popular_shelves' and 'tags', which the
# get_books defined in this notebook never writes into its records - presumably
# left over from an earlier run; confirm with Restart Kernel -> Run All.
df.columns

Index(['title', 'image_url', 'url', 'author', 'description', 'popular_shelves',
       'tags'],
      dtype='object')

In [None]:
# Persist the frame to a scratch directory (the next cell reloads it);
# create the directory first so to_pickle does not fail when it is missing.
os.makedirs('../.tmp', exist_ok=True)
df.to_pickle('../.tmp/books_dataframe_reduced')

In [None]:

# Round-trip check: reload the pickle written by the previous cell and look at
# its shape, column dtypes and first ten rows.
# NOTE(review): display is not imported in this notebook - presumably the
# IPython-provided builtin; confirm if this code is ever run outside Jupyter.
books1 = pd.read_pickle('../.tmp/books_dataframe_reduced')
print(books1.shape)
print(books1.dtypes)
display(books1.head(10))


(10000, 7)
title              object
image_url          object
url                object
author             object
description        object
popular_shelves    object
tags               object
dtype: object


Unnamed: 0_level_0,title,image_url,url,author,description,popular_shelves,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"The Hunger Games (The Hunger Games, #1)",https://images.gr-assets.com/books/1447303603m...,https://www.goodreads.com/book/show/2767052-th...,Suzanne Collins,winning will make you famous. losing means cer...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
2,Harry Potter and the Sorcerer's Stone (Harry P...,https://images.gr-assets.com/books/1474154022m...,https://www.goodreads.com/book/show/3.Harry_Po...,J.K. Rowling,harry potter's life is miserable. his parents ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...
3,"Twilight (Twilight, #1)",https://images.gr-assets.com/books/1361039443m...,https://www.goodreads.com/book/show/41865.Twil...,Stephenie Meyer,about three things i was absolutely positive.f...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
4,To Kill a Mockingbird,https://images.gr-assets.com/books/1361975680m...,https://www.goodreads.com/book/show/2657.To_Ki...,Harper Lee,the unforgettable novel of a childhood in a sl...,classics classics classics classics classics ...,classics classics classics classics classics ...
5,The Great Gatsby,https://images.gr-assets.com/books/1490528560m...,https://www.goodreads.com/book/show/4671.The_G...,F. Scott Fitzgerald,"the great gatsby, f. scott fitzgerald’s third ...",classics classics classics classics classics ...,classics classics classics classics classics ...
6,The Fault in Our Stars,https://images.gr-assets.com/books/1360206420m...,https://www.goodreads.com/book/show/11870085-t...,John Green,"there is an alternate cover edition here.""i fe...",young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
7,The Hobbit,https://images.gr-assets.com/books/1372847500m...,https://www.goodreads.com/book/show/5907.The_H...,J.R.R. Tolkien,in a hole in the ground there lived a hobbit. ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...
8,The Catcher in the Rye,https://images.gr-assets.com/books/1398034300m...,https://www.goodreads.com/book/show/5107.The_C...,J.D. Salinger,the hero-narrator of the catcher in the rye is...,classics classics classics classics classics ...,classics classics classics classics classics ...
9,"Angels & Demons (Robert Langdon, #1)",https://images.gr-assets.com/books/1303390735m...,https://www.goodreads.com/book/show/960.Angels...,Dan Brown,when world-renowned harvard symbologist robert...,fiction fiction fiction fiction fiction ficti...,fiction fiction fiction fiction fiction ficti...
10,Pride and Prejudice,https://images.gr-assets.com/books/1320399351m...,https://www.goodreads.com/book/show/1885.Pride...,Jane Austen,"“it is a truth universally acknowledged, that ...",classics classics classics classics classics ...,classics classics classics classics classics ...
