In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import gzip
import requests
import io
import os
from pyspark.sql import SparkSession
import pyspark.pandas as ps
import pyarrow
from data_processing import load_data



In [2]:
# Create (or reuse) the Spark session before any data is loaded so that the
# 6 GB driver-memory setting is in place. Binding it to `spark` keeps the
# session repr out of the cell output and lets later cells reference it.
spark = (
    SparkSession.builder
    .appName('goodreads recsys')
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

24/04/07 13:50:37 WARN Utils: Your hostname, Seans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.216 instead (on interface en0)
24/04/07 13:50:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/07 13:50:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Parameters controlling where the Goodreads data sets are read from.
# The cells below use them to build a pyspark.pandas DataFrame of book info.

# local_copy: bool - if False, data will be downloaded from the online repository.
local_copy = True

# local_dir: str - directory containing the local files; used if local_copy is True.
local_dir = 'data/'

# file_names: dict or None - file names for books, authors, genres, interactions
# and reviews, if they differ from the names used in the repository.
file_names = None

# Every data set lives under the same repository folder; only the file name differs.
_repo_root = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/'

books_url = _repo_root + 'goodreads_books.json.gz'
authors_url = _repo_root + 'goodreads_book_authors.json.gz'
genres_url = _repo_root + 'goodreads_book_genres_initial.json.gz'
interactions_url = _repo_root + 'goodreads_interactions.csv'
reviews_url = _repo_root + 'goodreads_reviews_spoiler_raw.json.gz'

def extract_file_name(url):
    """Return the text after the last '/' in ``url``.

    :param url: str: URL or path to take the trailing segment from
    :return: str: the final '/'-separated segment; the whole string if it contains no '/'
    """
    _, _, tail = url.rpartition('/')
    return tail

# Default file name per data set: the last path segment of its download URL.
default_file_names = {
    dataset: extract_file_name(url)
    for dataset, url in [
        ('books', books_url),
        ('authors', authors_url),
        ('genres', genres_url),
        ('interactions', interactions_url),
        ('reviews', reviews_url),
    ]
}

# Use the defaults outright when no overrides were given; otherwise fill in
# only the data sets the caller did not name.
if not file_names:
    file_names = default_file_names
else:
    for dataset, default_name in default_file_names.items():
        if dataset not in file_names:
            file_names[dataset] = default_name

def _fetch_to_buffer(url):
    """Download ``url`` and return the response body as an in-memory binary buffer.

    :param url: str: address of the file to download
    :return: io.BytesIO wrapping the downloaded bytes
    :raises requests.HTTPError: if the server answers with an error status
    """
    response = requests.get(url)
    # Fail here on an HTTP error rather than passing an error page on to the parsers.
    response.raise_for_status()
    return io.BytesIO(response.content)


if local_copy:
    # os.path.join works whether or not local_dir ends with a path separator.
    books_path = os.path.join(local_dir, file_names['books'])
    authors_path = os.path.join(local_dir, file_names['authors'])
    genres_path = os.path.join(local_dir, file_names['genres'])
    interactions_path = os.path.join(local_dir, file_names['interactions'])
    reviews_path = os.path.join(local_dir, file_names['reviews'])
else:
    # Each file is downloaded completely into memory before it is parsed.
    books_path = _fetch_to_buffer(books_url)
    authors_path = _fetch_to_buffer(authors_url)
    genres_path = _fetch_to_buffer(genres_url)
    interactions_path = _fetch_to_buffer(interactions_url)
    reviews_path = _fetch_to_buffer(reviews_url)

In [None]:
# ps.read_json expects a file path, not an open file object, so the path is
# passed directly; gzip-compressed JSON-lines files are decompressed on read.
# NOTE(review): this needs local_copy=True - the in-memory buffers created when
# local_copy is False are not paths and cannot be read this way.
books_df = ps.read_json(books_path, lines=True, index_col='book_id')

In [None]:
# ps.read_json expects file paths, not open file objects, so the paths are
# passed directly; gzip-compressed JSON-lines files are decompressed on read.
# NOTE(review): this needs local_copy=True - the in-memory buffers created when
# local_copy is False are not paths and cannot be read this way.
authors_df = ps.read_json(authors_path, lines=True, index_col='author_id')
genres_df = ps.read_json(genres_path, lines=True, index_col='book_id')
reviews_df = ps.read_json(reviews_path, lines=True, index_col='book_id')
int_df = ps.read_csv(interactions_path)

In [None]:
# Columns removed from the books frame before the author and genre joins.
unused_book_columns = [
    'series',
    'asin',
    'kindle_asin',
    'similar_books',
    'link',
    'url',
    'image_url',
    'edition_information',
    'title_without_series',
    'popular_shelves',
    'publisher',
]
books_df = books_df.drop(columns=unused_book_columns)

def extract_authors(authors_dict):
    """Collect the ``author_id`` of every entry in a book's author list.

    :param authors_dict: iterable of mappings that each have an 'author_id' key,
        or None when the book carries no author information
    :return: list: the 'author_id' values in input order; empty for None or an empty input
    """
    if authors_dict is None:
        # A missing author list is treated like an empty one instead of raising TypeError.
        return []
    return [author['author_id'] for author in authors_dict]

# Attach each book's list of author ids, then expand to one row per
# (book, author) pair.
authors_column = books_df['authors'].apply(extract_authors)
books_df['author_id'] = authors_column
# book_id is already the index (index_col='book_id' when the file was read), so
# calling set_index('book_id') here would raise a KeyError. Instead, carry
# book_id as an ordinary column through the joins and restore it as the index
# once at the end, so the result does not depend on whether a column-keyed
# join preserves the caller's index.
books_df = books_df.explode('author_id').reset_index()
# Rows whose author list was empty have no author_id after the explode; give
# them id 0 so the integer cast succeeds.
books_df['author_id'] = books_df['author_id'].fillna(0).astype('int64')
# Match each row's author_id against authors_df's index (author_id); column
# names present in both frames get the '_book' / '_author' suffixes.
books_df = books_df.join(authors_df, how='inner', on='author_id', lsuffix='_book', rsuffix='_author')

# Expand the per-book 'genres' mapping into one column per genre.
# NOTE(review): apply(ps.Series) mirrors the pandas idiom for this expansion -
# confirm that pyspark.pandas supports it for this column.
genres_df = genres_df['genres'].apply(ps.Series).join(genres_df)
genres_df = genres_df.drop(columns=['genres']).fillna(0)
# genres_df is likewise already indexed by book_id (index_col at read time),
# so no set_index is needed before joining on it.
books_df = books_df.join(genres_df, how='inner', on='book_id').set_index('book_id')