In [172]:
import json
import time # Added time
import pickle
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances
from tqdm import tqdm


In [174]:
def read_raw_data(_num_samples, _fn):
    _df = pd.read_csv("goodreads_interactions.csv", nrows=_num_samples)
    _df = _df[_df.is_read == 1]
    _df = _df[0:_num_samples]
    _df.to_csv(f'goodreads_{_fn}.csv', index=False)

def build_rating_matrix(_df):
    _n_users = len(_df.user_id.unique()) + 1
    _n_books = _df.book_idx.max() + 1
    print(f'Users: {_n_users}')
    print(f'Books: {_n_books}')
    _ratings = np.zeros((_n_users, _n_books))
    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        i = row.user_id
        j = row.book_idx
        _ratings[i, j] = row.rating
    return _ratings

def recommend_item_similarity(_matrix, _eps, _n_latent):
    _item_svd = TruncatedSVD(n_components=_n_latent)
    _item_features = _item_svd.fit_transform(_matrix.transpose())
    print('Converting to sparse')
    _sparse_features = sparse.csr_matrix(_item_features)
    return _sparse_features

def generate_similarity_matrix(_features, _metric):
    assert _metric in ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
    print('Computing similarity')
    _similarity_matrix = pairwise_distances(_features, metric=_metric)
    return _similarity_matrix

def merge_meta(_meta_path, _map_path, _ratings):
    _meta = pd.read_csv(_meta_path)
    _map = pd.read_csv(_map_path)
    _ratings_map = _ratings.merge(_map, how='left', left_on='book_id', right_on='book_id_csv')
    _ratings_map = _ratings_map[['user_id', 'book_id_csv', 'is_read', 'rating', 'is_reviewed', 'book_id_y']]
    _ratings_map.columns = ['user_id', 'book_idx', 'is_read', 'rating', 'is_reviewed', 'book_id']
    _metadata_lookup = {}
    for _, row in _ratings_map.iterrows():
        _md = _meta[_meta['book_id'] == row['book_id']]
        if not _md.empty:
            _metadata_lookup[str(row.book_idx)] = {
                'title': _md['title'].values[0],
                'link': _md['link'].values[0]
            }
    return _ratings_map, _metadata_lookup


In [176]:
NS = 8000
FN = '8k'
EPS = 1e-9
FACTORS = 5  # Controls how much compression for the user book matrix
METRIC = 'euclidean'  # Similarity Distance Funtion, (SVD)


In [178]:
try:
    goodreads = pd.read_csv(f'goodreads_{FN}.csv')
except FileNotFoundError:
    read_raw_data(NS, FN)
    goodreads = pd.read_csv(f'goodreads_{FN}.csv')



In [180]:
ratings_meta, metadata_lookup = merge_meta(
    'book_metadata.csv',
    'book_id_map.csv',
    goodreads
)

with open(f'books_metadata_{FN}.json', 'w', encoding='utf-8') as m:
    json.dump(metadata_lookup, m)


In [182]:
ratings = build_rating_matrix(ratings_meta)
item_features = recommend_item_similarity(ratings, EPS, FACTORS)

start_time = time.time() # Start time
sim = generate_similarity_matrix(item_features, METRIC)
end_time = time.time() # End Time

print(f"Similarity computation time: {end_time - start_time:.2f} seconds")



Users: 11
Books: 7519


100%|██████████| 2024/2024 [00:00<00:00, 64077.23it/s]

Converting to sparse
Computing similarity





Similarity computation time: 0.40 seconds


In [184]:
with open(f'book_similarity_{FACTORS}_{FN}_{METRIC}.pkl', 'wb') as f:
    pickle.dump(sim, f)
print("Similarity matrix saved.")



Similarity matrix saved.


In [186]:
import json
import pickle

def test_recommender(_search, _similarity, _metadata):
    """
    A function to test our recommender system.
    :param _search: A book ID to search for.
    :param _similarity: Our recommender similarity matrix.
    :param _metadata: Mapping of book ID to title.
    :return: List of titles of top 5 most similar books.
    """
    row_sims = _similarity[_search, ]
    res = sorted(range(len(row_sims)), key=lambda sub: row_sims[sub])[-5:]
    print('Searched for book:', _metadata[str(_search)]['title'])
    for j, _ in enumerate(res):
        print(f'Match {j + 1}: {_metadata[str(res[j])]["title"]}')

In [188]:
# Match the build paramters if changed
FN = '8k'
FACTORS = 5  # Compression
METRIC = 'euclidean'  # SVD

SIM_PATH = f'book_similarity_{FACTORS}_{FN}_{METRIC}.pkl'
META_PATH = f'books_metadata_{FN}.json'

# Load similarity matrix
with open(SIM_PATH, 'rb') as f:
    sim = pickle.load(f)
print("Similarity matrix loaded.")

# Load metadata
with open(META_PATH, 'r', encoding='utf-8') as m:
    metadata_lookup = json.load(m)
print("Metadata loaded.")

Similarity matrix loaded.
Metadata loaded.


In [190]:
# Book ID 948 
test_recommender(948, sim, metadata_lookup)


Searched for book: The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy, #1-5)
Match 1: Memoirs of a Geisha
Match 2: Catching Fire (The Hunger Games, #2)
Match 3: The Scorch Trials (Maze Runner, #2)
Match 4: Pride and Prejudice
Match 5: Insurgent (Divergent, #2)


In [244]:
import json
import time
import pickle
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances
from tqdm import tqdm


def merge_meta(_meta_path, _map_path, _ratings):
    """
    Merges book metadata with ratings.
    Returns ratings with book_idx (needed for building matrix).
    """
    _meta = pd.read_csv(_meta_path)
    _map = pd.read_csv(_map_path)
    _ratings_map = _ratings.merge(_map, how='left',
                                  left_on='book_id', right_on='book_id_csv')
    _ratings_map = _ratings_map[['user_id', 'book_id_csv', 'is_read',
                                 'rating', 'is_reviewed', 'book_id_y']]
    _ratings_map.columns = ['user_id', 'book_idx', 'is_read',
                            'rating', 'is_reviewed', 'book_id']
    return _ratings_map


def build_rating_matrix(_df):
    _n_users = len(_df.user_id.unique()) + 1
    _n_books = _df.book_idx.max() + 1
    print(f'Users: {_n_users}')
    print(f'Books: {_n_books}')
    _ratings = np.zeros((_n_users, _n_books))
    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        i = row.user_id
        j = row.book_idx
        _ratings[i, j] = row.rating
    return _ratings


def recommend_user_similarity(_matrix, _eps, _n_latent):
    svd = TruncatedSVD(n_components=_n_latent)
    user_features = svd.fit_transform(_matrix)
    print('Converting to sparse')
    return sparse.csr_matrix(user_features)


def generate_similarity_matrix(_features, _metric):
    print('Computing similarity...')
    _similarity_matrix = pairwise_distances(_features, metric=_metric)
    return _similarity_matrix


# --- Main execution ---

NS = 5000           # Sample size
FN = '5k'           # File name suffix
EPS = 1e-9          # Epsilon for numerical stability (not used here directly)
FACTORS = 2         # Latent factors for SVD
METRIC = 'cityblock'   # Similarity metric

# Load raw ratings file
ratings_df_raw = pd.read_csv(f'goodreads_{FN}.csv')

# Merge metadata to get book_idx
ratings_df = merge_meta('book_metadata.csv', 'book_id_map.csv', ratings_df_raw)

# Build user-item matrix
ratings_matrix = build_rating_matrix(ratings_df)

# Generate user similarity features
user_features = recommend_user_similarity(ratings_matrix, EPS, FACTORS)

# Compute user-user similarity matrix
start = time.time()
user_sim = generate_similarity_matrix(user_features, METRIC)
end = time.time()
print(f'User similarity computation time: {end - start:.2f} seconds')

# Save similarity matrix to file
with open(f'user_similarity_{FACTORS}_{FN}_{METRIC}.pkl', 'wb') as f:
    pickle.dump(user_sim, f)

print("User similarity matrix saved.")


Users: 7
Books: 7076


100%|██████████| 1329/1329 [00:00<00:00, 72838.14it/s]

Converting to sparse
Computing similarity...
User similarity computation time: 0.00 seconds
User similarity matrix saved.





In [246]:
# Check to see what the user ID range is

print(user_sim.shape)


(7, 7)


In [250]:
import json
import pickle
import pandas as pd


def test_user_recommender(_user_id, _similarity, _ratings_df, _metadata, _top_n=5):
    """
    Prints the top N most similar users to the given user_id and their top-rated books.

    :param _user_id: The user ID to search for.
    :param _similarity: The user-user similarity matrix.
    :param _ratings_df: DataFrame with user-book ratings (must include book_idx and rating).
    :param _metadata: Dict with book_idx -> {title, link}.
    :param _top_n: Number of similar users to return.
    """
    user_similarities = _similarity[_user_id]
    similar_users = sorted(
        [(i, sim) for i, sim in enumerate(user_similarities) if i != _user_id],
        key=lambda x: x[1]
    )[:_top_n]  # Closest users (lowest distance for cosine)

    print(f"\nMost similar users to User {_user_id}:\n")
    for rank, (uid, sim) in enumerate(similar_users, start=1):
        print(f"User {uid} (Similarity score: {sim:.4f})")

        user_books = _ratings_df[_ratings_df.user_id == uid]
        top_books = user_books.sort_values(by='rating', ascending=False).head(3)

        for _, row in top_books.iterrows():
            book_idx = str(int(row.book_idx))  # Ensure string keys match metadata keys
            if book_idx in _metadata:
                print(f"  - {row.rating:.1f}: {_metadata[book_idx]['title']}")
        print("")


if __name__ == "__main__":
    # Match these to the build script
    FN = '5k'
    FACTORS = 2
    METRIC = 'cityblock'

    # Load similarity matrix
    with open(f'user_similarity_{FACTORS}_{FN}_{METRIC}.pkl', 'rb') as f:
        user_sim = pickle.load(f)

    # Load metadata
    with open(f'books_metadata_{FN}.json', 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Load and merge ratings
    ratings_raw = pd.read_csv(f'goodreads_{FN}.csv')

    def merge_meta(_meta_path, _map_path, _ratings):
        _meta = pd.read_csv(_meta_path)
        _map = pd.read_csv(_map_path)
        _ratings_map = _ratings.merge(_map, how='left',
                                      left_on='book_id', right_on='book_id_csv')
        _ratings_map = _ratings_map[['user_id', 'book_id_csv', 'is_read',
                                     'rating', 'is_reviewed', 'book_id_y']]
        _ratings_map.columns = ['user_id', 'book_idx', 'is_read',
                                'rating', 'is_reviewed', 'book_id']
        return _ratings_map

    ratings_df = merge_meta('book_metadata.csv', 'book_id_map.csv', ratings_raw)

    # Test user similarity for a given user
    test_user_id = 3  # You can try different values from the ID check
    test_user_recommender(test_user_id, user_sim, ratings_df, metadata)



Most similar users to User 3:

User 6 (Similarity score: 0.6801)

User 4 (Similarity score: 2.9406)
  - 5.0: The Fault in Our Stars
  - 5.0: Wonder
  - 5.0: Valley of the Dolls

User 1 (Similarity score: 5.0589)
  - 5.0: Ramona Forever (Ramona, #7)
  - 5.0: The Fellowship of the Ring (The Lord of the Rings, #1)
  - 5.0: A Bear Called Paddington (Paddington, #1)

User 2 (Similarity score: 5.5108)
  - 5.0: Peace Like a River
  - 5.0: Little Bee
  - 5.0: The Poisonwood Bible

User 5 (Similarity score: 68.8338)
  - 5.0: The Hunger Games (The Hunger Games, #1)
  - 5.0: Divergent (Divergent, #1)
  - 5.0: Resolved to Rule (Blood and Snow, #11)



In [252]:
import zipfile

# Define the name of your zip file
zip_name = 'Roy_Phelps_HW2_Code.zip'

# List of files to include (update these as needed)
files_to_zip = [
    'Roy_Phelps_HW2_Code.ipynb'
]

# Create the zip file
with zipfile.ZipFile(zip_name, 'w') as z:
    for file in files_to_zip:
        z.write(file)

print(f"{zip_name} created with {len(files_to_zip)} files.")

Roy_Phelps_HW2_Code.zip created with 1 files.
