In [1]:
import sys
import warnings

warnings.filterwarnings('ignore')
sys.path.append(r"C:\Users\13477\Desktop\New Adventure\Goodreads\goodreads_prod")

from static import *
from UserScraper import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

from joblib import Parallel, delayed
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def process_raw_books(books):
    """Return the books table without duplicate titles or incomplete rows.

    For each ``title`` only the first row encountered is kept; afterwards any
    row that still has a missing value in any column is dropped. The input
    frame is not modified.
    """
    unique_titles = books.drop_duplicates(subset=['title'])
    return unique_titles.dropna()

def process_raw_reviews(reviews):
    """Keep at most one review per (title, user_id) pair and drop incomplete rows.

    When the same user appears more than once for a title, only the first
    occurrence is kept. Rows with a missing value in any column are removed
    afterwards. The input frame is not modified.
    """
    one_review_per_pair = reviews.drop_duplicates(subset=['title', 'user_id'], keep='first')
    complete_rows = one_review_per_pair.dropna()

    return complete_rows


### Data preprocessing

#### Load data from parquets (scraped from goodreads)

In [2]:
# Dataset size noted by the author: 16,578 books, 476,364 reviews, 9466 complete user profiles
# Books: load, then drop duplicate titles and rows with missing values.
all_books = pd.read_parquet("all_books_final.parquet")
all_books = process_raw_books(all_books)
# Overall rating summary per title; taken BEFORE the review-based filter below,
# so it still covers every de-duplicated book.
all_books_ratings = all_books[['title', 'rating', 'num_ratings']]

# Reviews: load, then keep one review per (title, user_id) and drop incomplete rows.
all_reviews = pd.read_parquet("english_reviews_final.parquet")
all_reviews = process_raw_reviews(all_reviews)

# User profiles; a user is "notable" when flagged as best reviewer or most followed.
users_data = pd.read_parquet("users_data_final.parquet")
users_data['is_notable'] = users_data['is_best_reviewer'] | users_data['is_most_followed']

# Keep only books that have at least one review, and collect the ids of users with profile data.
all_books = all_books[all_books.title.isin(all_reviews.title)]
users_with_data = users_data.user_id.unique()

### Book rating normalizer

In [3]:
def book_rating_std(mean_rating, num_ratings, five_stars, four_stars, three_stars, two_stars, one_star):
    """Sample standard deviation of a book's ratings, computed from its star histogram.

    Args:
        mean_rating: The book's mean rating.
        num_ratings: Total number of ratings; used as ``n`` in the ``n - 1`` denominator.
        five_stars .. one_star: Number of ratings given at each star value.

    Returns:
        ``sqrt(sum(count * (stars - mean) ** 2) / (num_ratings - 1))``, or ``0``
        when ``num_ratings <= 1`` (no spread can be estimated).
    """
    if num_ratings <= 1:
        return 0  # placeholder; callers dividing by this value get inf/NaN

    histogram = (
        (5, five_stars),
        (4, four_stars),
        (3, three_stars),
        (2, two_stars),
        (1, one_star),
    )

    squared_deviation_total = 0
    for stars, count in histogram:
        squared_deviation_total += count * (stars - mean_rating) ** 2

    return np.sqrt(squared_deviation_total / (num_ratings - 1))

def rating_zscore(r, mean_ratings, std_ratings):
    """Standardise rating ``r``: ``(r - mean) / std``.

    Operates element-wise when the mean and standard deviation are arrays.
    """
    return (r - mean_ratings) / std_ratings

def construct_book_rating_stats(all_books):
    """Build a title-indexed table of rating statistics for every book.

    The result holds the mean rating (renamed to ``mean_rating``), the rating
    counts per star value, a ``rating_std`` column computed by
    ``book_rating_std``, and one ``<n>_zscore`` column for each star value
    n = 5..1 giving the z-score of that star value for the book.

    Args:
        all_books: Frame with the columns ``title``, ``rating``,
            ``num_ratings`` and ``five_stars`` .. ``one_star``.

    Returns:
        A new DataFrame indexed by ``title``.
    """
    source_columns = ['title', 'rating', 'num_ratings', 'five_stars', 'four_stars', 'three_stars', 'two_stars', 'one_star']
    stats = (
        all_books[source_columns]
        .rename(columns={'rating': 'mean_rating'})
        .set_index('title')
    )

    def _std_for_row(row):
        # Unpack one book's histogram into book_rating_std's positional arguments.
        return book_rating_std(
            row['mean_rating'],
            row['num_ratings'],
            row['five_stars'],
            row['four_stars'],
            row['three_stars'],
            row['two_stars'],
            row['one_star'],
        )

    stats['rating_std'] = stats.apply(_std_for_row, axis=1)

    # z-score of each possible star value, vectorised over all books
    book_means = stats.mean_rating.values
    book_stds = stats.rating_std.values

    for star_value in (5, 4, 3, 2, 1):
        stats[f'{star_value}_zscore'] = rating_zscore(star_value, book_means, book_stds)

    return stats

In [4]:
# Load precomputed per-book rating statistics from disk.
# NOTE(review): presumably produced from construct_book_rating_stats — confirm. Later cells
# read a `title` column and the `<n>_zscore` columns from this frame.
books_rating_stats = pd.read_parquet('book_rating_stats.parquet')

#### Label reviews with genres

In [5]:
def construct_book_genre_labels(all_books):
    """One-hot encode each book's genres next to its title.

    Uses a ``MultiLabelBinarizer`` restricted to the module-level ``genres``
    list, so the output has one indicator column per entry of ``genres``.

    Args:
        all_books: Frame with a ``title`` column and a ``genres`` column
            holding an iterable of genre labels per book.

    Returns:
        DataFrame with ``title`` followed by the genre indicator columns,
        sharing ``all_books``' index.
    """
    binarizer = MultiLabelBinarizer(classes=genres)
    indicators = pd.DataFrame(
        binarizer.fit_transform(all_books['genres']),
        columns=binarizer.classes_,
        index=all_books.index,
    )

    return pd.concat([all_books[['title']], indicators], axis=1)

def label_reviews_with_genre(all_reviews, genre_labels):
    """Attach genre indicator columns to every review via an inner join on ``title``.

    Reviews whose title has no entry in ``genre_labels`` are dropped by the
    inner join. Rows that repeat the same (title, user_id, rating) triple are
    then collapsed to the first occurrence.
    """
    labeled = all_reviews.merge(genre_labels, how='inner', on='title')
    return labeled.drop_duplicates(subset=['title', 'user_id', 'rating'])

# One-hot genre table for the reviewed books, then every review joined with its book's genres.
genre_labels = construct_book_genre_labels(all_books)
all_labeled_reviews = label_reviews_with_genre(all_reviews, genre_labels)

#### User genre counts

For each user (col), how many books from each genre (row) have they read?

In [6]:
def get_user_genre_counts(reviews):
    """Summarise how much each user reads from each genre.

    Args:
        reviews: Genre-labelled reviews with ``user_id``, ``title`` and one
            indicator column per entry of the module-level ``genres`` list.

    Returns:
        ``(user_genre_counts, user_genre_pct)``; both have genres as rows and
        users as columns. Counts are the per-user sums of the genre
        indicators; percentages are those counts divided by the user's
        number of reviews.
    """
    reviews_by_user = reviews.groupby('user_id')

    user_genre_counts = reviews_by_user[genres].sum().T  # transpose: genres become the index
    reviews_per_user = reviews_by_user['title'].count()
    user_genre_pct = user_genre_counts.div(reviews_per_user, axis=1)

    return user_genre_counts, user_genre_pct

In [7]:
# Genre counts/percentages over ALL labelled reviews ...
user_genre_counts, user_genre_pct = get_user_genre_counts(all_labeled_reviews)
# ... and again restricted to reviews written by users present in users_data.
compact_labeled_reviews = all_labeled_reviews[all_labeled_reviews.user_id.isin(users_with_data)]
compact_user_genre_count, compact_user_genre_pct = get_user_genre_counts(compact_labeled_reviews)

In [8]:
# Per-genre mean and standard deviation of the users' reading percentages
# (axis=1 aggregates across the user columns of the genre x user table).
genre_pcts_means = user_genre_pct.mean(axis = 1)
genre_pcts_stds =  user_genre_pct.std(axis = 1)
genre_reading_stats = pd.DataFrame({'genre_pct_mean': genre_pcts_means, 'genre_pct_std': genre_pcts_stds})

#### User item matrix

In [9]:
def construct_user_item_matrix(reviews):
    """Pivot reviews into a user x title rating matrix.

    Multiple ratings by the same user for the same title are averaged.
    Missing (user, title) combinations are filled with 0, which the rest of
    the notebook treats as "not rated".
    """
    mean_rating_per_pair = (
        reviews
        .groupby(['user_id', 'title'])['rating']
        .mean()
        .reset_index()
    )

    return (
        mean_rating_per_pair
        .pivot(index='user_id', columns='title', values='rating')
        .fillna(0)
    )

def condense_user_item_matrix(user_item_matrix, n = 3):
    """Drop sparse rows, then sparse columns, from a user x title matrix.

    First keeps the rows (users) with at least ``n`` non-zero entries; then,
    counting only within those remaining rows, keeps the columns (titles)
    with at least ``n`` non-zero entries. The filter is applied once, so rows
    are not re-checked after the column filter.
    """
    ratings_per_user = user_item_matrix.astype(bool).sum(axis=1)
    active_users = user_item_matrix[ratings_per_user >= n]

    ratings_per_title = active_users.astype(bool).sum(axis=0)
    return active_users.loc[:, ratings_per_title >= n]

def center_user_item_matrix(user_item_matrix):
    """Mean-centre each row's ratings, ignoring unrated (zero) entries.

    Assumes users are rows. Each user's mean is computed from their non-zero
    ratings only and subtracted from those ratings; unrated entries stay 0.

    Note that a rating exactly equal to the user's mean also becomes 0 and is
    then indistinguishable from "not rated" for downstream code.

    Args:
        user_item_matrix: DataFrame of ratings where 0 means "not rated".

    Returns:
        A new DataFrame of the same shape holding the centred ratings.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias no longer exists in NumPy 2.x.
    rated_only = user_item_matrix.replace(0, np.nan)
    user_means = rated_only.mean(axis = 1)  # NaNs are skipped, so only real ratings count

    centered = rated_only.sub(user_means, axis = 0)
    return centered.fillna(0)

In [10]:
def get_title_zscore_dict(ex_title, books_rating_stats = books_rating_stats):
    """Map each star value (5..1) to its z-score for one book.

    Args:
        ex_title: Title to look up.
        books_rating_stats: Rating-statistics table containing the
            ``5_zscore`` .. ``1_zscore`` columns. The title may be stored
            either in a ``title`` column or as the index.

    Returns:
        ``{5: z5, 4: z4, 3: z3, 2: z2, 1: z1}`` for the matching row, or an
        empty dict when the title is not present.
    """
    z_score_cols = ['5_zscore', '4_zscore', '3_zscore', '2_zscore', '1_zscore']
    rating_values = [5, 4, 3, 2, 1]

    # The original lookup required a `title` column, but construct_book_rating_stats
    # returns a frame indexed by title; accept both layouts.
    if 'title' in books_rating_stats.columns:
        is_requested_title = books_rating_stats['title'] == ex_title
    else:
        is_requested_title = books_rating_stats.index == ex_title

    title_zscores = books_rating_stats.loc[is_requested_title, z_score_cols].values.ravel()

    # zip stops at the shorter input: no matching row -> {}, several rows -> the first row wins.
    return dict(zip(rating_values, title_zscores))

def get_zscore_for_title(user_item_matrix, title):
    """Return one title's rating column with star values replaced by z-scores.

    The star-to-z-score mapping comes from ``get_title_zscore_dict``. Entries
    equal to 0 ("not rated") are forced back to 0 after the substitution.
    Values that are not keys of the mapping are left unchanged.
    """
    star_to_zscore = get_title_zscore_dict(title)

    raw_ratings = user_item_matrix[title]
    converted = raw_ratings.replace(star_to_zscore)

    return converted.where(raw_ratings != 0, 0)
    

In [11]:
# Raw-rating user x title matrix built from the reviews of users that have profile data.
main_user_item_matrix = construct_user_item_matrix(compact_labeled_reviews)

In [12]:
# Load a precomputed matrix from disk instead of recomputing it here.
# NOTE(review): judging by the file name this is the z-score-normalised, user-centred
# user x title matrix — confirm how it was produced; the generating code is not in this notebook.
centered_zscore_matrix = pd.read_parquet("zscore_normed_centered_user_item_matrix.parquet")

### Genre recommender

#### Find top reviewers for genre

In [13]:
def get_score(count, pct, alpha = 1):
    """Combine a volume measure and a share into one score: ``count * pct ** alpha``.

    A larger ``alpha`` makes the score depend more strongly on ``pct``.
    Works element-wise on pandas/NumPy inputs.
    """
    return count * pct ** alpha

def user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct):
    """Collect, per user, the review count and review share for one genre.

    Args:
        my_genre: Row label to select from both tables.
        user_genre_counts: Genre x user table of review counts.
        user_genre_pct: Genre x user table of review shares.

    Returns:
        DataFrame indexed by user with columns ``review_count`` and
        ``review_pct``. Both inputs are sorted descending before being
        combined; pandas aligns them on the user label.
    """
    counts_desc = user_genre_counts.loc[my_genre, :].sort_values(ascending=False)
    pcts_desc = user_genre_pct.loc[my_genre, :].sort_values(ascending=False)

    return pd.DataFrame({"review_count": counts_desc, "review_pct": pcts_desc})

def get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = 1):
    """Rank users by how strongly they read ``my_genre``.

    The score is ``review_count * review_pct ** alpha`` (see ``get_score``).
    Users with no reviews in the genre are removed.

    Returns:
        DataFrame indexed by user with ``review_count``, ``review_pct`` and
        ``score``, sorted by ``score`` descending.
    """
    ranker = user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct)
    ranker['score'] = get_score(ranker['review_count'], ranker['review_pct'], alpha=alpha)

    has_read_genre = ranker.review_count > 0
    return ranker[has_read_genre].sort_values(by='score', ascending=False)

def get_top_n_reviewers(ranker, n):
    """Return the first ``n`` rows of ``ranker`` with normalised weights added.

    Args:
        ranker: DataFrame with a ``score`` column, already sorted so that the
            best reviewers come first.
        n: Number of rows to keep.

    Returns:
        A new DataFrame (``ranker`` is left untouched) with an extra
        ``score_normed`` column: each score divided by the sum of the kept
        scores, so the weights add up to 1. If the kept scores sum to 0 the
        weights are NaN/inf.
    """
    # Copy explicitly: assigning a column onto the result of head() is the
    # classic SettingWithCopy pattern.
    top_n = ranker.head(n).copy()
    top_n['score_normed'] = top_n['score'] / top_n['score'].sum()

    return top_n

#### Suggest books from top reviewers

In [14]:
def filter_reviews_for_genre(my_genre, labeled_reviews):
    """Return the ``title``/``user_id``/``rating`` columns of reviews tagged with ``my_genre``.

    A review belongs to the genre when its ``my_genre`` indicator column equals 1.
    """
    in_genre = labeled_reviews[my_genre] == 1
    genre_reviews = labeled_reviews[in_genre].drop(columns=my_genre)
    return genre_reviews[['title', 'user_id', 'rating']]

def get_expert_user_item_matrix(user_item_matrix, experts):
    """Select the experts' rows from a user x title matrix, ordered like ``experts``.

    Args:
        user_item_matrix: DataFrame indexed by user id.
        experts: Index-like of user ids.

    Returns:
        The rows for ``experts`` in the order given. The final label-based
        selection requires every expert to be present in the matrix
        (pandas raises ``KeyError`` for missing labels in ``.loc``).
    """
    is_expert_row = user_item_matrix.index.isin(experts)
    expert_rows = user_item_matrix[is_expert_row]

    # Re-select by label so the row order follows `experts`.
    return expert_rows.loc[experts]

def adjust_expert_user_item_matrix(expert_user_item_matrix, adjust_value = 3):
    """Shift every non-zero rating down by ``adjust_value``; zeros stay zero.

    Zero entries are treated as "not rated" and left alone. A rating equal to
    ``adjust_value`` becomes 0 as well.
    """
    is_unrated = expert_user_item_matrix == 0
    shifted = expert_user_item_matrix - adjust_value
    return expert_user_item_matrix.where(is_unrated, shifted)

def lookup_rating(user_item_matrix, user_id, book_name):
    """Return the matrix entry at row ``user_id`` and column ``book_name``."""
    rating = user_item_matrix.loc[user_id, book_name]
    return rating

def ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix):
    """List the experts who rated ``book_name`` together with their weights.

    Args:
        book_name: Column label in ``expert_user_item_matrix``.
        top_n_reviewers: DataFrame indexed by expert id with a ``score_normed`` column.
        expert_user_item_matrix: Expert x title rating matrix; 0 means "not rated".

    Returns:
        DataFrame indexed by expert with ``score_normed`` and ``book_rating``,
        restricted to experts whose rating for the book is non-zero.
    """
    readers = pd.DataFrame(top_n_reviewers['score_normed'])
    readers['book_rating'] = [
        lookup_rating(expert_user_item_matrix, expert_id, book_name)
        for expert_id in top_n_reviewers.index
    ]

    return readers[readers.book_rating != 0]

def avg_expert_rating(book_name, top_n_reviewers, expert_user_item_matrix):
    """Weighted average rating of ``book_name`` among the experts who rated it.

    Each expert's rating is weighted by their ``score_normed``; the weights
    are re-normalised over the experts who actually rated the book.

    Args:
        book_name: Column label in ``expert_user_item_matrix``.
        top_n_reviewers: DataFrame indexed by expert id with a ``score_normed`` column.
        expert_user_item_matrix: Expert x title rating matrix; 0 means "not rated".

    Returns:
        ``(weighted_average, number_of_experts_who_rated)``. When no expert
        rated the book the average is NaN and the count is 0.
    """
    wavgs = ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix)
    n_readers = len(wavgs)

    # No readers would mean 0/0 below; return NaN explicitly instead of relying
    # on a NumPy divide warning to produce it.
    if n_readers == 0:
        return np.nan, 0

    res = np.dot(wavgs['score_normed'], wavgs['book_rating'])/np.sum(wavgs['score_normed'])

    return res, n_readers

def get_expert_ratings(expert_user_item_matrix, top_n_reviewers):
    """Score every book as the weighted sum of the experts' ratings.

    For each book: ``sum(score_normed * rating)`` over all experts. Because it
    is a sum rather than an average, books rated by more experts move the
    score further (e.g. 10 positive ratings outweigh 5 positive ratings).

    Returns:
        Single-column DataFrame ``expert_metric`` indexed by title, sorted
        descending.

    NOTE(review): ``drop_duplicates`` compares rows by VALUE here, so books
    that share exactly the same ``expert_metric`` (for instance every book
    scoring 0) collapse to the first one. Presumably intended to thin out
    unrated books — confirm that dropping tied books is acceptable.
    """
    weights = top_n_reviewers['score_normed']

    expert_ratings = expert_user_item_matrix.T.dot(weights).to_frame('expert_metric')

    return (
        expert_ratings
        .drop_duplicates()
        .sort_values(by='expert_metric', ascending=False)
    )

def merge_expert_with_overall(expert_rating, all_books_rating, num_reviewers = 50):
    """Join the expert scores with the books' overall rating summary.

    Args:
        expert_rating: DataFrame indexed by title containing
            ``expert_wavg_rating``, ``experts_who_read_out_of_{num_reviewers}``
            and ``expert_metric``.
        all_books_rating: DataFrame with ``title``, ``rating`` and
            ``num_ratings`` columns.
        num_reviewers: Number used in the name of the "experts who read" column.

    Returns:
        DataFrame indexed by title with the expert columns followed by
        ``rating`` and ``num_ratings``. Titles missing from
        ``all_books_rating`` are dropped (inner join).
    """
    # Use the argument that was passed in; the previous version ignored it and
    # silently read the notebook-level `all_books_ratings` instead.
    merged = expert_rating.merge(all_books_rating, left_index=True, right_on='title', how='inner')
    merged = merged.set_index('title')

    merged = merged[['expert_wavg_rating', f'experts_who_read_out_of_{num_reviewers}',
                     'expert_metric', 'rating', 'num_ratings']]

    return merged

In [15]:
def format_genre_top_reviewers(genre_top_reviewers, n =10):
    """Prepare the top-reviewer table for display.

    Keeps the first two columns (renamed to ``genre_rating_count`` and
    ``pct_from_this_genre``), rounds to two decimals, strips a leading
    ``<digits>-`` prefix from each index label and returns the first ``n`` rows.
    """
    first_two = genre_top_reviewers.iloc[:, :2]
    first_two.columns = ['genre_rating_count', 'pct_from_this_genre']

    rounded = first_two.round(2)
    rounded.index = rounded.index.str.replace(r'^\d+-', '', regex=True)

    return rounded.head(n)


def get_recommendation_from_top(ranker, user_item_matrix = main_user_item_matrix, num_reviewers = 50,
                                books_ratings = all_books_ratings, how_many = 10):
    """Recommend books based on the ratings of the top-ranked reviewers.

    Args:
        ranker: DataFrame indexed by user id with a ``score`` column, best first.
        user_item_matrix: User x title rating matrix (0 = not rated). Every
            selected reviewer must be present in its index.
        num_reviewers: How many reviewers from the top of ``ranker`` to use.
        books_ratings: Overall per-book summary with ``title``, ``rating`` and
            ``num_ratings`` columns.
        how_many: Number of books to return in each list.

    Returns:
        ``(best, controversial, top_n)``:
        ``best`` holds the ``how_many`` books with the highest expert metric,
        ``controversial`` the ``how_many`` with the lowest; both are indexed by
        title with columns ``their_rating``, ``count``, ``overall_rating`` and
        ``overall_ratings_count``. ``top_n`` is the reviewer table with the
        normalised weights. Only ``best`` is rounded to two decimals.
    """
    top_n = get_top_n_reviewers(ranker, num_reviewers)

    # TODO (from the original author): consider extracting this part into its own function.
    # Ratings of the selected reviewers, raw and mean-centred per reviewer.
    expert_user_item_matrix = get_expert_user_item_matrix(user_item_matrix, top_n.index)
    adjusted_expert_user_item_matrix = center_user_item_matrix(expert_user_item_matrix)

    # The ranking metric uses the centred ratings; the displayed average uses the raw ones.
    adjusted_expert_ratings = get_expert_ratings(adjusted_expert_user_item_matrix, top_n)
    adjusted_expert_ratings = adjusted_expert_ratings.round(2)
    adjusted_expert_ratings[['expert_wavg_rating',f'experts_who_read_out_of_{num_reviewers}']] = [avg_expert_rating(book, top_n, expert_user_item_matrix) for book in adjusted_expert_ratings.index]

    # Use the `books_ratings` argument; it was previously ignored in favour of
    # the notebook-level `all_books_ratings`.
    best = adjusted_expert_ratings.head(how_many)
    best = merge_expert_with_overall(best, books_ratings, num_reviewers = num_reviewers)

    controversial = adjusted_expert_ratings.tail(how_many).sort_values(by = 'expert_metric')
    controversial = merge_expert_with_overall(controversial, books_ratings, num_reviewers = num_reviewers)

    best = best.drop(columns = 'expert_metric')
    controversial = controversial.drop(columns = 'expert_metric')

    column_labels = ['their_rating', 'count', 'overall_rating', 'overall_ratings_count']
    best.columns =  column_labels
    controversial.columns = column_labels

    best = best.round(2)

    return best, controversial, top_n.head(num_reviewers)

def top_n_genre_expert_recommendations(my_genre, all_labeled_reviews = all_labeled_reviews,
                                       user_genre_counts = user_genre_counts,
                                       user_genre_pct = user_genre_pct, num_reviewers = 50,
                                       how_many = 10, alpha = 1.2):
    """Recommend books of one genre based on that genre's heaviest readers.

    Args:
        my_genre: Genre label (a genre indicator column of ``all_labeled_reviews``).
        all_labeled_reviews: Genre-labelled reviews.
        user_genre_counts: Genre x user table of review counts.
        user_genre_pct: Genre x user table of review shares.
        num_reviewers: Number of top genre readers to use as experts.
        how_many: Number of books to return in each list.
        alpha: Exponent on the genre share when ranking readers (see ``get_score``).

    Returns:
        ``(best, controversial, genre_top_reviewers)``. ``best`` has the
        columns ``top_reader_rating``, ``ratings_count``, ``overall_rating``
        and ``overall_ratings_count``; ``controversial`` keeps the column
        names assigned by ``get_recommendation_from_top``;
        ``genre_top_reviewers`` is the display-formatted reviewer table.
    """
    # Restrict the reviews to this genre and build a condensed user x title matrix.
    genre_filtered_reviews = filter_reviews_for_genre(my_genre, all_labeled_reviews)
    genre_user_item_matrix = construct_user_item_matrix(genre_filtered_reviews)
    genre_user_item_matrix = condense_user_item_matrix(genre_user_item_matrix)

    # Rank readers of this genre; the top ones decide which books to recommend.
    genre_ranker = get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = alpha)

    # Forward num_reviewers / how_many: they were accepted here but never
    # passed on, so non-default values had no effect.
    best, controversial, genre_top_reviewers = get_recommendation_from_top(
        genre_ranker, genre_user_item_matrix,
        num_reviewers = num_reviewers, how_many = how_many)
    genre_top_reviewers = format_genre_top_reviewers(genre_top_reviewers)

    # Flat list of labels: a nested list ([[...]]) makes pandas build a
    # one-level MultiIndex instead of plain column names.
    best.columns = ['top_reader_rating', 'ratings_count', 'overall_rating', 'overall_ratings_count']

    return  best, controversial, genre_top_reviewers

In [16]:
# Example run: expert-based recommendations for the "Classics" genre with default settings.
best, controversial, genre_top_reviewers = top_n_genre_expert_recommendations("Classics")

In [17]:
# Display the highest-scoring "Classics" books from the example run above.
best

Unnamed: 0_level_0,top_reader_rating,ratings_count,overall_rating,overall_ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
King Lear,4.85,14.0,3.91,230136
Othello,4.73,12.0,3.89,423128
Mrs Dalloway,4.67,15.0,3.79,332700
To the Lighthouse,4.66,14.0,3.81,203030
The Fellowship of the Ring,5.0,7.0,4.4,3003685
The Epic of Gilgamesh,4.94,7.0,3.75,112588
The Hobbit,4.9,8.0,4.29,4293576
Inferno,5.0,8.0,4.03,197452
Memoirs of Hadrian,4.61,10.0,4.22,33417
The Plague,4.65,14.0,4.02,293907


## Recommender!

#### Load user data from goodreads

In [18]:
def get_user_info(url):
    """Fetch a user's metadata and reviews through ``UserMetaData``.

    Creates a ``UserMetaData`` for ``url`` with ``review_pages=4``, asks it to
    collect the metadata and then the review information, and returns both.

    Returns:
        ``(user_metadata, user_reviews)`` as produced by
        ``retrieve_metadata()`` and ``retrieve_reviews()``.
    """
    scraper = UserMetaData(url, review_pages=4)

    # Metadata first, then reviews — same call order as before.
    scraper.get_metadata()
    metadata = scraper.retrieve_metadata()

    scraper.get_review_info()
    reviews = scraper.retrieve_reviews()

    return metadata, reviews

def review_coverage(user_reviews, all_books = all_books):
    """Fraction of the user's reviews whose title appears in ``all_books``.

    Raises ``ZeroDivisionError`` when ``user_reviews`` is empty.
    """
    is_known_title = user_reviews.title.isin(all_books.title)
    known_reviews = user_reviews[is_known_title]
    return len(known_reviews) / len(user_reviews)

def replace_zeros_with_nonzero_mean(arr):
    """Return a copy of ``arr`` where zeros are replaced by the mean of the non-zero entries.

    The input array is not modified. The result keeps the input's dtype, so
    for an integer array the mean is cast to an integer when written back.
    """
    filled = arr.copy()  # work on a copy; the caller's array stays untouched
    is_zero = filled == 0
    filled[is_zero] = filled[~is_zero].mean()
    return filled

def load_user_reviews(user_id):
    """Load one Goodreads user's reviews as a DataFrame.

    Builds the profile URL from ``user_id``, fetches the reviews via
    ``get_user_info`` and replaces ratings of 0 with the mean of the user's
    non-zero ratings.
    """
    profile_url = 'https://www.goodreads.com/user/show/' + user_id

    _, raw_reviews = get_user_info(profile_url)

    reviews = pd.DataFrame(raw_reviews)
    reviews['rating'] = replace_zeros_with_nonzero_mean(reviews.rating.values)
    return reviews

### User neighborhood recommender: ratings

In [19]:
def get_user_specific_matrix(user_books, matrix = centered_zscore_matrix):
    """Build the title x user sub-matrix relevant to one user.

    Transposes ``matrix`` so titles are rows, keeps only the titles in
    ``user_books``, and then removes every user column that is zero for all
    of those titles.
    """
    titles_by_users = matrix.T
    is_user_book = titles_by_users.index.isin(user_books)

    return remove_users_who_have_no_opinion(titles_by_users[is_user_book])

def get_title_zscore_dict(ex_title, books_rating_stats = books_rating_stats):
    """Map each star value (5..1) to its z-score for one book.

    NOTE: a function with this name is also defined in an earlier cell of the
    notebook; this later definition is the one in effect once both cells have run.

    Args:
        ex_title: Title to look up.
        books_rating_stats: Rating-statistics table containing the
            ``5_zscore`` .. ``1_zscore`` columns. The title may be stored
            either in a ``title`` column or as the index.

    Returns:
        ``{5: z5, 4: z4, 3: z3, 2: z2, 1: z1}`` for the matching row, or an
        empty dict when the title is not present.
    """
    z_score_cols = ['5_zscore', '4_zscore', '3_zscore', '2_zscore', '1_zscore']
    rating_values = [5, 4, 3, 2, 1]

    # The original lookup required a `title` column, but construct_book_rating_stats
    # returns a frame indexed by title; accept both layouts.
    if 'title' in books_rating_stats.columns:
        is_requested_title = books_rating_stats['title'] == ex_title
    else:
        is_requested_title = books_rating_stats.index == ex_title

    title_zscores = books_rating_stats.loc[is_requested_title, z_score_cols].values.ravel()

    # zip stops at the shorter input: no matching row -> {}, several rows -> the first row wins.
    return dict(zip(rating_values, title_zscores))

def get_single_title_rating_zscore(title, rating):
    """Return the z-score of ``rating`` for ``title``, or ``-1`` when the title is unknown.

    ``rating`` must be one of 1..5 (checked with an ``assert``). The ``-1``
    return value is a sentinel meaning "no statistics found for this title".
    """
    assert rating in {1,2,3,4,5}, "Rating must be an int from 1-5"

    zscore_by_star = get_title_zscore_dict(title)
    if not zscore_by_star:
        return -1

    return zscore_by_star[rating]

def get_zscore_ratings_for_title_list(titles, ratings):
    """Apply ``get_single_title_rating_zscore`` pairwise to titles and ratings.

    Returns a list with one entry per (title, rating) pair, stopping at the
    shorter of the two inputs.
    """
    return list(map(get_single_title_rating_zscore, titles, ratings))

def remove_users_who_have_no_opinion(reduced_item_user_matrix):
    """Drop every column (user) of a title x user matrix that contains only zeros."""
    by_user = reduced_item_user_matrix.T
    has_any_rating = (by_user != 0).any(axis=1)
    return by_user.loc[has_any_rating].T

def get_user_zscores_normed(user_id, user_reviews):
    """Convert a user's ratings to per-book z-scores and centre them on the user's mean.

    Args:
        user_id: Label used as the name of the single output column.
        user_reviews: DataFrame with ``title`` and ``rating`` columns
            (ratings must be 1..5).

    Returns:
        DataFrame indexed by ``title`` with one column named ``user_id``
        holding the centred z-scores. Books without rating statistics are
        left out. ``user_reviews`` is not modified.
    """
    # Explicit copies avoid assigning columns onto slices of other frames.
    user_zscores_normed = user_reviews[['title']].copy()
    user_zscores_normed['zscore_normed_rating'] = get_zscore_ratings_for_title_list(user_reviews.title, user_reviews.rating)

    # Filter out books the user read that are not in the existing book database;
    # -1 is the "title unknown" sentinel of get_single_title_rating_zscore.
    # NOTE(review): a genuine z-score of exactly -1 would be dropped as well.
    in_database = user_zscores_normed['zscore_normed_rating'] != -1
    user_zscores_normed = user_zscores_normed[in_database].copy()

    # Centre this user's z-scores (mean becomes 0).
    user_mean = user_zscores_normed['zscore_normed_rating'].mean()
    user_zscores_normed['zscore_normed_rating'] = user_zscores_normed['zscore_normed_rating'] - user_mean

    user_zscores_normed = user_zscores_normed.set_index('title')
    user_zscores_normed.columns = [user_id]

    return user_zscores_normed

""" cosine similarity calculations """
def elements_in_both_arr(a, b):
    """Return the entries of ``a`` and ``b`` at positions where both are non-zero.

    Returns:
        ``(a_filtered, b_filtered)``: two arrays of equal length.
    """
    both_present = (a != 0) & (b != 0)
    return a[both_present], b[both_present]

def cosine_similarity_2_arrs(a, b):
    """Cosine similarity between two 1-D arrays, returned as a scalar.

    Each array is reshaped to a single-row matrix before being handed to
    ``cosine_similarity``; the only entry of the 1x1 result is returned.
    """
    row_a = a.reshape(1, -1)
    row_b = b.reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0, 0]

def cos_similarity_nonzero_entries(arr1, arr2):
    """Cosine similarity over the positions where both arrays are non-zero.

    Args:
        arr1, arr2: Equal-length NumPy arrays; 0 means "no rating".

    Returns:
        ``(similarity, sample_size)`` where ``sample_size`` is the number of
        positions both arrays have in common. When there is no overlap the
        similarity is NaN and the sample size is 0.
    """
    a_filtered, b_filtered = elements_in_both_arr(arr1, arr2)
    sample_size = len(a_filtered)

    # With no overlap the similarity is undefined. Handing empty arrays to
    # scikit-learn would fail its input validation and abort the whole
    # ranking, so report NaN instead.
    if sample_size == 0:
        return np.nan, 0

    res = cosine_similarity_2_arrs(a_filtered, b_filtered)

    return res, sample_size

In [20]:
def get_user_similarities_ranker_by_taste(user_zscores_normed, user_specific_matrix, alpha = 1.5):
    """Rank other users by how similarly they rated the books this user rated.

    Args:
        user_zscores_normed: Title-indexed, single-column frame with this user's scores.
        user_specific_matrix: Title x user matrix of the other users' scores.
        alpha: Exponent applied to the cosine similarity in the score.

    Returns:
        DataFrame indexed by other user with ``cosine_similarity``,
        ``sample_size`` (number of books in common) and
        ``score = sample_size * cosine_similarity ** alpha``, sorted by score
        descending. Rows whose score is NaN are dropped.
    """
    # Inner join on title; this user's ratings end up as the last column.
    joined = user_specific_matrix.merge(user_zscores_normed, left_index=True, right_index=True)

    this_user_ratings = joined.iloc[:, -1].values
    other_user_columns = joined.columns[:-1]

    results = {}
    for other_user in other_user_columns:
        similarity, books_in_common = cos_similarity_nonzero_entries(joined[other_user].values, this_user_ratings)
        results[other_user] = {
            'cosine_similarity': similarity,
            'sample_size': books_in_common,
        }

    ranking = pd.DataFrame.from_dict(results, orient='index')
    ranking['score'] = get_score(
        count=ranking['sample_size'],
        pct=ranking['cosine_similarity'],
        alpha=alpha,
    )

    # A negative similarity raised to a fractional power gives NaN; those users are dropped here.
    ranking = ranking[~ranking.score.isna()]
    return ranking.sort_values('score', ascending=False)



    

In [21]:
def format_your_closest_reviewers_by_taste(your_reviewers, n = 10):
    """Prepare the most similar reviewers (by rating taste) for display.

    Keeps ``cosine_similarity`` and ``sample_size`` (renamed to
    ``rating_similarity`` and ``books_in_common``), rounds to two decimals,
    strips a leading ``<digits>-`` prefix from each index label and returns
    the first ``n`` rows.
    """
    display_table = your_reviewers[['cosine_similarity', 'sample_size']]
    display_table.columns = ['rating_similarity', 'books_in_common']

    rounded = display_table.round(2)
    rounded.index = rounded.index.str.replace(r'^\d+-', '', regex=True)

    return rounded.head(n)

def recommend_books_by_user_book_ratings_similarity(user_id, user_reviews):
    """Recommend books based on users who rated this user's books similarly.

    Args:
        user_id: Identifier used to label this user's rating column.
        user_reviews: DataFrame with the user's ``title`` and ``rating`` columns.

    Returns:
        ``(recommendations, feeling_lucky, closest_reviewers)``: the best and
        the lowest-scoring books according to the 20 most similar users, and
        a display table of those users.
    """
    user_books = user_reviews.title.unique()

    # z-score-normalised ratings for this user and for the other relevant users
    user_zscores_normed = get_user_zscores_normed(user_id, user_reviews)
    user_specific_matrix = get_user_specific_matrix(user_books)

    # Find similar users and aggregate their suggestions. The ranker joins the
    # two frames itself, so the extra (unused) merge that used to live here is gone.
    user_similarities_ranker_by_taste = get_user_similarities_ranker_by_taste(user_zscores_normed, user_specific_matrix)
    user_book_recommendations_by_taste, feeling_lucky_taste, your_reviewers_taste = get_recommendation_from_top(user_similarities_ranker_by_taste, num_reviewers = 20)

    your_reviewers_taste = format_your_closest_reviewers_by_taste(your_reviewers_taste)

    return user_book_recommendations_by_taste, feeling_lucky_taste, your_reviewers_taste

In [22]:
# Example Goodreads profile id (the part of the profile URL after /user/show/).
user_id = '155041466-jamie-ren'

# Fetch this user's reviews (network access) and list the distinct titles they reviewed.
user_reviews = load_user_reviews(user_id)
user_books = user_reviews.title.unique()

In [23]:
# Recommendations from the users whose ratings are most similar to this user's.
user_book_recommendations_by_taste, feeling_lucky_taste, your_reviewers_by_taste = recommend_books_by_user_book_ratings_similarity(user_id, user_reviews)

In [24]:
# Display the books recommended by rating-taste similarity.
user_book_recommendations_by_taste

Unnamed: 0_level_0,their_rating,count,overall_rating,overall_ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Return of the King,5.0,3.0,4.57,987348
The Vegetarian,4.78,4.0,3.64,283178
To Kill a Mockingbird,5.0,3.0,4.26,6592149
The Priory of the Orange Tree,5.0,3.0,4.18,252631
A Clash of Kings,5.0,3.0,4.42,989657
A Gentleman in Moscow,5.0,4.0,4.32,622771
The Lost Man,4.77,4.0,4.16,106166
Romeo and Juliet,5.0,2.0,3.74,2738308
Harry Potter and the Goblet of Fire,4.73,4.0,4.57,4009564
Man's Search for Meaning,4.79,4.0,4.37,811249


In [25]:
# Display the reviewers whose ratings are closest to this user's.
your_reviewers_by_taste

Unnamed: 0,rating_similarity,books_in_common
tharindu-dissanayake,0.77,5
henry-avila,0.98,3
sean-barrs,0.52,7
chai-thelibrairie-on-tiktok,1.0,2
charlotte-may,1.0,2
maria-clara,1.0,2
eden-prosper,0.99,2
ying-ying,0.99,2
pakinam-mahmoud,0.97,2
debra,0.97,2


### User similarity recommender

In [44]:
def get_user_genre_counts_and_pcts(user_reviews, genre_labels = genre_labels):
    """Label a user's reviews with genres and summarise them per genre.

    Returns:
        ``(genre_counts, genre_pct)`` as produced by ``get_user_genre_counts``
        for the user's genre-labelled reviews.
    """
    labeled_user_reviews = label_reviews_with_genre(user_reviews, genre_labels)
    return get_user_genre_counts(labeled_user_reviews)

def get_user_similarities_ranker_by_genre(this_user_genre_pct, other_users_genre_pct = compact_user_genre_pct):
    """Rank other users by the cosine similarity of their genre profile to this user's.

    Args:
        this_user_genre_pct: Genre x user frame for the current user
            (presumably a single column — verify against the caller).
        other_users_genre_pct: Genre x user frame for the comparison users.

    Returns:
        DataFrame indexed by ``other_users`` with ``genre_similarity`` and an
        identical ``score`` column, sorted by similarity descending.
    """
    # Transpose so each row is one user's genre vector.
    other_user_vectors = other_users_genre_pct.values.T
    this_user_vector = this_user_genre_pct.values.T
    similarities = cosine_similarity(other_user_vectors, this_user_vector).ravel()

    ranker = pd.DataFrame({
        'other_users': other_users_genre_pct.columns,
        'genre_similarity': similarities,
    })
    ranker = ranker.sort_values(by='genre_similarity', ascending=False)
    ranker['score'] = ranker['genre_similarity']

    return ranker.set_index("other_users")

def format_your_closest_reviewers_by_genre(your_reviewers, n = 10):
    """Prepare the most genre-similar reviewers for display.

    Keeps the first column (renamed to ``genre_similarity_to_you``), rounds to
    two decimals, strips a leading ``<digits>-`` prefix from each index label
    and returns the first ``n`` rows.
    """
    display_table = your_reviewers.iloc[:, :1]
    display_table.columns = ['genre_similarity_to_you']

    rounded = display_table.round(2)
    rounded.index = rounded.index.str.replace(r'^\d+-', '', regex=True)

    return rounded.head(n)

def recommend_books_by_user_genre_reading_pattern_similarity(user_reviews, genre_labels = genre_labels, how_many = 10, n = 10):
    """Recommend books from users whose genre mix resembles this user's.

    Args:
        user_reviews: The user's reviews (``title``, ``user_id``, ``rating``).
        genre_labels: Title-to-genre indicator table.
        how_many: Number of books to return in each list.
        n: Number of similar reviewers to show in the display table.

    Returns:
        ``(recommendations, feeling_lucky, closest_reviewers)``.
    """
    # The similarity is computed from the user's genre COUNTS; the percentages are not used.
    genre_counts, _ = get_user_genre_counts_and_pcts(user_reviews, genre_labels = genre_labels)
    similarity_ranker = get_user_similarities_ranker_by_genre(genre_counts)

    recommendations, feeling_lucky, closest_reviewers = get_recommendation_from_top(similarity_ranker, how_many = how_many)
    closest_reviewers = format_your_closest_reviewers_by_genre(closest_reviewers, n)

    return recommendations, feeling_lucky, closest_reviewers

In [45]:
# Recommendations from the users with the most similar genre reading pattern (30 books requested).
user_book_recommendations_by_genre, feeling_lucky_genre, your_reviewers_genre = recommend_books_by_user_genre_reading_pattern_similarity(user_reviews, how_many = 30)

In [46]:
# Display the books recommended by genre-pattern similarity.
user_book_recommendations_by_genre

Unnamed: 0_level_0,their_rating,count,overall_rating,overall_ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Outsider,5.0,2.0,4.01,350875
The Master and Margarita,5.0,2.0,4.29,392346
The Heart of a Dog,5.0,2.0,4.09,66748
The 7 Habits of Highly Effective People: Powerful Lessons in Personal Change,5.0,3.0,4.16,792142
How to Stop Worrying and Start Living,4.66,3.0,4.16,112804
Me Talk Pretty One Day,4.0,3.0,4.01,711248
The Laws of Human Nature,5.0,2.0,4.35,25853
"The Body Keeps the Score: Brain, Mind, and Body in the Healing of Trauma",5.0,2.0,4.37,242684
Death of a Salesman,5.0,2.0,3.58,249761
The Tyranny of Merit: What's Become of the Common Good?,5.0,1.0,4.19,12607


In [47]:
# Display the reviewers with the most similar genre reading pattern.
your_reviewers_genre

Unnamed: 0_level_0,genre_similarity_to_you
other_users,Unnamed: 1_level_1
odai-al-saeed,0.95
ammit-p-chawda,0.94
gary,0.94
mustafa-hasan,0.94
rahul,0.94
andrew-padilla,0.94
obied-alahmed,0.94
kaveh,0.94
mekhala-bhatt,0.93
sanjay,0.93


#### Guess their fav genre!

In [30]:
def genre_pct_zscore(val, genre, genre_reading_stats = genre_reading_stats):
    """Z-score of a reading share ``val`` relative to all users' shares for ``genre``.

    Args:
        val: The user's share of reviews in the genre.
        genre: Row label in ``genre_reading_stats``.
        genre_reading_stats: Genre-indexed frame whose first column is the
            mean share and whose second column is the standard deviation.

    Returns:
        ``(val - mean) / std``.
    """
    stats = genre_reading_stats.loc[genre]
    # Positional access must go through .iloc: `stats[0]` on a label-indexed
    # Series relies on a deprecated integer fallback in pandas.
    mean, std = stats.iloc[0], stats.iloc[1]

    z_score = (val - mean)/std
    return z_score

def get_genres_zscores(pcts, genres):
    """Apply ``genre_pct_zscore`` pairwise to reading shares and genre labels.

    Returns a list with one z-score per (pct, genre) pair.
    """
    return list(map(genre_pct_zscore, pcts, genres))

def get_user_genre_zscores(user_reviews, genre_labels = genre_labels):
    """Rank genres by how unusually much this user reads them.

    Returns:
        Genre-indexed DataFrame with ``pct_your_books`` (the user's share of
        reviews in the genre) and ``z_score`` (that share standardised with
        ``genre_pct_zscore``), sorted by ``z_score`` descending and rounded to
        two decimals. Expects the reviews to belong to a single user, since
        exactly two column names are assigned.
    """
    _, genre_profile = get_user_genre_counts_and_pcts(user_reviews, genre_labels)
    genre_profile['z_score'] = get_genres_zscores(genre_profile.iloc[:, 0], genre_profile.index)

    genre_profile = genre_profile.sort_values(by='z_score', ascending=False)
    genre_profile.columns = ['pct_your_books', 'z_score']

    return genre_profile.round(2)

In [31]:
# Per-genre reading share of the example user and its z-score, highest z-score first.
user_genre_z_scores = get_user_genre_zscores(user_reviews)

In [32]:
def recommend_best_books_from_user_fav_genre(user_reviews, remove_already_read = False):
    """Guess the user's favourite genre and recommend that genre's expert picks.

    The favourite genre is the one with the highest reading-share z-score.

    Args:
        user_reviews: The user's reviews (``title``, ``user_id``, ``rating``).
        remove_already_read: When True, titles the user has already reviewed
            are removed from ``best`` and ``controversial``, so these lists
            may then contain fewer rows.

    Returns:
        ``(fav_genre, user_genre_z_scores, best, controversial, genre_top_reviewers)``.
    """
    user_genre_z_scores = get_user_genre_zscores(user_reviews)
    fav_genre = user_genre_z_scores.index[0]

    best, controversial, genre_top_reviewers = top_n_genre_expert_recommendations(fav_genre)

    # This flag used to be accepted but ignored; both result tables are indexed by title.
    if remove_already_read:
        user_books = user_reviews.title.unique()
        best = best[~best.index.isin(user_books)]
        controversial = controversial[~controversial.index.isin(user_books)]

    return fav_genre, user_genre_z_scores, best, controversial, genre_top_reviewers
    
    

In [33]:
# Guess the example user's favourite genre and fetch that genre's expert recommendations.
# Note: this rebinds `best`, `controversial` and `genre_top_reviewers` from the earlier "Classics" run.
fav_genre, user_genre_z_scores, best, controversial, genre_top_reviewers = recommend_best_books_from_user_fav_genre(user_reviews)

In [34]:
# Display the top expert picks for the user's guessed favourite genre.
best

Unnamed: 0_level_0,top_reader_rating,ratings_count,overall_rating,overall_ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"In Search of Lost Time, Volume 1: The Way by Swann's",4.88,10.0,4.15,65076
Tractatus Logico Philosophicus,4.66,8.0,4.1,21407
The Symposium,4.6,9.0,4.09,83867
Phenomenology of Spirit,4.7,8.0,3.96,19718
Swann's Way,4.87,9.0,4.15,65076
Introduction to Metaphysics,4.61,6.0,4.03,4260
An Enquiry Concerning Human Understanding,4.24,5.0,3.94,21653
The Elementary Forms of the Religious Life,4.3,4.0,3.93,3660
Darwin's Dangerous Idea: Evolution and the Meanings of Life,4.47,5.0,4.07,16774
Capital: A Critique of Political Economy Volume 1,4.23,8.0,4.09,22854


In [35]:
# Display the five genres with the highest z-scores for this user.
user_genre_z_scores.head(5)

Unnamed: 0,pct_your_books,z_score
Philosophy,0.45,1.13
Psychology,0.42,0.94
Business,0.29,0.92
Self Help,0.42,0.86
Classics,0.23,0.62


#### Lastly, can remove books from recs which user has already read

Also, we could add a bonus for books that have not been read by everyone. We should also handle the exception raised when a user's Goodreads reviews cannot be loaded.

Maybe for readers who have read a lot of books...suggest books that they haven't already read.

The problem is that the dot-product method is biased toward popular books that everyone has read; we need to find a way to account for that.

# Sketch: drop recommendations whose title is in `ex`.
# NOTE(review): `ex` is not defined anywhere in this notebook, so this line raises NameError
# on a fresh kernel. Presumably it should be the titles the user has already read
# (e.g. `user_books`) — confirm.
best[~best.index.isin(ex)]