In [1]:
import sys
import warnings

warnings.filterwarnings('ignore')
sys.path.append(r"C:\Users\13477\Desktop\New Adventure\Goodreads\goodreads_prod")

from static import *
from UserScraper import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

from joblib import Parallel, delayed
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def process_raw_books(books):
    res = books.drop_duplicates(subset = ['title'])
    res = res.dropna()
    return res

def process_raw_reviews(reviews):
    """ Make sure each user only reviews a book once """
    res = reviews.drop_duplicates(subset = ['title', 'user_id'], keep = 'first')
    res = res.dropna()

    return res


### Data preprocessing

#### Load data from parquets (scraped from goodreads)

In [2]:
# 16,578 books, 476,364 reviews, 9466 complete user profiles 
all_books = pd.read_parquet("all_books_final.parquet")
all_books = process_raw_books(all_books)
all_books_ratings = all_books[['title', 'rating', 'num_ratings']]

all_reviews = pd.read_parquet("english_reviews_final.parquet")
all_reviews = process_raw_reviews(all_reviews)

users_data = pd.read_parquet("users_data_final.parquet")

all_books = all_books[all_books.title.isin(all_reviews.title)]
users_with_data = users_data.user_id.unique()

In [3]:
users_data['is_notable'] = users_data['is_best_reviewer'] | users_data['is_most_followed']
users_data[users_data.is_notable]

Unnamed: 0,user_url,user_id,name,num_ratings,avg_rating,num_reviews,is_best_reviewer,reviewer_rank,is_most_followed,follow_rank,is_notable
26,https://www.goodreads.com/user/show/28020639-i...,28020639-iben-frederiksen,Iben Frederiksen,496,3.68,308,True,55,True,24,True
37,https://www.goodreads.com/user/show/35915119-a...,35915119-ashley-daviau,Ashley Daviau,2248,4.10,2137,True,37,False,0,True
40,https://www.goodreads.com/user/show/46560610-b...,46560610-beatriz,Beatriz,1029,3.37,946,True,1,True,9,True
119,https://www.goodreads.com/user/show/15736557-l...,15736557-lynne-king,Lynne King,862,4.07,499,True,9,True,41,True
127,https://www.goodreads.com/user/show/59214246-s...,59214246-sara-lowe,Sara Lowe,81,4.12,17,True,30,True,41,True
...,...,...,...,...,...,...,...,...,...,...,...
14540,https://www.goodreads.com/user/show/132812492,132812492,الف‌م‌ی‌ر,95,3.87,34,True,90,False,0,True
14541,https://www.goodreads.com/user/show/12543333,12543333,حماس,319,2.96,252,True,58,False,0,True
14587,https://www.goodreads.com/user/show/20234777-t...,20234777-tawallah,Tawallah,1624,3.36,1125,True,2,False,0,True
14604,https://www.goodreads.com/user/show/18009940-dean,18009940-dean,Dean,558,4.51,533,True,58,False,0,True


### Book rating normalizer

In [82]:
def book_rating_std(mean_rating, num_ratings, five_stars, four_stars, three_stars, two_stars, one_star):
    if num_ratings <= 1:
        return 0  # or raise an error, depending on your use case
    
    rating_values = [5, 4, 3, 2, 1]
    rating_freqs = [five_stars, four_stars, three_stars, two_stars, one_star]
    
    numerator = sum(f * (v - mean_rating) ** 2 for v, f in zip(rating_values, rating_freqs))
    var = numerator / (num_ratings - 1)
    
    return np.sqrt(var)

def rating_zscore(r, mean_ratings, std_ratings):
    res = (r - mean_ratings)/std_ratings
    return res

def construct_book_rating_stats(all_books):
    books_rating_stats = all_books[['title', 'rating', 'num_ratings', 'five_stars', 'four_stars', 'three_stars', 'two_stars', 'one_star']]
    books_rating_stats = books_rating_stats.rename(columns = {'rating': 'mean_rating'})
    books_rating_stats = books_rating_stats.set_index('title')

    # Apply to DataFrame
    books_rating_stats['rating_std'] = books_rating_stats.apply(
        lambda row: book_rating_std(
            row['mean_rating'],
            row['num_ratings'],
            row['five_stars'],
            row['four_stars'],
            row['three_stars'],
            row['two_stars'],
            row['one_star']
        ),
        axis=1
    )

    # get zscore
    mean_ratings = books_rating_stats.mean_rating.values
    std_ratings = books_rating_stats.rating_std.values
    
    for i in range(5,0,-1):
        books_rating_stats[f'{i}_zscore'] = rating_zscore(i, mean_ratings, std_ratings)

    return books_rating_stats

In [83]:
books_rating_stats = pd.read_parquet('book_rating_stats.parquet')
books_rating_stats.head()

Unnamed: 0,title,mean_rating,num_ratings,five_stars,four_stars,three_stars,two_stars,one_star,rating_std,5_zscore,4_zscore,3_zscore,2_zscore,1_zscore
0,Ways of Seeing,3.93,409891,180540,104714,68443,27972,28222,1.222802,0.87504,0.057246,-0.760548,-1.578342,-2.396137
1,The Story of Art,3.96,441491,205333,104059,73087,29183,29829,1.225518,0.848621,0.032639,-0.783343,-1.599324,-2.415306
2,Steal Like an Artist: 10 Things Nobody Told Yo...,3.96,317933,137177,87225,55875,18949,18707,1.171995,0.887376,0.03413,-0.819116,-1.672363,-2.525609
3,The New Drawing on the Right Side of the Brain,3.87,368384,157253,89353,66495,26330,28953,1.258219,0.898095,0.103321,-0.691453,-1.486228,-2.281002
4,The Artist's Way: A Spiritual Path to Higher C...,3.95,115356,49092,32508,19489,7580,6687,1.173334,0.894886,0.042614,-0.809659,-1.661931,-2.514204


#### Label reviews with genres

In [4]:
def construct_book_genre_labels(all_books):
    mlb = MultiLabelBinarizer(classes=genres)
    genre_matrix = mlb.fit_transform(all_books['genres'])
    genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=all_books.index)
    genre_labels = pd.concat([all_books[['title']], genre_df], axis=1)

    return genre_labels

def label_reviews_with_genre(all_reviews, genre_labels):
    all_labeled_reviews = all_reviews.merge(
        genre_labels, 
        on='title', 
        how='inner'
    )

    all_labeled_reviews = all_labeled_reviews.drop_duplicates(subset=['title', 'user_id', 'rating'])
    return all_labeled_reviews

genre_labels = construct_book_genre_labels(all_books)
all_labeled_reviews = label_reviews_with_genre(all_reviews, genre_labels)

#### User genre counts

For each user (col), how many books from each genre (row) have they read?

In [5]:
def get_user_genre_counts(reviews):
    
    user_genre_counts = reviews.groupby('user_id')[genres].sum().T  # genres as index
    num_reviews_by_user = reviews.groupby('user_id')['title'].count()
    user_genre_pct = user_genre_counts.div(num_reviews_by_user, axis = 1)

    return user_genre_counts, user_genre_pct

In [6]:
user_genre_counts, user_genre_pct = get_user_genre_counts(all_labeled_reviews)

In [7]:
compact_labeled_reviews = all_labeled_reviews[all_labeled_reviews.user_id.isin(users_with_data)]
compact_user_genre_count, compact_genre_pct = get_user_genre_counts(compact_labeled_reviews)

#### User item matrix

In [238]:
def construct_user_item_matrix(reviews):
    reviews_grouped = reviews.groupby(['user_id', 'title'])['rating'].mean().reset_index()
    user_item_matrix = reviews_grouped.pivot(index='user_id', columns='title', values='rating')
    user_item_matrix = user_item_matrix.fillna(0)

    return user_item_matrix

def condense_user_item_matrix(user_item_matrix, n = 3):

    filtered_matrix = user_item_matrix[user_item_matrix.astype(bool).sum(axis=1) >= n]
    filtered_matrix = filtered_matrix.loc[:, filtered_matrix.astype(bool).sum(axis=0) >= n]

    return filtered_matrix

def center_user_item_matrix(user_item_matrix):
    """ Assumes users are rows
        Subtracts each user's rating by its mean ratings 
        (computed from non zero ratings)
    """
    
    tmp = user_item_matrix.replace(0, np.NaN)
    tmp_means = tmp.mean(axis = 1)

    new_tmp = tmp.sub(tmp_means, axis = 0)
    new_tmp = new_tmp.fillna(0)

    return new_tmp

In [89]:
main_user_item_matrix = construct_user_item_matrix(compact_labeled_reviews)

In [139]:
def get_title_zscore_dict(ex_title, books_rating_stats = books_rating_stats):
    z_score_cols = ['5_zscore', '4_zscore', '3_zscore', '2_zscore', '1_zscore']
    rating_values = [5, 4, 3, 2, 1]

    title_zscores = books_rating_stats[books_rating_stats.title == ex_title][z_score_cols].values.ravel()
    res_dict = {r: z for r, z in zip(rating_values, title_zscores)}

    return res_dict

def get_zscore_for_title(user_item_matrix, title):
    zscore_dict = get_title_zscore_dict(title)
    
    title_col = user_item_matrix[title]
    title_col = title_col.replace(zscore_dict).where(title_col != 0, 0)

    return title_col
    

In [137]:
# Apply to every title (column) in the DataFrame
zscore_matrix = pd.read_parquet('zscore_normed_user_item_matrix.parquet')

In [142]:
centered_zscore_matrix = center_user_item_matrix(zscore_matrix)

### Genre recommender

#### Z-score normalized user item matrix

In [9]:
my_genre = 'Philosophy'

#### Find top reviewers for genre

In [10]:
def get_score(count, pct, alpha = 1):
    score = count * pct**alpha
    return score

def user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct):
    genre_review_count_ranked = user_genre_counts.loc[my_genre, :].sort_values(ascending = False)
    genre_pct_of_reviews_ranked = user_genre_pct.loc[my_genre, :].sort_values(ascending = False)

    res = pd.DataFrame({"review_count": genre_review_count_ranked, "review_pct": genre_pct_of_reviews_ranked})
    return res

def get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = 1):
    user_read_counts = user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct)
    user_read_counts['score'] = get_score(user_read_counts['review_count'], user_read_counts['review_pct'], alpha = alpha)

    user_read_counts = user_read_counts[user_read_counts.review_count > 0]
    user_read_counts = user_read_counts.sort_values(by = 'score', ascending = False)
    return user_read_counts

def get_top_n_reviewers(ranker, n):
    top_n = ranker.head(n)
    top_n['score_normed'] = top_n['score']/np.sum(top_n['score'])

    return top_n

#### Suggest books from top reviewers

In [11]:
def filter_reviews_for_genre(my_genre, labeled_reviews):
    reviews_filtered = labeled_reviews[labeled_reviews[my_genre] == 1]
    reviews_filtered = reviews_filtered.drop(columns = my_genre)
    return reviews_filtered[['title', 'user_id', 'rating']]

def get_expert_user_item_matrix(user_item_matrix, experts):
    expert_user_item_matrix =  user_item_matrix[user_item_matrix.index.isin(experts)]
    expert_user_item_matrix = expert_user_item_matrix.loc[experts]

    return expert_user_item_matrix


def adjust_expert_user_item_matrix(expert_user_item_matrix, adjust_value = 3):
    """ Adjust ratings downward by adjust_value"""
    adjusted_expert_user_item_matrix = expert_user_item_matrix.where(expert_user_item_matrix == 0, expert_user_item_matrix - adjust_value)
    return adjusted_expert_user_item_matrix

def lookup_rating(user_item_matrix, user_id, book_name):
    return user_item_matrix.loc[user_id, book_name]

def ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix):
    experts = top_n_reviewers.index
    amount_of_say = top_n_reviewers['score_normed']
    
    wavgs = pd.DataFrame(amount_of_say)
    wavgs['book_rating'] = [lookup_rating(expert_user_item_matrix, u, book_name) for u in experts]
    wavgs = wavgs[wavgs.book_rating != 0]

    return wavgs

def avg_expert_rating(book_name, top_n_reviewers, expert_user_item_matrix):
    amount_of_say = top_n_reviewers['score_normed']
    experts = top_n_reviewers.index
    
    wavgs = ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix)
    res = np.dot(wavgs['score_normed'], wavgs['book_rating'])/np.sum(wavgs['score_normed'])
    
    return res, len(wavgs)

def get_expert_ratings(expert_user_item_matrix, top_n_reviewers):
    """ sum(amount of say * rating) for everyone who rated the book for each book
    
    The more people who interacted...the more the score will be affected
    E.g. 10 people who rated positive > 5 people who rated positive
    """
    amount_of_say = top_n_reviewers['score_normed']

    expert_ratings = expert_user_item_matrix.T.dot(amount_of_say)
    expert_ratings= pd.DataFrame(expert_ratings)
    
    expert_ratings.columns = ['expert_metric']
    expert_ratings = expert_ratings.drop_duplicates()
    expert_ratings = expert_ratings.sort_values(by = 'expert_metric', ascending = False)

    return expert_ratings

def merge_expert_with_overall(expert_rating, all_books_rating, num_reviewers = 50):
    merged = expert_rating.merge(all_books_ratings, left_index=True, right_on='title', how='inner')
    merged = merged.set_index('title')
    
    merged = merged[['expert_wavg_rating', f'experts_who_read_out_of_{num_reviewers}', 
                     'expert_metric', 'rating', 'num_ratings']]

    return merged

In [12]:
def top_n_genre_expert_recommendations(my_genre, all_labeled_reviews, user_genre_counts, user_genre_pct, 
                                       num_reviewers = 50, how_many = 10, alpha = 1.2):
    # filter reviews from this genre and transform to user_item_matrix
    genre_filtered_reviews = filter_reviews_for_genre(my_genre, all_labeled_reviews)
    genre_user_item_matrix = construct_user_item_matrix(genre_filtered_reviews)
    genre_user_item_matrix = condense_user_item_matrix(genre_user_item_matrix)

    # use top reviewers to decide which books to recommend
    genre_ranker = get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = alpha)
    
    top_n = get_top_n_reviewers(genre_ranker, num_reviewers)
    experts = top_n.index
    amount_of_say = top_n['score_normed']


    """ maybe write this as another atom func """
    # user_item_matrix for top reviewers of this genre
    expert_user_item_matrix = get_expert_user_item_matrix(genre_user_item_matrix, top_n.index)
    adjusted_expert_user_item_matrix = center_user_item_matrix(expert_user_item_matrix)

    adjusted_expert_ratings = get_expert_ratings(adjusted_expert_user_item_matrix, top_n)
    adjusted_expert_ratings[['expert_wavg_rating',f'experts_who_read_out_of_{num_reviewers}']] = [avg_expert_rating(book, top_n, expert_user_item_matrix) for book in adjusted_expert_ratings.index]
    # adjusted_expert_ratings['experts_who_read'] = adjusted_expert_ratings['experts_who_read']/num_reviewers
    # adjusted_expert_ratings['experts_who_read'] = adjusted_expert_ratings['experts_who_read'].apply(lambda x: f"{x * 100:.0f}%")

    adjusted_expert_ratings = adjusted_expert_ratings.round(2)
    adjusted_expert_ratings = adjusted_expert_ratings[['expert_wavg_rating', f'experts_who_read_out_of_{num_reviewers}', 'expert_metric']]

    best = adjusted_expert_ratings.head(how_many)
    controversial = adjusted_expert_ratings.tail(how_many).sort_values(by = 'expert_metric')
    
    best = merge_expert_with_overall(best, all_books_ratings)
    controversial = merge_expert_with_overall(controversial, all_books_ratings)

    return best, controversial, top_n

In [13]:
best, controversial, your_reviewers = top_n_genre_expert_recommendations('Philosophy', all_labeled_reviews,
                                                         user_genre_counts, user_genre_pct)

In [14]:
best

Unnamed: 0_level_0,expert_wavg_rating,experts_who_read_out_of_50,expert_metric,rating,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"In Search of Lost Time, Volume 1: The Way by Swann's",4.88,10.0,0.19,4.15,65076
Tractatus Logico Philosophicus,4.66,8.0,0.18,4.1,21407
The Symposium,4.6,9.0,0.17,4.09,83867
Phenomenology of Spirit,4.7,8.0,0.17,3.96,19718
Swann's Way,4.87,9.0,0.17,4.15,65076
Introduction to Metaphysics,4.61,6.0,0.16,4.03,4260
An Enquiry Concerning Human Understanding,4.24,5.0,0.16,3.94,21653
The Elementary Forms of the Religious Life,4.3,4.0,0.13,3.93,3660
Darwin's Dangerous Idea: Evolution and the Meanings of Life,4.47,5.0,0.13,4.07,16774
Capital: A Critique of Political Economy Volume 1,4.23,8.0,0.13,4.09,22854


In [15]:
controversial

Unnamed: 0_level_0,expert_wavg_rating,experts_who_read_out_of_50,expert_metric,rating,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cosmos and Psyche: Intimations of a New World View,1.6,3.0,-0.25,4.25,1213
The Closing of the American Mind,2.11,5.0,-0.25,3.76,5822
Being and Nothingness,3.27,10.0,-0.21,3.99,33625
Industrial Society and Its Future,1.33,4.0,-0.21,3.86,11534
12 Rules for Life: An Antidote to Chaos,1.91,6.0,-0.19,3.92,252695
The Varieties of Religious Experience,2.9,8.0,-0.19,4.01,12570
The Road to Serfdom,2.02,5.0,-0.18,4.15,25121
"Enlightenment Now: The Case for Reason, Science, Humanism, and Progress",2.55,3.0,-0.18,4.2,31404
The Teachings of Don Juan: A Yaqui Way of Knowledge,2.59,4.0,-0.18,3.94,44399
"A Study of History, Abridgement of Vols 1-6",1.72,2.0,-0.18,4.1,518


In [16]:
your_reviewers

Unnamed: 0_level_0,review_count,review_pct,score,score_normed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6243880-beauregard-bottomley,91,0.710938,60.428122,0.058576
175635-trevor,125,0.428082,45.158822,0.043774
60074558-xander,54,0.857143,44.880492,0.043505
25683251,78,0.537931,37.065215,0.035929
1651956-riku-sayuj,93,0.444976,35.195484,0.034116
14046996-roy-lotz,94,0.429224,34.068133,0.033024
42133960-peiman-e-iran,62,0.601942,33.717631,0.032684
3897817-morgan-blackledge,69,0.526718,31.969986,0.03099
974210-erik-graff,83,0.448649,31.722378,0.03075
108995717-dan,32,0.941176,29.754678,0.028842


### User similarity recommender

In [177]:
user_id = '155041466'

# user_id = '153156500-michelle-lee'
base_url = 'https://www.goodreads.com/user/show/'
user_url = base_url+user_id

def get_user_info(url):
    user = UserMetaData(url, review_pages=4)
    user.get_metadata()

    user_metadata = user.retrieve_metadata()

    user.get_review_info()
    user_reviews = user.retrieve_reviews()

    return user_metadata, user_reviews

def review_coverage(user_reviews, all_books = all_books):
    coverage = len(user_reviews[user_reviews.title.isin(all_books.title)])/len(user_reviews)
    return coverage

In [18]:
def replace_zeros_with_nonzero_mean(arr):
    arr = arr.copy()  # avoid modifying original array
    non_zero_mean = arr[arr != 0].mean()
    print(non_zero_mean)
    arr[arr == 0] = non_zero_mean
    return arr

In [19]:
user_metadata, user_reviews = get_user_info(user_url)
user_reviews = pd.DataFrame(user_reviews)
user_reviews['rating'] = replace_zeros_with_nonzero_mean(user_reviews.rating.values)

4.517241379310345


In [20]:
user_reviews.head()

Unnamed: 0,user_id,title_id,title,rating,votes
0,155041466,1052.The_Richest_Man_in_Babylon,The Richest Man in Babylon,4,0
1,155041466,4866.How_to_Stop_Worrying_and_Start_Living,How to Stop Worrying and Start Living: Time-Te...,4,0
2,155041466,61439040-1984,1984,5,0
3,155041466,51893.Thus_Spoke_Zarathustra,Thus Spoke Zarathustra,4,0
4,155041466,6708.The_Power_of_Now,The Power of Now: A Guide to Spiritual Enlight...,5,0


In [229]:
user_reviews

Unnamed: 0,user_id,title_id,title,rating,votes,zscore_normed_rating
0,155041466,1052.The_Richest_Man_in_Babylon,The Richest Man in Babylon,4,0,-0.244134
1,155041466,4866.How_to_Stop_Worrying_and_Start_Living,How to Stop Worrying and Start Living: Time-Te...,4,0,-0.163792
2,155041466,61439040-1984,1984,5,0,-1.0
3,155041466,51893.Thus_Spoke_Zarathustra,Thus Spoke Zarathustra,4,0,-0.069174
4,155041466,6708.The_Power_of_Now,The Power of Now: A Guide to Spiritual Enlight...,5,0,0.81198
5,155041466,857333.The_Art_of_Learning,The Art of Learning: A Journey in the Pursuit ...,5,0,-1.0
6,155041466,612188.Memories_Dreams_Reflections,"Memories, Dreams, Reflections",5,0,0.819219
7,155041466,30013.Prelude_to_Foundation,Prelude to Foundation,5,0,-1.0
8,155041466,13079982-fahrenheit-451,Fahrenheit 451,5,0,-1.0
9,155041466,1303.The_48_Laws_of_Power,The 48 Laws of Power,4,0,-0.094543


##### Compact user reviews

In [21]:
compact_labeled_reviews = all_labeled_reviews[all_labeled_reviews.user_id.isin(users_with_data)]
compact_user_genre_count, compact_genre_pct = get_user_genre_counts(compact_labeled_reviews)

In [22]:
genre_labels = construct_book_genre_labels(all_books)
all_labeled_reviews = label_reviews_with_genre(all_reviews, genre_labels)

all_labeled_reviews = compact_labeled_reviews.copy()

In [23]:
this_user_reviews_labeled = label_reviews_with_genre(user_reviews, genre_labels)

In [24]:
this_user_genre_counts, this_user_genre_pct = get_user_genre_counts(this_user_reviews_labeled)

In [25]:
M = compact_genre_pct.values
v = this_user_genre_pct.values

In [26]:
similarities = cosine_similarity(M.T, v.T).ravel()

In [27]:
other_users = compact_genre_pct.T.index

In [28]:
similarity_ranker = pd.DataFrame({'other_users': other_users, 'genre_similarity': similarities})
similarity_ranker = similarity_ranker.sort_values(by = 'genre_similarity', ascending = False)
similarity_ranker.head(10)

Unnamed: 0,other_users,genre_similarity
2675,1741426-odai-al-saeed,0.95429
8715,86266847-ammit-p-chawda,0.942506
3919,25664045-gary,0.941953
485,10907075-mustafa-hasan,0.940251
8284,7747793-rahul,0.938931
268,104930325-andrew-padilla,0.937633
1757,14202578-obied-alahmed,0.937579
3139,19839938-kaveh,0.93568
7752,69159766-mekhala-bhatt,0.934362
92,10179929-sanjay,0.932416


In [29]:
similarity_ranker['score'] = similarity_ranker['genre_similarity']
similarity_ranker = similarity_ranker.set_index("other_users")
similarity_ranker.head()

Unnamed: 0_level_0,genre_similarity,score
other_users,Unnamed: 1_level_1,Unnamed: 2_level_1
1741426-odai-al-saeed,0.95429,0.95429
86266847-ammit-p-chawda,0.942506,0.942506
25664045-gary,0.941953,0.941953
10907075-mustafa-hasan,0.940251,0.940251
7747793-rahul,0.938931,0.938931


In [31]:
'16562982-ahmad-khater' in main_user_item_matrix.index

True

In [32]:
def get_recommendation_from_top(ranker, user_item_matrix = main_user_item_matrix, num_reviewers = 50):
    top_n = get_top_n_reviewers(ranker, num_reviewers)
    experts = top_n.index
    amount_of_say = top_n['score_normed']

    """ maybe write this as another atom func """
    # user_item_matrix for top reviewers of this genre
    expert_user_item_matrix = get_expert_user_item_matrix(user_item_matrix, top_n.index)
    adjusted_expert_user_item_matrix = center_user_item_matrix(expert_user_item_matrix)
    
    adjusted_expert_ratings = get_expert_ratings(adjusted_expert_user_item_matrix, top_n)
    adjusted_expert_ratings[['expert_wavg_rating',f'experts_who_read_out_of_{num_reviewers}']] = [avg_expert_rating(book, top_n, expert_user_item_matrix) for book in adjusted_expert_ratings.index]

    return adjusted_expert_ratings

In [33]:
ratings_for_me = get_recommendation_from_top(similarity_ranker)

In [34]:
ratings_for_me.head(30)

Unnamed: 0_level_0,expert_metric,expert_wavg_rating,experts_who_read_out_of_50
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Outsider,0.070943,5.0,2.0
The Master and Margarita,0.068116,5.0,2.0
The Heart of a Dog,0.065169,5.0,2.0
The 7 Habits of Highly Effective People: Powerful Lessons in Personal Change,0.057079,5.0,3.0
How to Stop Worrying and Start Living,0.056167,4.662519,3.0
Me Talk Pretty One Day,0.052101,4.003652,3.0
The Laws of Human Nature,0.050922,5.0,2.0
"The Body Keeps the Score: Brain, Mind, and Body in the Healing of Trauma",0.049385,5.0,2.0
Death of a Salesman,0.04935,5.0,2.0
The Tyranny of Merit: What's Become of the Common Good?,0.047535,5.0,1.0


In [145]:
centered_zscore_matrix.head()

Unnamed: 0_level_0,"""A Problem from Hell"": America and the Age of Genocide","""Don't You Know Who I Am?"": How to Stay Sane in an Era of Narcissism, Entitlement, and Incivility","""Stand Back,"" Said the Elephant, ""I'm Going to Sneeze!""","""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character",#ACCELERATE: Manifesto for an Accelerationist Politics,"#NaNoWri War Z, Hugh Howey Must Die",'Tis the Damn Season,"(Not that You Asked): Rants, Exploits, and Obsessions",... But I'm NOT Racist!: Tools for Well-Meaning Whites,...And Ladies of the Club,...,gods with a little g,"iGen: Why Today’s Super-Connected Kids Are Growing Up Less Rebellious, More Tolerant, Less Happy--and Completely Unprepared for Adulthood--and What That Means for the Rest of Us",phineas gage,the prophet,unSweetined,ट्वेल्थ फेल | Twelfth Fail | 12th Fail,ठीक तुम्हारे पीछे [Theek Tumhare Peechhe],はぴまり~Happy Marriage!?~ 1,バッカーノ！The Rolling Bootlegs,默读 [Mo Du] The Light in the Night
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-otis-chandler,0.0,0.0,0.0,0.438354,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001905-lynn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100019622-vonda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100022104-jasmine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10010399-kathleen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find users with similar taste/ratings

In [197]:
def get_title_zscore_dict(ex_title, books_rating_stats = books_rating_stats):
    z_score_cols = ['5_zscore', '4_zscore', '3_zscore', '2_zscore', '1_zscore']
    rating_values = [5, 4, 3, 2, 1]

    title_zscores = books_rating_stats[books_rating_stats.title == ex_title][z_score_cols].values.ravel()
    res_dict = {r: z for r, z in zip(rating_values, title_zscores)}

    return res_dict

def get_single_title_rating_zscore(title, rating):
    assert rating in {1,2,3,4,5}, "Rating must be an int from 1-5"
    
    zscore_dict = get_title_zscore_dict(title)
    if zscore_dict:
        res = zscore_dict[rating]
        return res

    return -1

def get_zscore_ratings_for_title_list(titles, ratings):
    res = [get_single_title_rating_zscore(title, rating) for title, rating in zip(titles, ratings)]
    return res
    

In [198]:
user_books = user_reviews.title.unique()
user_reviews['zscore_normed_rating'] = get_zscore_ratings_for_title_list(user_reviews.title, user_reviews.rating)
user_reviews.head()

Unnamed: 0,user_id,title_id,title,rating,votes,zscore_normed_rating
0,155041466,1052.The_Richest_Man_in_Babylon,The Richest Man in Babylon,4,0,-0.244134
1,155041466,4866.How_to_Stop_Worrying_and_Start_Living,How to Stop Worrying and Start Living: Time-Te...,4,0,-0.163792
2,155041466,61439040-1984,1984,5,0,-1.0
3,155041466,51893.Thus_Spoke_Zarathustra,Thus Spoke Zarathustra,4,0,-0.069174
4,155041466,6708.The_Power_of_Now,The Power of Now: A Guide to Spiritual Enlight...,5,0,0.81198


In [190]:
centered_zscore_matrix_T = centered_zscore_matrix.T
reduced_item_user_matrix = centered_zscore_matrix_T[centered_zscore_matrix_T.index.isin(user_books)]

In [212]:
user_zscores_normed = user_reviews[['title', 'zscore_normed_rating']]
user_zscores_normed = user_zscores_normed[user_zscores_normed.zscore_normed_rating != -1]
user_zscores_normed['zscore_normed_rating'] = user_zscores_normed['zscore_normed_rating'] - np.mean(user_zscores_normed['zscore_normed_rating'])
user_zscores_normed = user_zscores_normed.set_index('title')
user_zscores_normed.columns = [user_id]
user_zscores_normed

Unnamed: 0_level_0,155041466
title,Unnamed: 1_level_1
The Richest Man in Babylon,-0.386553
How to Stop Worrying and Start Living: Time-Tested Methods for Conquering Worry,-0.306211
Thus Spoke Zarathustra,-0.211592
The Power of Now: A Guide to Spiritual Enlightenment,0.669562
"Memories, Dreams, Reflections",0.676801
The 48 Laws of Power,-0.236962
The Prince,0.017762
Life of Pi,-0.084859
The Plague,-1.268756
How to Win Friends & Influence People,0.654311


In [223]:
def remove_users_who_have_no_opinion(reduced_item_user_matrix):
    M = reduced_item_user_matrix.T
    M = M.loc[~(M == 0).all(axis=1)]
    return M.T

In [230]:
user_specific_matrix = remove_users_who_have_no_opinion(reduced_item_user_matrix)
user_specific_matrix_joined = user_specific_matrix.merge(user_zscores_normed, left_index = True,
                                                                                      right_index= True)

user_specific_matrix_joined.head()

Unnamed: 0,100759205-kat,10077465-anurag-vaishnav,100778714-andrea-hartmann,100790884-abby-moore,100807586-nellian,101358081-imme-van-gorp,101693182-tharindu-dissanayake,10171516-jessica,101718720-kamal,10179929-sanjay,...,9711335-mallory,972022-emily,974210-erik-graff,97513431-tamoghna-biswas,97546350-giorgia-reads,98090277-juju,9882308-atlas,99413969-simon-ri,996039-rosieface,155041466
12 Rules for Life: An Antidote to Chaos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.783463
All the Light We Cannot See,0.0,1.161975,0.0,-0.91693,0.0,0.0,0.0,-0.317322,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645326
Brave New World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.873131
Can't Hurt Me: Master Your Mind and Defy the Odds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106675,0.0,-0.453185
Crime and Punishment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.4418


In [278]:
def elements_in_both_arr(a, b):
    # Create mask for non-zero values in both arrays
    mask = (a != 0) & (b != 0)
    
    # Apply mask to filter both arrays
    a_filtered = a[mask]
    b_filtered = b[mask]

    return a_filtered, b_filtered

def cosine_similarity_2_arrs(a, b):
    res = cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0,0]
    return res

def cos_similarity(arr1, arr2):
    a_filtered, b_filtered = elements_in_both_arr(arr1, arr2)

    res = cosine_similarity_2_arrs(a_filtered, b_filtered)
    sample_size = len(a_filtered)
    
    return res, sample_size

In [283]:
arr1 = user_specific_matrix_joined.iloc[:, -1].values
arr2 = user_specific_matrix_joined.iloc[:, -1].values

In [284]:
cos, s = cos_similarity(arr1, arr2)

In [289]:
# Assume user_specific_matrix_joined is your DataFrame
arr2 = user_specific_matrix_joined.iloc[:, -1].values  # last column
results = {}

for col in user_specific_matrix_joined.columns[:-1]:  # all columns except the last
    similarity, sample_size = cos_similarity(user_specific_matrix_joined[col].values, arr2)
    results[col] = {
        'cosine_similarity': similarity,
        'sample_size': sample_size
    }

# Convert to DataFrame if needed
results_user_specific_matrix_joined = pd.DataFrame.from_dict(results, orient='index')

In [303]:
user_similarity_ranking = results_user_specific_matrix_joined.sort_values(by = 'sample_size', ascending = False)
user_similarity_ranking['score'] = user_similarity_ranking['cosine_similarity'] * user_similarity_ranking['sample_size']
user_similarity_ranking = user_similarity_ranking.sort_values('score', ascending=False)

In [307]:
user_similarity_ranking.head(20)

Unnamed: 0,cosine_similarity,sample_size,score
101693182-tharindu-dissanayake,0.765807,5,3.829036
27788046-sean-barrs,0.519788,7,3.638516
5431458-henry-avila,0.98008,3,2.940241
30181442-yun,0.443001,6,2.658006
124132123-lisa-of-troy,0.277684,8,2.221475
42130592-chai-thelibrairie-on-tiktok,0.999983,2,1.999967
14002983-charlotte-may,0.998367,2,1.996734
46308842-maria-clara,0.997412,2,1.994825
2120682-eden-prosper,0.992717,2,1.985433
38342050-ying-ying,0.986776,2,1.973552


In [312]:
ratings_for_me_2 = get_recommendation_from_top(user_similarity_ranking, num_reviewers = 20)

In [314]:
ratings_for_me_2.head(10)

Unnamed: 0_level_0,expert_metric,expert_wavg_rating,experts_who_read_out_of_20
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
To Kill a Mockingbird,0.243911,5.0,3.0
The Return of the King,0.242299,5.0,3.0
A Clash of Kings,0.234275,5.0,3.0
The Vegetarian,0.227397,4.809098,4.0
The Priory of the Orange Tree,0.208431,5.0,3.0
The Book of Hope: A Survival Guide for Trying Times,0.200505,5.0,3.0
The Song of Achilles,0.198605,4.479504,5.0
A Gentleman in Moscow,0.196139,5.0,4.0
Charlie and the Chocolate Factory,0.191877,4.553865,4.0
The Picture of Dorian Gray,0.19004,4.613224,5.0
