In [31]:
import sys
import warnings

warnings.filterwarnings('ignore')
sys.path.append(r"C:\Users\13477\Desktop\New Adventure\Goodreads\goodreads_prod")

from static import *
from UserScraper import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

from joblib import Parallel, delayed
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def process_raw_books(books):
    res = books.drop_duplicates(subset = ['title'])
    res = res.dropna()
    return res

def process_raw_reviews(reviews):
    """ Make sure each user only reviews a book once """
    res = reviews.drop_duplicates(subset = ['title', 'user_id'], keep = 'first')
    res = res.dropna()

    return res


### Data preprocessing

#### Load data from parquets (scraped from goodreads)

In [2]:
# 16,578 books, 476,364 reviews, 9466 complete user profiles 
all_books = pd.read_parquet("all_books_final.parquet")
all_books = process_raw_books(all_books)
all_books_ratings = all_books[['title', 'rating', 'num_ratings']]

all_reviews = pd.read_parquet("english_reviews_final.parquet")
all_reviews = process_raw_reviews(all_reviews)

users_data = pd.read_parquet("users_data_final.parquet")

all_books = all_books[all_books.title.isin(all_reviews.title)]
users_with_data = users_data.user_id.unique()

In [3]:
users_data['is_notable'] = users_data['is_best_reviewer'] | users_data['is_most_followed']
users_data[users_data.is_notable]

Unnamed: 0,user_url,user_id,name,num_ratings,avg_rating,num_reviews,is_best_reviewer,reviewer_rank,is_most_followed,follow_rank,is_notable
26,https://www.goodreads.com/user/show/28020639-i...,28020639-iben-frederiksen,Iben Frederiksen,496,3.68,308,True,55,True,24,True
37,https://www.goodreads.com/user/show/35915119-a...,35915119-ashley-daviau,Ashley Daviau,2248,4.10,2137,True,37,False,0,True
40,https://www.goodreads.com/user/show/46560610-b...,46560610-beatriz,Beatriz,1029,3.37,946,True,1,True,9,True
119,https://www.goodreads.com/user/show/15736557-l...,15736557-lynne-king,Lynne King,862,4.07,499,True,9,True,41,True
127,https://www.goodreads.com/user/show/59214246-s...,59214246-sara-lowe,Sara Lowe,81,4.12,17,True,30,True,41,True
...,...,...,...,...,...,...,...,...,...,...,...
14540,https://www.goodreads.com/user/show/132812492,132812492,الف‌م‌ی‌ر,95,3.87,34,True,90,False,0,True
14541,https://www.goodreads.com/user/show/12543333,12543333,حماس,319,2.96,252,True,58,False,0,True
14587,https://www.goodreads.com/user/show/20234777-t...,20234777-tawallah,Tawallah,1624,3.36,1125,True,2,False,0,True
14604,https://www.goodreads.com/user/show/18009940-dean,18009940-dean,Dean,558,4.51,533,True,58,False,0,True


In [4]:
np.corrcoef(users_data['num_ratings'], users_data['num_reviews'])

array([[1.        , 0.77111958],
       [0.77111958, 1.        ]])

In [5]:
ratings_to_reviews_ratio = np.mean(users_data['num_ratings']/users_data['num_reviews'])
ratings_to_reviews_ratio

5.495404887418799

In [6]:
np.std(users_data['num_ratings']/users_data['num_reviews'])

18.15884616487855

#### Label reviews with genres

In [119]:
def construct_book_genre_labels(all_books):
    mlb = MultiLabelBinarizer(classes=genres)
    genre_matrix = mlb.fit_transform(all_books['genres'])
    genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=all_books.index)
    genre_labels = pd.concat([all_books[['title']], genre_df], axis=1)

    return genre_labels

def label_reviews_with_genre(all_reviews, genre_labels):
    all_labeled_reviews = all_reviews.merge(
        genre_labels, 
        on='title', 
        how='inner'
    )

    all_labeled_reviews = all_labeled_reviews.drop_duplicates(subset=['title', 'user_id', 'rating'])
    return all_labeled_reviews

genre_labels = construct_book_genre_labels(all_books)
all_labeled_reviews = label_reviews_with_genre(all_reviews, genre_labels)

labeled_reviews_users_with_data = all_labeled_reviews[all_labeled_reviews.user_id.isin(users_with_data)]

#### User genre counts

For each user (col), how many books from each genre (row) have they read?

In [8]:
def get_user_genre_counts(reviews):
    
    user_genre_counts = reviews.groupby('user_id')[genres].sum().T  # genres as index
    num_reviews_by_user = reviews.groupby('user_id')['title'].count()
    user_genre_pct = user_genre_counts.div(num_reviews_by_user, axis = 1)

    return user_genre_counts, user_genre_pct

In [9]:
user_genre_counts, user_genre_pct = get_user_genre_counts(all_labeled_reviews)

#### User item matrix

In [10]:
def construct_user_item_matrix(reviews):
    reviews_grouped = reviews.groupby(['user_id', 'title'])['rating'].mean().reset_index()
    user_item_matrix = reviews_grouped.pivot(index='user_id', columns='title', values='rating')
    user_item_matrix = user_item_matrix.fillna(0)

    return user_item_matrix

def condense_user_item_matrix(user_item_matrix, n = 3):

    filtered_matrix = user_item_matrix[user_item_matrix.astype(bool).sum(axis=1) >= n]
    filtered_matrix = filtered_matrix.loc[:, filtered_matrix.astype(bool).sum(axis=0) >= n]

    return filtered_matrix

### Genre recommender

In [11]:
my_genre = 'Philosophy'

#### Find top reviewers for genre

In [12]:
def get_score(count, pct, alpha = 1):
    score = count * pct**alpha
    return score

def user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct):
    genre_review_count_ranked = user_genre_counts.loc[my_genre, :].sort_values(ascending = False)
    genre_pct_of_reviews_ranked = user_genre_pct.loc[my_genre, :].sort_values(ascending = False)

    res = pd.DataFrame({"review_count": genre_review_count_ranked, "review_pct": genre_pct_of_reviews_ranked})
    return res

def get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = 1):
    user_read_counts = user_read_counts_for_genre(my_genre, user_genre_counts, user_genre_pct)
    user_read_counts['score'] = get_score(user_read_counts['review_count'], user_read_counts['review_pct'], alpha = alpha)

    user_read_counts = user_read_counts[user_read_counts.review_count > 0]
    user_read_counts = user_read_counts.sort_values(by = 'score', ascending = False)
    return user_read_counts

def get_top_n_reviewers(ranker, n):
    top_n = ranker.head(n)
    top_n['score_normed'] = top_n['score']/np.sum(top_n['score'])

    return top_n

#### Suggest books from top reviewers

In [81]:
def filter_reviews_for_genre(my_genre, labeled_reviews):
    reviews_filtered = labeled_reviews[labeled_reviews[my_genre] == 1]
    reviews_filtered = reviews_filtered.drop(columns = my_genre)
    return reviews_filtered[['title', 'user_id', 'rating']]

def get_expert_user_item_matrix(user_item_matrix, experts):
    expert_user_item_matrix =  user_item_matrix[user_item_matrix.index.isin(experts)]
    expert_user_item_matrix = expert_user_item_matrix.loc[experts]

    return expert_user_item_matrix

def center_user_item_matrix(user_item_matrix):
    """ Assumes users are rows"""
    
    tmp = user_item_matrix.replace(0, np.NaN)
    tmp_means = tmp.mean(axis = 1)

    new_tmp = tmp.sub(tmp_means, axis = 0)
    new_tmp = new_tmp.fillna(0)

    return new_tmp

def adjust_expert_user_item_matrix(expert_user_item_matrix, adjust_value = 3):
    """ Adjust ratings downward by adjust_value"""
    adjusted_expert_user_item_matrix = expert_user_item_matrix.where(expert_user_item_matrix == 0, expert_user_item_matrix - adjust_value)
    return adjusted_expert_user_item_matrix

def lookup_rating(user_item_matrix, user_id, book_name):
    return user_item_matrix.loc[user_id, book_name]

def ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix):
    experts = top_n_reviewers.index
    amount_of_say = top_n_reviewers['score_normed']
    
    wavgs = pd.DataFrame(amount_of_say)
    wavgs['book_rating'] = [lookup_rating(expert_user_item_matrix, u, book_name) for u in experts]
    wavgs = wavgs[wavgs.book_rating != 0]

    return wavgs

def avg_expert_rating(book_name, top_n_reviewers, expert_user_item_matrix):
    amount_of_say = top_n_reviewers['score_normed']
    experts = top_n_reviewers.index
    
    wavgs = ratings_of_those_who_read(book_name, top_n_reviewers, expert_user_item_matrix)
    res = np.dot(wavgs['score_normed'], wavgs['book_rating'])/np.sum(wavgs['score_normed'])
    
    return res, len(wavgs)

def get_expert_ratings(expert_user_item_matrix, top_n_reviewers):
    """ sum(amount of say * rating) for everyone who rated the book for each book
    
    The more people who interacted...the more the score will be affected
    E.g. 10 people who rated positive > 5 people who rated positive
    """
    amount_of_say = top_n_reviewers['score_normed']

    expert_ratings = expert_user_item_matrix.T.dot(amount_of_say)
    expert_ratings= pd.DataFrame(expert_ratings)
    
    expert_ratings.columns = ['expert_metric']
    expert_ratings = expert_ratings.drop_duplicates()
    expert_ratings = expert_ratings.sort_values(by = 'expert_metric', ascending = False)

    return expert_ratings

def merge_expert_with_overall(expert_rating, all_books_rating, num_reviewers = 50):
    merged = expert_rating.merge(all_books_ratings, left_index=True, right_on='title', how='inner')
    merged = merged.set_index('title')
    
    merged = merged[['expert_wavg_rating', f'experts_who_read_out_of_{num_reviewers}', 
                     'expert_metric', 'rating', 'num_ratings']]

    return merged

In [84]:
def top_n_genre_expert_recommendations(my_genre, all_labeled_reviews, user_genre_counts, user_genre_pct, 
                                       num_reviewers = 50, how_many = 10, alpha = 1.2):
    # filter reviews from this genre and transform to user_item_matrix
    genre_filtered_reviews = filter_reviews_for_genre(my_genre, all_labeled_reviews)
    genre_user_item_matrix = construct_user_item_matrix(genre_filtered_reviews)
    genre_user_item_matrix = condense_user_item_matrix(genre_user_item_matrix)

    # use top reviewers to decide which books to recommend
    genre_ranker = get_genre_ranker(my_genre, user_genre_counts, user_genre_pct, alpha = alpha)
    
    top_n = get_top_n_reviewers(genre_ranker, num_reviewers)
    experts = top_n.index
    amount_of_say = top_n['score_normed']


    """ maybe write this as another atom func """
    # user_item_matrix for top reviewers of this genre
    expert_user_item_matrix = get_expert_user_item_matrix(genre_user_item_matrix, top_n.index)
    adjusted_expert_user_item_matrix = center_user_item_matrix(expert_user_item_matrix)

    adjusted_expert_ratings = get_expert_ratings(adjusted_expert_user_item_matrix, top_n)
    adjusted_expert_ratings[['expert_wavg_rating',f'experts_who_read_out_of_{num_reviewers}']] = [avg_expert_rating(book, top_n, expert_user_item_matrix) for book in adjusted_expert_ratings.index]
    # adjusted_expert_ratings['experts_who_read'] = adjusted_expert_ratings['experts_who_read']/num_reviewers
    # adjusted_expert_ratings['experts_who_read'] = adjusted_expert_ratings['experts_who_read'].apply(lambda x: f"{x * 100:.0f}%")

    adjusted_expert_ratings = adjusted_expert_ratings.round(2)
    adjusted_expert_ratings = adjusted_expert_ratings[['expert_wavg_rating', f'experts_who_read_out_of_{num_reviewers}', 'expert_metric']]

    best = adjusted_expert_ratings.head(how_many)
    controversial = adjusted_expert_ratings.tail(how_many).sort_values(by = 'expert_metric')
    
    best = merge_expert_with_overall(best, all_books_ratings)
    controversial = merge_expert_with_overall(controversial, all_books_ratings)

    return best, controversial

In [85]:
best, controversial = top_n_genre_expert_recommendations('Philosophy', all_labeled_reviews,
                                                         user_genre_counts, user_genre_pct)

In [86]:
best

Unnamed: 0_level_0,expert_wavg_rating,experts_who_read_out_of_50,expert_metric,rating,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"In Search of Lost Time, Volume 1: The Way by Swann's",4.88,10.0,0.19,4.15,65076
Tractatus Logico Philosophicus,4.66,8.0,0.18,4.1,21407
The Symposium,4.6,9.0,0.17,4.09,83867
Phenomenology of Spirit,4.7,8.0,0.17,3.96,19718
Swann's Way,4.87,9.0,0.17,4.15,65076
Introduction to Metaphysics,4.61,6.0,0.16,4.03,4260
An Enquiry Concerning Human Understanding,4.24,5.0,0.16,3.94,21653
The Elementary Forms of the Religious Life,4.3,4.0,0.13,3.93,3660
Darwin's Dangerous Idea: Evolution and the Meanings of Life,4.47,5.0,0.13,4.07,16774
Capital: A Critique of Political Economy Volume 1,4.23,8.0,0.13,4.09,22854


In [87]:
controversial

Unnamed: 0_level_0,expert_wavg_rating,experts_who_read_out_of_50,expert_metric,rating,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cosmos and Psyche: Intimations of a New World View,1.6,3.0,-0.25,4.25,1213
The Closing of the American Mind,2.11,5.0,-0.25,3.76,5822
Being and Nothingness,3.27,10.0,-0.21,3.99,33625
Industrial Society and Its Future,1.33,4.0,-0.21,3.86,11534
12 Rules for Life: An Antidote to Chaos,1.91,6.0,-0.19,3.92,252695
The Varieties of Religious Experience,2.9,8.0,-0.19,4.01,12570
The Road to Serfdom,2.02,5.0,-0.18,4.15,25121
"Enlightenment Now: The Case for Reason, Science, Humanism, and Progress",2.55,3.0,-0.18,4.2,31404
The Teachings of Don Juan: A Yaqui Way of Knowledge,2.59,4.0,-0.18,3.94,44399
"A Study of History, Abridgement of Vols 1-6",1.72,2.0,-0.18,4.1,518


### User similarity recommender

In [141]:
user_id = '155041466'
base_url = 'https://www.goodreads.com/user/show/'
user_url = base_url+user_id

def get_user_info(url):
    user = UserMetaData(url, review_pages=4)
    user.get_metadata()

    user_metadata = user.retrieve_metadata()

    user.get_review_info()
    user_reviews = user.retrieve_reviews()

    return user_metadata, user_reviews



In [179]:
def replace_zeros_with_nonzero_mean(arr):
    arr = arr.copy()  # avoid modifying original array
    non_zero_mean = arr[arr != 0].mean()
    print(non_zero_mean)
    arr[arr == 0] = non_zero_mean
    return arr

In [180]:
user_metadata, user_reviews = get_user_info(user_url)
user_reviews = pd.DataFrame(user_reviews)

In [181]:
user_reviews.head()

Unnamed: 0,user_id,title_id,title,rating,votes
0,155041466,1052.The_Richest_Man_in_Babylon,The Richest Man in Babylon,0,0
1,155041466,4866.How_to_Stop_Worrying_and_Start_Living,How to Stop Worrying and Start Living: Time-Te...,4,0
2,155041466,61439040-1984,1984,5,0
3,155041466,51893.Thus_Spoke_Zarathustra,Thus Spoke Zarathustra,0,0
4,155041466,6708.The_Power_of_Now,The Power of Now: A Guide to Spiritual Enlight...,5,0


In [182]:
user_reviews['rating'] = replace_zeros_with_nonzero_mean(user_reviews.rating.values)

4.517241379310345


In [184]:
user_reviews.head()

Unnamed: 0,user_id,title_id,title,rating,votes
0,155041466,1052.The_Richest_Man_in_Babylon,The Richest Man in Babylon,4,0
1,155041466,4866.How_to_Stop_Worrying_and_Start_Living,How to Stop Worrying and Start Living: Time-Te...,4,0
2,155041466,61439040-1984,1984,5,0
3,155041466,51893.Thus_Spoke_Zarathustra,Thus Spoke Zarathustra,4,0
4,155041466,6708.The_Power_of_Now,The Power of Now: A Guide to Spiritual Enlight...,5,0


In [185]:
user_title_list = user_reviews.title.unique()

In [186]:
user_related_reviews = all_labeled_reviews[all_labeled_reviews.title.isin(user_title_list)]
user_item_matrix = construct_user_item_matrix(user_related_reviews)

In [188]:
m_T = user_item_matrix.T
m_T = m_T[m_T.index.isin(user_title_list)]
m_T

user_id,100476652-irun-a,100759205-kat,10077465-anurag-vaishnav,100778714-andrea-hartmann,100790884-abby-moore,100807586-nellian,101358081-imme-van-gorp,101459683-frank-hidalgo-gato-dur-n,101693182-tharindu-dissanayake,10171516-jessica,...,975193-kathy,97546350-giorgia-reads,975596-jesse,98090277-juju,9882308-atlas,99062839-don,99413969-simon-ri,9946499-roxanne,996039-rosieface,9999244-jessi-galloway
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12 Rules for Life: An Antidote to Chaos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
All the Light We Cannot See,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Brave New World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Can't Hurt Me: Master Your Mind and Defy the Odds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
Crime and Punishment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Educated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
How to Stop Worrying and Start Living: Time-Tested Methods for Conquering Worry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
How to Win Friends & Influence People,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
In Five Years,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"JFK: Coming Of Age In The American Century, 1917-1956",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
user_title_rating = user_reviews[['title', 'rating']]
user_title_rating = user_title_rating.rename(columns = {'rating': user_id})
user_title_rating = user_title_rating.set_index('title')
print(len(user_title_rating))
user_title_rating.head()

40


Unnamed: 0_level_0,155041466
title,Unnamed: 1_level_1
The Richest Man in Babylon,4
How to Stop Worrying and Start Living: Time-Tested Methods for Conquering Worry,4
1984,5
Thus Spoke Zarathustra,4
The Power of Now: A Guide to Spiritual Enlightenment,5


In [190]:
user_item_matrix_joined = m_T.merge(user_title_rating, left_index = True, right_index = True)
user_item_matrix_joined

Unnamed: 0_level_0,100476652-irun-a,100759205-kat,10077465-anurag-vaishnav,100778714-andrea-hartmann,100790884-abby-moore,100807586-nellian,101358081-imme-van-gorp,101459683-frank-hidalgo-gato-dur-n,101693182-tharindu-dissanayake,10171516-jessica,...,97546350-giorgia-reads,975596-jesse,98090277-juju,9882308-atlas,99062839-don,99413969-simon-ri,9946499-roxanne,996039-rosieface,9999244-jessi-galloway,155041466
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12 Rules for Life: An Antidote to Chaos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
All the Light We Cannot See,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Brave New World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Can't Hurt Me: Master Your Mind and Defy the Odds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4
Crime and Punishment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
Educated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
How to Stop Worrying and Start Living: Time-Tested Methods for Conquering Worry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
How to Win Friends & Influence People,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
In Five Years,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
"JFK: Coming Of Age In The American Century, 1917-1956",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [191]:
user_item_matrix_T = user_item_matrix_joined.T
user_item_matrix_T_centered = center_user_item_matrix(user_item_matrix_T)
user_item_matrix_T_centered.head()

title,12 Rules for Life: An Antidote to Chaos,All the Light We Cannot See,Brave New World,Can't Hurt Me: Master Your Mind and Defy the Odds,Crime and Punishment,Educated,How to Stop Worrying and Start Living: Time-Tested Methods for Conquering Worry,How to Win Friends & Influence People,In Five Years,"JFK: Coming Of Age In The American Century, 1917-1956",...,The Martian,The Midnight Library,The Myth of Sisyphus,The Plague,The Power of Habit: Why We Do What We Do in Life and Business,The Power of Now: A Guide to Spiritual Enlightenment,The Prince,The Richest Man in Babylon,The Stranger,Thus Spoke Zarathustra
100476652-irun-a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100759205-kat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10077465-anurag-vaishnav,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100778714-andrea-hartmann,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100790884-abby-moore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
cos_sim_matrix = cosine_similarity(user_item_matrix_T)
similarity_df = pd.DataFrame(cos_sim_matrix, index=user_item_matrix_T.index, columns=user_item_matrix_T.index)
similarity_df

Unnamed: 0,100476652-irun-a,100759205-kat,10077465-anurag-vaishnav,100778714-andrea-hartmann,100790884-abby-moore,100807586-nellian,101358081-imme-van-gorp,101459683-frank-hidalgo-gato-dur-n,101693182-tharindu-dissanayake,10171516-jessica,...,97546350-giorgia-reads,975596-jesse,98090277-juju,9882308-atlas,99062839-don,99413969-simon-ri,9946499-roxanne,996039-rosieface,9999244-jessi-galloway,155041466
100476652-irun-a,1.00000,0.00000,0.000000,0.00000,0.000000,0.00000,1.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,0.00000,1.00000,0.165380
100759205-kat,0.00000,1.00000,0.000000,1.00000,0.000000,1.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,1.00000,0.00000,0.165380
10077465-anurag-vaishnav,0.00000,0.00000,1.000000,0.00000,1.000000,0.00000,0.00000,0.00000,0.00000,0.339276,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,0.00000,0.00000,0.206725
100778714-andrea-hartmann,0.00000,1.00000,0.000000,1.00000,0.000000,1.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,1.00000,0.00000,0.165380
100790884-abby-moore,0.00000,0.00000,1.000000,0.00000,1.000000,0.00000,0.00000,0.00000,0.00000,0.339276,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,0.00000,0.00000,0.206725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99413969-simon-ri,0.00000,0.00000,0.000000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,1.00000,0.0,0.00000,0.00000,0.165380
9946499-roxanne,0.00000,0.00000,0.000000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,0.00000,0.00000,0.000000
996039-rosieface,0.00000,1.00000,0.000000,1.00000,0.000000,1.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,1.00000,0.00000,0.165380
9999244-jessi-galloway,1.00000,0.00000,0.000000,0.00000,0.000000,0.00000,1.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.0,0.00000,1.00000,0.165380


In [193]:
similarity_score = similarity_df[user_id].sort_values(ascending = False)
similarity_score = similarity_score[similarity_score.index != user_id]

In [194]:
similarity_ranker = pd.DataFrame({'similarity': similarity_score})
similarity_ranker

Unnamed: 0,similarity
614778-ahmad-sharabiani,0.776947
16958299-dr-appu-sasidharan-dasfill,0.528886
124132123-lisa-of-troy,0.497859
70395042-fergus-weaver-of-autistic-webs,0.484434
32879029-emma,0.476294
...,...
9520302-umberto,0.000000
67918882-zainab,0.000000
79694445-hanan-baba,0.000000
9946499-roxanne,0.000000


In [195]:
user_review_counts = users_data[['user_id', 'num_reviews']]
user_review_counts = user_review_counts.set_index('user_id')

In [196]:
similarity_ranker = similarity_ranker.merge(user_review_counts, left_index = True, right_index = True)

In [203]:
#similarity_ranker['score'] = get_score(similarity_ranker['num_reviews'], similarity_ranker['similarity'], alpha = )

similarity_ranker['score'] = similarity_ranker['similarity']
similarity_ranker = similarity_ranker.sort_values(by = 'score', ascending = False)
similarity_ranker

Unnamed: 0,similarity,num_reviews,score
614778-ahmad-sharabiani,0.776947,9563,0.776947
16958299-dr-appu-sasidharan-dasfill,0.528886,1360,0.528886
124132123-lisa-of-troy,0.497859,844,0.497859
32879029-emma,0.476294,2415,0.476294
27788046-sean-barrs,0.454941,1122,0.454941
...,...,...,...
9520302-umberto,0.000000,59,0.000000
67918882-zainab,0.000000,121,0.000000
79694445-hanan-baba,0.000000,4,0.000000
9946499-roxanne,0.000000,38,0.000000


In [214]:
top_n = get_top_n_reviewers(similarity_ranker, 10).iloc[1:]
experts = top_n.index
amount_of_say = top_n['score_normed']

In [215]:
top_n

Unnamed: 0,similarity,num_reviews,score,score_normed
16958299-dr-appu-sasidharan-dasfill,0.528886,1360,0.528886,0.109103
124132123-lisa-of-troy,0.497859,844,0.497859,0.102702
32879029-emma,0.476294,2415,0.476294,0.098254
27788046-sean-barrs,0.454941,1122,0.454941,0.093849
21348532-liong,0.45306,266,0.45306,0.093461
101693182-tharindu-dissanayake,0.44376,308,0.44376,0.091543
10171516-jessica,0.427833,2630,0.427833,0.088257
78485297-mario-the-lone-bookwolf,0.404695,805,0.404695,0.083484
30181442-yun,0.383309,602,0.383309,0.079072


In [216]:
full_user_item_matrix = construct_user_item_matrix(labeled_reviews_users_with_data)

In [217]:
full_user_item_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9396 entries, 1-otis-chandler to 999171-anita
Columns: 16576 entries, "A Problem from Hell": America and the Age of Genocide to 默读 [Mo Du] The Light in the Night
dtypes: float64(16576)
memory usage: 1.2+ GB


In [218]:
expert_user_item_matrix = get_expert_user_item_matrix(full_user_item_matrix, experts)
adjusted_expert_user_item_matrix = center_user_item_matrix(expert_user_item_matrix)

In [219]:
adjusted_expert_ratings = get_expert_ratings(adjusted_expert_user_item_matrix, top_n)

In [220]:
adjusted_expert_ratings

Unnamed: 0_level_0,expert_metric
title,Unnamed: 1_level_1
The Picture of Dorian Gray,0.472723
We Should All Be Feminists,0.447104
The Little Prince,0.436358
The Ocean at the End of the Lane,0.427834
"Pride and Prejudice: The Complete Novel, with Nineteen Letters from the Characters' Correspondence, Written and Folded by Hand",0.412560
...,...
The Time Traveler's Wife,-0.453489
The Woman in the Library,-0.510691
Atlas Shrugged,-0.570850
The Woman in the Window,-0.637368


In [221]:
adjusted_expert_ratings[['expert_wavg_rating',f'experts_who_read_out_of_{10}']] = [avg_expert_rating(book, top_n, expert_user_item_matrix) for book in adjusted_expert_ratings.index]

In [222]:
adjusted_expert_ratings.head(30)

Unnamed: 0_level_0,expert_metric,expert_wavg_rating,experts_who_read_out_of_10
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Picture of Dorian Gray,0.472723,4.684291,6.0
We Should All Be Feminists,0.447104,4.639786,6.0
The Little Prince,0.436358,5.0,4.0
The Ocean at the End of the Lane,0.427834,5.0,3.0
"Pride and Prejudice: The Complete Novel, with Nineteen Letters from the Characters' Correspondence, Written and Folded by Hand",0.41256,5.0,4.0
The Seven Husbands of Evelyn Hugo,0.412508,4.733213,4.0
Anxious People,0.406032,4.58837,5.0
To Kill a Mockingbird,0.391998,5.0,4.0
Romeo and Juliet,0.382312,4.74772,4.0
Project Hail Mary,0.365219,4.492283,6.0
