In [29]:
import sys
import warnings

warnings.filterwarnings('ignore')
sys.path.append(r"C:\Users\13477\Desktop\New Adventure\Goodreads\goodreads_prod")

from static import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException



def process_raw_books(books):
    """Drop repeated book rows, then drop rows containing any missing value.

    Two rows count as repeats when they agree on every column except
    'genres', which is left out of the comparison.
    """
    identity_cols = []
    for name in books.columns:
        if name != 'genres':
            identity_cols.append(name)

    return books.drop_duplicates(subset=identity_cols).dropna()

def process_raw_reviews(reviews):
    """Keep the first review per (title, user_id) pair, then drop rows with any missing value."""
    return (
        reviews
        .drop_duplicates(subset=['title', 'user_id'], keep='first')
        .dropna()
    )

# Load each parquet file and clean it in one step
original_books = pd.read_parquet("original_books.parquet").pipe(process_raw_books)
original_reviews = pd.read_parquet("user_book_reviews.parquet").pipe(process_raw_reviews)

new_books = pd.read_parquet("new_books.parquet").pipe(process_raw_books)
new_reviews = pd.read_parquet("user_new_book_reviews.parquet").pipe(process_raw_reviews)

# Top-5 reviews per original user: keep the cleaned full frame and a
# three-column projection of it
original_users_top_5_reviews_full = (
    pd.read_parquet("top_5_reviews_for_each_original_user.parquet")
    .pipe(process_raw_reviews)
)
original_users_top_5_reviews = original_users_top_5_reviews_full[['title', 'user_id', 'rating']]

In [30]:
# 29,636 books
# ignore_index=True gives the combined frame fresh, unique row labels.
# Without it the labels of both inputs are kept as-is (and can repeat);
# all_books.index is reused below to line the genre matrix up with the titles.
all_books = pd.concat([original_books, new_books], ignore_index=True)
all_books = process_raw_books(all_books)

In [31]:
# 702,390 reviews (around 24 reviews per book)
# NOTE(review): original_users_top_5_reviews only has title/user_id/rating.
# If the other two frames carry extra columns, its rows get NaN there and the
# dropna() inside process_raw_reviews removes them -- confirm the schemas match.
all_reviews = (
    pd.concat([original_reviews, new_reviews, original_users_top_5_reviews])
    .pipe(process_raw_reviews)
)

In [32]:
# 219,300 unique users
num_unique_users = all_reviews['user_id'].nunique()

In [33]:
# 9466 users with data
# Keep only profiles that have at least one rating
users_data = (
    pd.read_parquet("users_data.parquet")
    .loc[lambda frame: frame['num_ratings'] > 0]
)
users_with_data = users_data['user_id'].unique()
users_data.head()

Unnamed: 0,user_url,user_id,name,num_ratings,avg_rating,num_reviews,is_best_reviewer,reviewer_rank,is_most_followed,follow_rank
1,https://www.goodreads.com/user/show/7869133-sarah,7869133-sarah,Sarah,185,4.01,28,False,0,False,0
2,https://www.goodreads.com/user/show/143499596-...,143499596-camden-glenn,Camden Glenn,99,4.52,44,False,0,False,0
3,https://www.goodreads.com/user/show/83361162-j...,83361162-josiah-edwards,Josiah Edwards,152,3.59,95,False,0,False,0
4,https://www.goodreads.com/user/show/63671935-g...,63671935-gabriela-bevenuto,Gabriela Bevenuto,155,4.45,30,False,0,False,0
6,https://www.goodreads.com/user/show/139483609-...,139483609-harry-taylor,Harry Taylor,14,4.64,11,False,0,False,0


In [34]:
# Share of users flagged / not flagged as best reviewers
users_data['is_best_reviewer'].value_counts(normalize=True)

is_best_reviewer
False    0.811113
True     0.188887
Name: proportion, dtype: float64

In [35]:
# Share of users flagged / not flagged as most followed
users_data['is_most_followed'].value_counts(normalize=True)

is_most_followed
False    0.915698
True     0.084302
Name: proportion, dtype: float64

#### Label genres for each book/review

In [36]:
# `genres` is not defined in this notebook's visible cells; presumably it
# comes from `from static import *` -- TODO confirm.
target_genres = genres

# Turn each book's 'genres' entry into one indicator column per target genre
mlb = MultiLabelBinarizer(classes=target_genres)
genre_matrix = mlb.fit_transform(all_books['genres'])

# Create a DataFrame with the new columns
# (same row labels as all_books so it can be placed next to the titles)
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=all_books.index)
# title + one column per genre
genre_labels = pd.concat([all_books[['title']], genre_df], axis=1)

In [37]:
# Attach the genre indicator columns to every review.
# genre_labels can hold several rows for one title; merging on 'title' would
# then repeat each review once per label row. Keeping only the first label row
# per title up front yields the same rows the later keep-first de-duplication
# retained, without building the many-to-many intermediate.
all_labeled_reviews = all_reviews.merge(
        genre_labels.drop_duplicates(subset='title', keep='first'),
        on='title',
        how='inner',
        validate='many_to_one',  # fail loudly if titles on the right are not unique
    )

# Safety net: at most one row per (title, user_id, rating)
all_labeled_reviews = all_labeled_reviews.drop_duplicates(subset=['title', 'user_id', 'rating'])

### Genre expertise score for each user

In [48]:
def get_user_genre_counts(reviews, genre_columns=None):
    """Count each user's reviews per genre and the per-user share of each genre.

    Parameters
    ----------
    reviews : DataFrame
        One row per review with 'user_id', 'title' and one indicator column
        per genre.
    genre_columns : list of str, optional
        Genre indicator columns to aggregate. Defaults to the module-level
        ``genres`` list.

    Returns
    -------
    (user_genre_counts, user_genre_pct) : tuple of DataFrame
        Both have genres as the index and user_ids as the columns. The second
        is the first divided, per user, by that user's number of reviews.
    """
    if genre_columns is None:
        genre_columns = genres
    genre_columns = list(genre_columns)

    # Group once and reuse it for both aggregations
    by_user = reviews.groupby('user_id')

    user_genre_counts = by_user[genre_columns].sum().T  # genres as index
    num_reviews_by_user = by_user['title'].count()
    # axis=1 lines the per-user totals up with the user_id columns
    user_genre_pct = user_genre_counts.div(num_reviews_by_user, axis = 1)

    return user_genre_counts, user_genre_pct

In [38]:
# num reviews of each user by genre, and pct reviews by genre for each user
# (computed by the shared helper instead of repeating its body here)
user_genre_counts, user_genre_pct = get_user_genre_counts(all_labeled_reviews)

# num reviews by each user
num_reviews_by_user = all_labeled_reviews.groupby('user_id')['title'].count()

user_genre_counts.head()

user_id,1-otis-chandler,10000084-heidi-clark,100001132-nikoleta,10000169-allegra,100002084-jacob-rutledge,100002600-shatha-awwad,1000041-smaileh,10000471-ellen-s-reviews,10000560-jacquelyn,100006355-torri-blackromanceconnoisseur,...,9999244-jessi-galloway,999928-kevin,99993439-philip,9999401-elizabeth,9999468-deanna,9999500-lexi,9999557-erika,9999943-gretchen-miller,9999971-cindy,9999999-mary
Art,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Biography,22,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
Business,34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Chick Lit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Children's,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# A user is "notable" when flagged as a best reviewer or as most followed.
# .assign builds a new frame, so no column is written onto a slice of users_data.
is_notable_user = (
    users_data[['user_id', 'is_best_reviewer', 'is_most_followed']]
    .assign(is_notable_user=lambda frame: frame['is_best_reviewer'] | frame['is_most_followed'])
    .loc[:, ['user_id', 'is_notable_user']]
)
is_notable_user

Unnamed: 0,user_id,is_notable_user
1,7869133-sarah,False
2,143499596-camden-glenn,False
3,83361162-josiah-edwards,False
4,63671935-gabriela-bevenuto,False
6,139483609-harry-taylor,False
...,...,...
14612,5856801-jimmy,False
14614,123832897-brock-wiebe,False
14615,1310936-jeremy,False
14617,5885995-eric-molicki,False


#### Cosine similarity!!!

In [64]:
# user_id whose most similar users are looked up in the cells below
ex_user = '49501801-carolyn-marie'

In [52]:
# Restrict the labeled reviews to users present in users_with_data,
# then recompute the per-user genre counts on that subset
compact_labeled_reviews = all_labeled_reviews.loc[
    lambda frame: frame['user_id'].isin(users_with_data)
]
compact_user_genre_count, compact_genre_pct = get_user_genre_counts(compact_labeled_reviews)

In [61]:
# Transpose so users are rows
M_T = compact_user_genre_count.T  # shape (num_users, num_genres)
user_indices = M_T.index

# Similarity between every pair of user rows, labeled with user_ids on both axes
cos_sim_matrix = cosine_similarity(M_T)
similarity_df = pd.DataFrame(cos_sim_matrix, index=user_indices, columns=user_indices)

In [62]:
# Display the user-by-user similarity matrix
similarity_df

user_id,1-otis-chandler,10001905-lynn,100019622-vonda,100022104-jasmine,10010399-kathleen,100118640-dwayne,1001852-gayle,1002168-sally,100218389-jan-agaton,1002184-sara,...,99754790-will-wilson,99765491-jimz,99766625-brok3n,9977-bob,9977792-elizabeth-day,9979214-israel,9984005-hanna,99910477-abi,9991498-candice,999171-anita
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-otis-chandler,1.000000,0.763942,0.234333,0.519730,0.794682,0.169514,0.552426,0.274338,0.237521,0.289660,...,0.361450,0.253082,0.577033,0.512703,0.573745,0.752115,0.314124,0.078781,0.306513,0.901359
10001905-lynn,0.763942,1.000000,0.334040,0.569951,0.715671,0.290834,0.767195,0.322193,0.283670,0.304320,...,0.385987,0.395478,0.511844,0.359935,0.589192,0.754150,0.367941,0.127319,0.479301,0.735070
100019622-vonda,0.234333,0.334040,1.000000,0.646419,0.260157,0.848491,0.059761,0.819853,0.702355,0.735230,...,0.684021,0.799422,0.752591,0.467292,0.510252,0.423678,0.717137,0.585571,0.809671,0.254408
100022104-jasmine,0.519730,0.569951,0.646419,1.000000,0.532666,0.540977,0.346688,0.564463,0.622343,0.544999,...,0.610483,0.542727,0.763758,0.693978,0.767426,0.636146,0.554700,0.490290,0.689695,0.612626
10010399-kathleen,0.794682,0.715671,0.260157,0.532666,1.000000,0.191660,0.512148,0.308837,0.219031,0.218779,...,0.281825,0.259566,0.474236,0.493904,0.627577,0.640741,0.402402,0.129337,0.307414,0.822734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9979214-israel,0.752115,0.754150,0.423678,0.636146,0.640741,0.368878,0.625543,0.345782,0.338173,0.463179,...,0.477325,0.520643,0.718590,0.434783,0.527504,1.000000,0.536180,0.126379,0.500639,0.753673
9984005-hanna,0.314124,0.367941,0.717137,0.554700,0.402402,0.673934,0.285714,0.516877,0.479247,0.561435,...,0.427648,0.822987,0.598145,0.431923,0.609868,0.536180,1.000000,0.375200,0.587995,0.344237
99910477-abi,0.078781,0.127319,0.585571,0.490290,0.129337,0.581932,0.000000,0.700517,0.720398,0.457407,...,0.337968,0.389805,0.465248,0.252758,0.543045,0.126379,0.375200,1.000000,0.710283,0.137934
9991498-candice,0.306513,0.479301,0.809671,0.689695,0.307414,0.793492,0.257248,0.853195,0.739167,0.805866,...,0.588885,0.769181,0.740162,0.411239,0.623675,0.500639,0.587995,0.710283,1.000000,0.344377


In [69]:
# Similarity of every other user to ex_user, most similar first
similarities = (
    similarity_df[ex_user]
    .loc[lambda scores: scores.index != ex_user]  # leave out the user themself
    .sort_values(ascending=False)
)
similarities

user_id
38055348-lea               0.983299
18922532-ilenia-zodiaco    0.974259
10272784-joan              0.973384
73395599-sana              0.973239
45198798-leynes            0.972967
                             ...   
4620119-george             0.054365
131030985-pete-hays        0.054365
1373605-crystal-faith      0.054365
353500-joanna              0.054365
3587251-ryan-johnson       0.054365
Name: 49501801-carolyn-marie, Length: 9465, dtype: float64

In [99]:
# user_id / similarity table for the example user
similarity_ranker = similarities.reset_index(name='similarity')

# Rating volume and notable flag per user
users_num_ratings = (
    users_data[['user_id', 'num_ratings']]
    .merge(is_notable_user, on='user_id')
)

similarity_ranker = similarity_ranker.merge(users_num_ratings, on='user_id', how='inner')

similarity_ranker

Unnamed: 0,user_id,similarity,num_ratings,is_notable_user
0,38055348-lea,0.983299,661,True
1,18922532-ilenia-zodiaco,0.974259,875,True
2,10272784-joan,0.973384,726,False
3,73395599-sana,0.973239,783,True
4,45198798-leynes,0.972967,1262,True
...,...,...,...,...
9460,4620119-george,0.054365,2,False
9461,131030985-pete-hays,0.054365,15,False
9462,1373605-crystal-faith,0.054365,17,False
9463,353500-joanna,0.054365,3,False


In [104]:
# score = num_ratings * similarity^5, highest score first
similarity_ranker = (
    similarity_ranker
    .assign(score=lambda frame: frame['num_ratings'] * frame['similarity']**5)
    .sort_values(by='score', ascending=False)
)
similarity_ranker

Unnamed: 0,user_id,similarity,num_ratings,is_notable_user,score
10,614778-ahmad-sharabiani,0.969666,9138,True,7.833622e+03
81,87835420-abigail,0.940502,7646,False,5.626420e+03
448,3498889-cheryl,0.880078,9755,False,5.150304e+03
467,45524597-belinda,0.876903,9279,False,4.811251e+03
556,122647-sarah-sammis,0.866258,9558,False,4.662344e+03
...,...,...,...,...,...
9390,51165768-jared,0.066014,1,False,1.253668e-06
9402,156385582-anissa-galata,0.065900,1,False,1.242851e-06
9420,92257579-mc-savoy,0.065900,1,False,1.242851e-06
9460,4620119-george,0.054365,2,False,9.497471e-07


#### Suggestion

In [115]:
# NOTE(review): construct_user_item_matrix and condense_user_item_matrix are
# defined in a later cell of this notebook (the "User item matrix" section).
# Unless `from static import *` also provides them, this cell fails on a fresh
# top-to-bottom run. Both names are also reassigned later from genre_review.
user_item_matrix = construct_user_item_matrix(compact_labeled_reviews)
condensed_user_item_matrix = condense_user_item_matrix(user_item_matrix)

## Genre: Psychology

### Ranking reviewers

In [12]:
my_genre = 'Psychology'

# Number of reviews each user has in my_genre, largest first
genre_review_count_ranked = user_genre_counts.loc[my_genre].sort_values(ascending=False)
genre_review_count_ranked

user_id
614778-ahmad-sharabiani      236
1045774-mehrsa               161
175635-trevor                136
3897817-morgan-blackledge    115
155663-david-rubenstein      108
                            ... 
207997-stacy                   0
2080192-catherine              0
20802067-doc-fabulous          0
2080212-deb                    0
9999999-mary                   0
Name: Psychology, Length: 219300, dtype: int32

In [13]:
# Fraction of each user's reviews that fall in my_genre, largest first
genre_pct_of_reviews_ranked = user_genre_pct.loc[my_genre].sort_values(ascending=False)
genre_pct_of_reviews_ranked

user_id
3702421-stephanie         1.0
59598703-carey-calvert    1.0
173838287-james           1.0
11713462-ariza            1.0
1738410-chris             1.0
                         ... 
207997-stacy              0.0
2080192-catherine         0.0
20802067-doc-fabulous     0.0
2080212-deb               0.0
9999999-mary              0.0
Name: Psychology, Length: 219300, dtype: float64

In [14]:
# score1 = review_count * review_pct^1.2, highest score first
genre_ranker = (
    pd.DataFrame({"review_count": genre_review_count_ranked, "review_pct": genre_pct_of_reviews_ranked})
    .assign(score1=lambda frame: frame['review_count'] * frame['review_pct']**1.2)
    .sort_values(by='score1', ascending=False)
)

# Display only: users with at least 3 reviews in the genre (genre_ranker itself is not filtered)
genre_ranker[genre_ranker.review_count >= 3]

Unnamed: 0_level_0,review_count,review_pct,score1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3897817-morgan-blackledge,115,0.761589,82.939786
1045774-mehrsa,161,0.438692,59.898735
49415231-amir-tesla,70,0.823529,55.451459
155663-david-rubenstein,108,0.484305,45.244566
175635-trevor,136,0.336634,36.823745
...,...,...,...
13061577-claudia-lomel,3,0.006865,0.007605
5431458-henry-avila,3,0.006550,0.007188
1036893-fabian,3,0.006073,0.006564
10171516-jessica,5,0.003846,0.006324


In [15]:
def get_top_n_reviewers(genre_ranker, n):
    """Return the first ``n`` rows of ``genre_ranker`` with a normalized score column.

    Parameters
    ----------
    genre_ranker : DataFrame
        Must contain a 'score1' column; rows are taken in their current order.
    n : int
        Number of leading rows to keep.

    Returns
    -------
    DataFrame
        A new frame (the input is not modified) with an extra 'score1_normed'
        column: each row's 'score1' divided by the sum of 'score1' over the
        returned rows.
    """
    # Explicit copy: head() returns a slice, and adding a column to a slice
    # triggers pandas' SettingWithCopy warning.
    top_n = genre_ranker.head(n).copy()
    top_n['score1_normed'] = top_n['score1'] / top_n['score1'].sum()

    return top_n

In [16]:
# Same computation as get_top_n_reviewers, so call it instead of repeating its body
top_20 = get_top_n_reviewers(genre_ranker, 20)
top_20.head(5)

Unnamed: 0_level_0,review_count,review_pct,score1,score1_normed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3897817-morgan-blackledge,115,0.761589,82.939786,0.131823
1045774-mehrsa,161,0.438692,59.898735,0.095202
49415231-amir-tesla,70,0.823529,55.451459,0.088133
155663-david-rubenstein,108,0.484305,45.244566,0.071911
175635-trevor,136,0.336634,36.823745,0.058527


### User item matrix

In [114]:
def construct_user_item_matrix(reviews):
    """Pivot reviews into a user_id x title matrix of ratings.

    Several ratings by one user for one title are averaged; user/title pairs
    without a rating are filled with 0.
    """
    mean_ratings = (
        reviews
        .groupby(['user_id', 'title'])['rating']
        .mean()
        .reset_index()
    )
    ratings_wide = mean_ratings.pivot(index='user_id', columns='title', values='rating')

    return ratings_wide.fillna(0)

def condense_user_item_matrix(user_item_matrix, n = 3):
    """Keep users with at least ``n`` non-zero entries, then titles with at least ``n``.

    The title filter is evaluated on the rows that survived the user filter.
    """
    entries_per_user = user_item_matrix.astype(bool).sum(axis=1)
    kept_users = user_item_matrix.loc[entries_per_user >= n]

    entries_per_title = kept_users.astype(bool).sum(axis=0)
    return kept_users.loc[:, entries_per_title >= n]

def filter_reviews_for_genre(all_reviews, my_genre, labels=None):
    """Return the reviews whose title is labeled with ``my_genre``.

    Parameters
    ----------
    all_reviews : DataFrame
        Reviews with at least 'title', 'user_id' and 'rating' columns.
    my_genre : str
        Name of the genre indicator column to filter on.
    labels : DataFrame, optional
        Frame with a 'title' column and a ``my_genre`` indicator column.
        Defaults to the module-level ``genre_labels``.

    Returns
    -------
    DataFrame
        Rows of ``all_reviews`` whose title has ``my_genre == 1`` and contains
        at least one ASCII letter, without the indicator column.
    """
    if labels is None:
        labels = genre_labels

    # One label row per title (the first one). A title that appears several
    # times would otherwise repeat every matching review in the merge; the
    # keep-first de-duplication below retained that same first label row.
    genre_flags = labels[['title', my_genre]].drop_duplicates(subset='title', keep='first')

    # Merge genre labels with reviews on 'title'
    all_reviews_for_genre = all_reviews.merge(
        genre_flags,
        on='title',
        how='inner'
    )

    all_reviews_for_genre = all_reviews_for_genre.drop_duplicates(subset=['title', 'user_id', 'rating'])
    all_reviews_for_genre = all_reviews_for_genre[all_reviews_for_genre[my_genre] == 1]
    all_reviews_for_genre = all_reviews_for_genre.drop(columns = my_genre)

    # only keep titles with at least 1 english char
    all_reviews_for_genre = all_reviews_for_genre[all_reviews_for_genre['title'].str.contains(r'[a-zA-Z]', regex=True)]

    return all_reviews_for_genre

# Function to safely detect language
def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Returns False for non-string input and when langdetect raises
    LangDetectException.
    """
    # Local import: only this helper needs the factory.
    from langdetect import DetectorFactory

    # langdetect's result can vary between runs unless a seed is set.
    # Pin it (without overriding a seed chosen elsewhere) so the same
    # titles are kept on every run.
    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    if not isinstance(text, str):
        return False

    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def filter_english_reviews(reviews):
    """Keep the reviews whose title is classified as English by ``is_english``.

    Each distinct title is classified once.
    """
    english_titles = list(filter(is_english, reviews['title'].unique()))
    keep = reviews['title'].isin(english_titles)

    return reviews[keep]

In [106]:
# Reviews of books labeled with my_genre, then only those with English titles
all_reviews_for_genre = filter_reviews_for_genre(all_reviews, my_genre)
genre_review = filter_english_reviews(all_reviews_for_genre)

In [108]:
# user x title rating matrix for the genre; the condensed version keeps users
# and titles with at least 3 (the default n) non-zero ratings
user_item_matrix = construct_user_item_matrix(genre_review)
condensed_user_item_matrix = condense_user_item_matrix(user_item_matrix)

### Expert suggested books

In [109]:
def lookup_rating(user_item_matrix, user_id, book_name):
    """Return the entry of ``user_item_matrix`` at row ``user_id`` and column ``book_name``."""
    return user_item_matrix.loc[user_id, book_name]

def how_many_read(book_name, experts, ratings=None):
    """Return how many of ``experts`` have a non-zero rating for ``book_name``.

    Parameters
    ----------
    book_name : str
        Column label of the book in the rating matrix.
    experts : iterable of user_id
        Row labels of the experts to check.
    ratings : DataFrame, optional
        user x title rating matrix in which 0 means "not rated". Defaults to
        the module-level ``expert_user_item_matrix``.

    Returns
    -------
    int
        Number of experts whose rating for the book is not 0.
    """
    if ratings is None:
        ratings = expert_user_item_matrix

    book_ratings = ratings.loc[list(experts), book_name].to_numpy()

    # The earlier version computed this selection but never returned anything.
    return int(np.count_nonzero(book_ratings))

def avg_expert_rating(book_name, experts, weights=None, ratings=None):
    """Weighted average rating of ``book_name`` over the experts who rated it.

    Parameters
    ----------
    book_name : str
        Column label of the book in the rating matrix.
    experts : iterable of user_id
        Experts to consider; each must be a row label of both ``weights``
        and ``ratings``.
    weights : DataFrame, optional
        Frame indexed by user_id with a 'score1_normed' column. Defaults to
        the module-level ``amount_of_say``.
    ratings : DataFrame, optional
        user x title rating matrix in which 0 means "not rated". Defaults to
        the module-level ``expert_user_item_matrix``.

    Returns
    -------
    (float, int)
        The 'score1_normed'-weighted mean of the non-zero ratings, and the
        number of experts who rated the book. The mean is NaN when no expert
        rated it.
    """
    if weights is None:
        weights = amount_of_say
    if ratings is None:
        ratings = expert_user_item_matrix

    expert_ids = list(experts)
    # Look both up by user_id so each weight is paired with the rating of the
    # same expert, whatever the row order of the two frames.
    book_ratings = ratings.loc[expert_ids, book_name].to_numpy(dtype=float)
    say = weights.loc[expert_ids, 'score1_normed'].to_numpy(dtype=float)

    has_read = book_ratings != 0
    n_read = int(has_read.sum())
    if n_read == 0:
        # No expert rated the book: avoid evaluating 0/0
        return np.nan, 0

    res = np.dot(say[has_read], book_ratings[has_read]) / np.sum(say[has_read])

    return res, n_read

In [110]:
# define experts
# top 50 rows of genre_ranker, with score1 normalized to sum to 1 over those rows
top_50 = get_top_n_reviewers(genre_ranker, 50)
# each expert's weight in the weighted scores below
amount_of_say = top_50[['score1_normed']]

# user_ids of the experts, in ranking order
experts = top_50.index

In [111]:
# ratings by the "experts", one row per expert in top_50 order.
# reindex (instead of .loc) keeps this working when an expert is missing from
# the condensed matrix: that expert gets an all-zero row, i.e. "rated nothing",
# rather than raising KeyError.
expert_user_item_matrix = condensed_user_item_matrix.reindex(top_50.index, fill_value=0)

# scored by experts: sum over experts of (weight * rating) for every book
expert_ratings = expert_user_item_matrix.T.dot(amount_of_say)
expert_ratings.columns = ['expert_score']
# De-duplicate on the title index. drop_duplicates() would compare only the
# score values and so discard different books that happen to tie on score.
expert_ratings = expert_ratings[~expert_ratings.index.duplicated()]
expert_ratings = expert_ratings.sort_values(by = 'expert_score', ascending = False)

In [112]:
# adjust score (penalize those with 2 stars or less)
# Entries equal to 0 (not rated) stay 0; every other rating is shifted by -3,
# so ratings below 3 contribute negatively and a rating of 3 contributes nothing.
adjusted_expert_user_item_matrix = expert_user_item_matrix.where(expert_user_item_matrix == 0, expert_user_item_matrix - 3)

# scored by experts
adjusted_expert_ratings = adjusted_expert_user_item_matrix.T.dot(amount_of_say)
adjusted_expert_ratings.columns = ['expert_score']
# De-duplicate on the title index. drop_duplicates() would compare only the
# score values and so discard different books that happen to tie on score.
adjusted_expert_ratings = adjusted_expert_ratings[~adjusted_expert_ratings.index.duplicated()]
adjusted_expert_ratings = adjusted_expert_ratings.sort_values(by = 'expert_score', ascending = False)

In [113]:
# avg_expert_rating returns a (weighted average rating, number of experts who
# rated the book) pair; the list of pairs fills the two new columns row by row.
adjusted_expert_ratings[['wavg_rating','experts_who_read']] = [avg_expert_rating(book, experts) for book in adjusted_expert_ratings.index]
adjusted_expert_ratings.head(10)

Unnamed: 0_level_0,expert_score,wavg_rating,experts_who_read
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Behave: The Biology of Humans at Our Best and Worst,0.407505,4.817332,6.0
"How to Change Your Mind: What the New Science of Psychedelics Teaches Us About Consciousness, Dying, Addiction, Depression, and Transcendence",0.396028,5.0,4.0
How to Change Your Mind: The New Science of Psychedelics,0.370594,5.0,3.0
Darwin's Dangerous Idea: Evolution and the Meanings of Life,0.324929,5.0,3.0
Stumbling on Happiness,0.322087,4.589485,6.0
Essentialism: The Disciplined Pursuit of Less,0.30867,4.679008,5.0
How Emotions Are Made: The Secret Life of the Brain,0.297073,4.184956,6.0
Drive: The Surprising Truth About What Motivates Us,0.292698,4.084384,7.0
"The Body Keeps the Score: Brain, Mind, and Body in the Healing of Trauma",0.285259,5.0,4.0
"Peak: Unleashing Your Inner Champion Through Revolutionary Methods for Skill Acquisition and Performance Enhancement in Work, Sports, and Life",0.279871,4.789982,6.0
