In [1]:

import pandas as pd
from collections import defaultdict
from collections import Counter
import numpy as np

In [2]:
#create pd dataframes

bk_df = pd.read_csv('books.csv')
rt_df = pd.read_csv('ratings.csv')
tr_df = pd.read_csv('to_read.csv')

bk_df.authors = [authors.split(",")[0] for authors in bk_df.authors] #delete all but LEAD authors for simplicity

#keep only users who have more than 160 ratings + rt
combi_df = pd.concat([rt_df, tr_df], ignore_index = True) #a dataframe of all ratings or book listings
total_entry_counts = Counter(combi_df.user_id)
total_entries_list = [total_entry_counts[user_id] for user_id in combi_df.user_id]
combi_df['total_user_entries'] = total_entries_list

In [3]:
NR_CLUSTERS = 30
MIN_COUNT_IMPORTANT_AUTHOR = 100 #min total entries for an author to be considered a defining author of any cluster
MIN_COUNT_IMPORTANT_BOOK = 50 #min total entries for a book to be considered a defining book of any cluster
TOP_N = 10

#finding the cutoff for which users to consider
MIN_USER_RATINGS = 50
MAX_USER_RATINGS = 83

In [4]:
bk_df

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9.780440e+12,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9.780441e+12,Ilona Andrews,2010.0,Bayou Moon,...,17204,18856,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...
9996,9997,208324,208324,1084709,19,067973371X,9.780680e+12,Robert A. Caro,1990.0,Means of Ascent,...,12582,12952,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9997,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,9421,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...
9998,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...


In [5]:
combi_df = combi_df[(combi_df["total_user_entries"] >= MIN_USER_RATINGS) & (combi_df["total_user_entries"] <= MAX_USER_RATINGS)]
print(combi_df.shape)

(174097, 4)


In [6]:
combi_df.head()

Unnamed: 0,user_id,book_id,rating,total_user_entries
351,25,14,4.0,60
352,25,8336,4.0,60
353,25,653,5.0,60
354,25,5374,4.0,60
355,25,872,4.0,60


In [7]:
len(set(combi_df.user_id)) #how many users we winnowed it down to

2432

In [8]:
#create dictionaries of indexes


#set of USERS
users_set = set(combi_df.user_id)
index2userid = {index:user_id for index, user_id in enumerate(users_set)}
userid2index = {user_id:index for index,user_id in index2userid.items()}


#set of lead authors of books
author_set = set(bk_df.authors)

#give each author an index that connects to their name
index2author = defaultdict()
for index, author in enumerate(author_set):
    index2author[index] = author
    
author2index = {author:index for index, author in index2author.items()}
nr_authors = len(index2author)


#set of books
book_set = set(bk_df.book_id)

#give each BOOK an INDEX that connects to their name
index2bookid = defaultdict()
for index, book_id in enumerate(book_set):
    index2bookid[index] = book_id
    
bookid2index = {book_id:index for index, book_id in index2bookid.items()}
nr_books = len(bookid2index)
    
#connect each book id to the title in a dictionary
#connect each book id to the author index
bookid2title = defaultdict()
bookid2authorindex = defaultdict()
for _, row in bk_df.iterrows():
    bookid2title[row.book_id] = row.original_title 
    bookid2authorindex[row.book_id] = author2index[row.authors] #assign work id to the author index in another dictionary
    


In [9]:
nr_users = len(set(combi_df.user_id))
user_vectors = np.zeros((nr_users, nr_authors)) #each row is a user, each column is an author

for book_id, user_id in zip(combi_df.book_id, combi_df.user_id): #iterate through ratings
    author_index = bookid2authorindex[book_id]
    user_index = userid2index[user_id]
    user_vectors[user_index, author_index] += 1
    
print("We have {} authors and {} users, considering a total of {} user-book interactions.".format(nr_authors, nr_users, len(combi_df.user_id) ))

We have 3888 authors and 2432 users, considering a total of 174097 user-book interactions.


In [10]:
#NORMALIZE user vectors

empty_users = []

for user_index in range(nr_users):
    vector_sum = np.sum(user_vectors[user_index, :])
    if vector_sum > 0:
        user_vectors[user_index] = user_vectors[user_index]/vector_sum
    else:
        user_vectors[user_index] = np.zeros(nr_authors)

#users without any data should be removed from the user vectors
for user_id in empty_users:
    user_vectors.pop(user_id)


In [11]:
#find the top-shelved author of a specific user
index2author[np.argmax(user_vectors[0])]


'Patricia Cornwell'

### Clustering

In [12]:
from sklearn.cluster import KMeans

In [13]:
# #define AVERAGE user and subtract from all user vectors to center the whole set of points around the origin

# avg_user = np.mean(user_vectors, axis = 0)
# avg_user

In [14]:
kmeans = KMeans(n_clusters=NR_CLUSTERS, random_state=0).fit(user_vectors)


In [15]:
cluster_counts = Counter(kmeans.labels_) #this is a label corresponding to each user in the user vectors arra1y

In [16]:
len(kmeans.labels_)

2432

In [17]:
cluster_counts

Counter({21: 608,
         10: 94,
         3: 152,
         9: 23,
         4: 251,
         14: 71,
         7: 199,
         11: 133,
         0: 49,
         8: 35,
         23: 89,
         22: 55,
         12: 108,
         2: 162,
         6: 24,
         15: 8,
         28: 10,
         13: 35,
         1: 102,
         26: 30,
         20: 17,
         18: 60,
         19: 37,
         24: 30,
         25: 6,
         17: 5,
         27: 8,
         5: 10,
         16: 17,
         29: 4})

In [18]:
# #subtract average user from each cluster
# cluster_centers = [center - avg_user for center in kmeans.cluster_centers_] #these are the cluster centers, MINUS THE AVERAGE USER

# #print the top author at the center of each cluster
# for cluster_coordinates in cluster_centers:
#     print(index2author[np.argmax(cluster_coordinates)])

In [19]:
kmeans.cluster_centers_

array([[ 0.00000000e+00, -6.77626358e-21,  0.00000000e+00, ...,
         0.00000000e+00,  1.69406589e-21,  4.33680869e-19],
       [ 0.00000000e+00, -1.35525272e-20,  5.42101086e-20, ...,
        -1.69406589e-20, -5.08219768e-21,  3.95845143e-03],
       [ 0.00000000e+00,  7.71604938e-05,  3.14522941e-04, ...,
        -1.35525272e-20, -1.69406589e-21,  1.08420217e-18],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  4.62962963e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -1.69406589e-21,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [20]:
#dictionary of USER_ID to CLUSTER
userid2cluster = {index2userid[index] : cluster_label for index, cluster_label in enumerate(kmeans.labels_)}
str(userid2cluster)[:400] #preview

'{32774: 21, 40968: 10, 16: 3, 17: 9, 24595: 4, 32788: 21, 21: 14, 40980: 7, 40981: 11, 24600: 3, 25: 14, 49171: 0, 40987: 4, 32798: 8, 38: 3, 42: 3, 49195: 4, 49197: 23, 48: 22, 16438: 12, 32824: 4, 24633: 2, 59: 3, 32829: 7, 63: 14, 41025: 7, 66: 14, 68: 14, 41028: 2, 70: 22, 41031: 8, 24646: 2, 8260: 2, 74: 6, 77: 3, 78: 3, 80: 3, 49235: 10, 86: 3, 49238: 21, 24664: 21, 91: 14, 8285: 4, 94: 14, '

### Using TF-IDF to find the defining authors and books for each cluster

In [21]:
# Getting a 2D array of total author counts for each author and cluster
nr_clusters = len(Counter(kmeans.labels_))

combi_df.head() #this is the dataframe we will iterate through

Unnamed: 0,user_id,book_id,rating,total_user_entries
351,25,14,4.0,60
352,25,8336,4.0,60
353,25,653,5.0,60
354,25,5374,4.0,60
355,25,872,4.0,60


In [22]:
#create a 2D array with # rows = # of clusters, each column the cluster count for that AUTHOR
cluster_author_counts = np.zeros((nr_clusters, nr_authors))

#create a 2D array with # rows = # of clusters, each column the cluster count for that BOOK
cluster_book_counts = np.zeros((nr_clusters, nr_books))

total_entries_by_author = np.zeros(nr_authors)
total_entries_by_book = np.zeros(nr_books)
total_entries_by_cluster = np.zeros(nr_clusters)
for book_id, user_id in zip(combi_df.book_id, combi_df.user_id): #iterate through each rating/shelving
    
    #author counts
    author_index = bookid2authorindex[book_id] #get author index of the book
    total_entries_by_author[author_index] += 1 #add to that author's total count
    total_entries_by_cluster[userid2cluster[user_id]] += 1 #add to cluster's total entry count
    
    cluster_author_counts[userid2cluster[user_id], bookid2authorindex[book_id]] += 1 #add to that author's count for the relevant cluste
    
    #book counts
    book_index = bookid2index[book_id] #get  index of the book
    total_entries_by_book[book_index] += 1 #add to that author's total count
    cluster_book_counts[userid2cluster[user_id], book_index] += 1 #add to that author's count for the relevant cluste
    
total_entry_count = len(combi_df.book_id)
    

In [23]:
np.sum(cluster_author_counts[:,0]) == total_entries_by_author[0] #make sure the sum of each author's entry counts by cluster is the same as the total entries for author

True

### Uniqueness score for authors

In [24]:
# Calculate TF-IDF-LIKE score for each author in each cluster

# FORMULA: tf = author count in cluster / total cluster entries
#.  df = author count in whole corpus / total entries

cluster_author_uniqueness = np.zeros((nr_clusters, nr_authors))

#loop thtoruh geach cluster-author pair
for cluster in range(nr_clusters):
    for author_index in range(nr_authors):
        
        #frequency of that author in that cluster: nr occurences of author in cluster / total nr of entries in cluster
        tf = cluster_author_counts[cluster, author_index] / total_entries_by_cluster[cluster]
        
        #document frequency: nr occurences of author in cluster / total entries for author in whole corpus
        if total_entries_by_author[author_index] > MIN_COUNT_IMPORTANT_AUTHOR: #make sure author actually is rated
            idf = total_entry_count / total_entries_by_author[author_index]
            score = tf*idf
        else:
            score = 0 #unrated authors get a score of 0
        
        #now ccalculate uniqueness score
        cluster_author_uniqueness[cluster, author_index] = score

In [25]:
#Counter(cluster_author_uniqueness[0])

In [26]:
for cluster_scores in cluster_author_uniqueness:
    top_idx = np.argsort(cluster_scores)[-TOP_N:]
    top_values = [index2author[i] for i in top_idx]
    print(top_values)
    #print(index2author[np.argmax(cluster_coordinates)])

['Jessica Sorensen', 'J.A. Redmerski', 'Katie Ashley', 'Katy Evans', 'Jay Crownover', 'Nicole  Williams', 'Aurora Rose Reynolds', 'Tijan', 'Kristen Proby', 'Abbi Glines']
['Julie Kagawa', 'Lauren Kate', 'Cassandra Clare', 'Kami Garcia', 'L.J. Smith', 'Sarah J. Maas', 'Kelley Armstrong', 'Sara Shepard', 'P.C. Cast', 'Richelle Mead']
['Daniel Kahneman', 'Charles Duhigg', 'Michael   Lewis', 'Anthony Robbins', 'Chip Heath', 'Dan Ariely', 'Daniel H. Pink', 'Nassim Nicholas Taleb', 'Seth Godin', 'James C. Collins']
['Charles Bukowski', 'William Faulkner', 'Albert Camus', 'Fyodor Dostoyevsky', 'Franz Kafka', 'José Saramago', 'Jean-Paul Sartre', 'Haruki Murakami', 'Italo Calvino', 'Milan Kundera']
['Erin Hunter', 'Chetan Bhagat', 'John Tiffany', 'Jonathan Stroud', 'Orson Scott Card', 'Christopher Paolini', 'Andrea Hirata', 'Michael Scott', 'Dee Lestari', 'J.K. Rowling']
['J.D. Robb', 'Deborah Harkness', 'Anne Rice', 'Sherrilyn Kenyon', 'Kristin Hannah', 'Daniel Silva', 'Julie Garwood', 'Judith

### Uniqueness score for books

In [27]:
# Calculate TF-IDF-LIKE score for each book in each cluster

# FORMULA: tf = book count in cluster / total cluster entries
#.  df = book count in whole corpus / total entries

cluster_book_uniqueness = np.zeros((nr_clusters, nr_books))

#loop thtoruh geach cluster-book pair
for cluster in range(nr_clusters):
    for book_index in range(nr_books):
        
        #frequency of that book in that cluster: nr occurences of book in cluster / total nr of entries in cluster
        tf = cluster_book_counts[cluster, book_index] / total_entries_by_cluster[cluster]
        
        #document frequency: nr occurences of book in cluster / total entries for book in whole corpus
        if total_entries_by_book[book_index] > MIN_COUNT_IMPORTANT_BOOK: #make sure book actually is rated often enough
            idf = total_entry_count / total_entries_by_book[book_index]
            score = tf*idf
        else:
            score = 0 #unrated books get a score of 0
        
        #now ccalculate uniqueness score
        cluster_book_uniqueness[cluster, book_index] = score
        
for cluster_scores in cluster_book_uniqueness:
    top_idx = np.argsort(cluster_scores)[-1*TOP_N:]
    top_values = [bookid2title[index2bookid[i]] for i in top_idx]
    print(top_values)
    #print(index2author[np.argmax(cluster_coordinates)])

['Come Away with Me', nan, 'One Week Girlfriend', nan, 'Fallen Too Far (Too Far, #1)', 'Never Too Far (Too Far, #2)', 'Breathe', 'Forever Too Far (Too Far, #3)', 'The Vincent Brothers (The Vincent Boys, #1)', 'Twisted Perfection (Perfection, #1)']
['Fallen', 'Beautiful Creatures', 'Throne of Glass', 'Clockwork Angel', 'City of Ashes', 'City of Glass', nan, 'City of Lost Souls', 'Marked', 'City of Fallen Angels']
['The 4-Hour Workweek: Escape 9-5, Live Anywhere, and Join the New Rich', 'Predictably Irrational: The Hidden Forces that Shape Our Decisions', 'The Black Swan: The Impact of the Highly Improbable', 'Influence: The Psychology of Persuasion', "Good to Great: Why Some Companies Make the Leap... and Others Don't", 'The Magic of Thinking Big', 'Drive: The Surprising Truth About What Motivates Us', 'The Lean Startup\u200e', 'Built to Last: Successful Habits of Visionary Companies', 'Made to Stick: Why Some Ideas Survive and Others Die']
['بوف کور', ' سمفونی مردگان', 'Nine Stories', 

### Cluster report function



In [28]:
def cluster_report(cluster_index):
    """Give a summary of this cluster"""
    top_books_idx = np.argsort(cluster_book_uniqueness[cluster_index])[-1*TOP_N:]
    top_authors_idx = np.argsort(cluster_author_uniqueness[cluster_index])[-1*TOP_N:]
    
    top_book_values = [bookid2title[index2bookid[i]] for i in top_books_idx]
    top_author_values = [index2author[i] for i in top_authors_idx]
    
    report = ""
    
    report += ">>You have been assigned to user cluster #"+str(cluster_index)+"\n"
    report += "\n\nAbout {}% of the Goodreads community is in this cluster.".format(100*cluster_counts[cluster_index]/nr_users)
    
    report += "\n\nThe defining books for this user cluster are:"
    for book in top_book_values:
        report+= "\n   "+str(book)
        
    report += "\n\nThe most defining authors for this user cluster are:"
    for author in top_author_values:
        report +="\n   "+author
    report +="\n"
    
    return report, cluster_index
    
#cluster_report(2)

### Predicting a new user

In [29]:
new_user_authors = (author2index["James Joyce"],)
new_user_vector = np.zeros(nr_authors)
for author_index in new_user_authors:
    new_user_vector[author_index] = 1
new_user_vector
cluster_report(kmeans.predict([new_user_vector])[0])

('>>You have been assigned to user cluster #3\n\n\nAbout 6.25% of the Goodreads community is in this cluster.\n\nThe defining books for this user cluster are:\n   بوف کور\n    سمفونی مردگان\n   Nine Stories\n   Идиот\n   La peste\n   La Nausée\n   Мастер и Маргарита\n   Ansichten eines Clowns\n   ねじまき鳥クロニクル [Nejimakidori kuronikuru]\n   Der Prozeß\n\nThe most defining authors for this user cluster are:\n   Charles Bukowski\n   William Faulkner\n   Albert Camus\n   Fyodor Dostoyevsky\n   Franz Kafka\n   José Saramago\n   Jean-Paul Sartre\n   Haruki Murakami\n   Italo Calvino\n   Milan Kundera\n',
 3)

### Letting a user create their own user vector interactively

In [49]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, fixed, interactive, VBox, HBox, Layout
from IPython.display import display, clear_output
import matplotlib.pyplot as plt



In [55]:
authors = []

style = {'description_width': '130px'}

#author entry field
author_field = widgets.Combobox(
    placeholder='Start typing',
    options=list(author2index.keys()),
    description='Pick up to 4 authors:',
    ensure_option=True,
    style = style,
    disabled=False)

output = widgets.Output()

output2 = widgets.Output()


#ADD button
add_button = widgets.Button(description="Add author")

def on_add_clicked(b):
    with output:
        clear_output()
    if len(authors) < 4:
        if author_field.value not in authors and author_field.value != "":
            authors.append(author_field.value)
            sliders[len(authors)-1].description = author_labels()[len(authors)-1]
        vb_sliders.children = sliders[:len(authors)]
    elif len(authors) >= 4:
        with output:
            print("Can't add more than 4 authors!", authors)

add_button.on_click(on_add_clicked)

#RESET button
reset_button = widgets.Button(description="Reset")

def on_reset_clicked(b):
    authors.clear()
    vb_sliders.children = []
    for slider in sliders: slider.value=0.5
    with output:
        clear_output()   
    with output2:
        clear_output() 
        

reset_button.on_click(on_reset_clicked)

#hbox combining add and reset
add_reset = HBox([add_button, reset_button])

#pie chart generating function
def cluster_pie(target_cluster):
    labels = []
    explode = []
    sizes = []

    for cluster, count in cluster_counts.items():
        if cluster == target_cluster:
            explode.append(0.4)
            labels.append(cluster)
        else:
            explode.append(0)
            labels.append(None)
        sizes.append(count)

    # Plot
    plt.pie(sizes, labels=labels, explode = explode)

    plt.axis('equal')
    plt.show()

#compute cluster
compute_button = widgets.Button(description="Generate report")
def on_compute_clicked(b):
    
    #create list of authors
    new_user_authors = [author2index[author] for author in authors]
    
    #turn into user vector
    new_user_vector = np.zeros(nr_authors)
    for idx, author_index in enumerate(new_user_authors):
        new_user_vector[author_index] = vb_sliders.children[idx].value
        
    #predict cluster if any authors are selected currently
    if len(authors) > 0:
        report, pred_cluster = cluster_report(kmeans.predict([new_user_vector])[0])

        with output2:
            clear_output()
            cluster_pie(pred_cluster)

            print(report)
        
compute_button.on_click(on_compute_clicked)

#slider for authors

def author_labels():
    labels = ["author 1", "author 2", "author 3", "author 4"]
    authors_list = list(authors)
    for index, author in enumerate(authors_list):
        labels[index] = author
    return labels

sliders = [widgets.FloatSlider(min=5, max=10, value=5, step=0.5, style=style, layout = Layout(width='400px')), 
           widgets.FloatSlider(min=5, max=10, value=5, step=0.5, style=style, layout = Layout(width='400px')), 
           widgets.FloatSlider(min=5, max=10, value=5, step=0.5, style=style, layout = Layout(width='400px')), 
           widgets.FloatSlider(min=5, max=10, value=5, step=0.5, style=style, layout = Layout(width='400px'))]

vb_sliders = VBox()

explanation = widgets.HTML(
    value="Welcome to <b>Literary Clusters</b>. <br> Choose up to four authors and tell us how much you like them on a scale of 5 (okay) to 10 (amazing). <br>We'll tell you which Goodreads cluster you fit into.",
)


file = open("logo.png", "rb")
image = file.read()
logo = widgets.Image(
    value=image,
    format='png',
    width = 380
)

display(logo, explanation, author_field, add_reset, vb_sliders, output, compute_button, output2)


    



Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03*\x00\x00\x00\xc8\x08\x06\x00\x00\x00\x07\xad\x15…

HTML(value="Welcome to <b>Literary Clusters</b>. <br> Choose up to four authors and tell us how much you like …

Combobox(value='', description='Pick up to 4 authors:', ensure_option=True, options=('Harry Bernstein', "Danie…

HBox(children=(Button(description='Add author', style=ButtonStyle()), Button(description='Reset', style=Button…

VBox()

Output()

Button(description='Generate report', style=ButtonStyle())

Output()