In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import random
import scipy.spatial.distance

# Plot styling: scale up seaborn fonts, then apply matplotlib's 'fivethirtyeight' style sheet.
sns.set(font_scale=1.5)
plt.style.use('fivethirtyeight')


# IPython magics: draw figures inline in the notebook and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Load the saved dense df

In [2]:
# Read the gzipped ratings subset, then cast the two integer columns to compact dtypes.
compact_dtypes = {'rating': 'int8', 'total_votes': 'int32'}
df_sub = pd.read_csv('/Volumes/external/Sangeetha-Project/df_sub.csv.gz', compression='gzip')
df_sub = df_sub.astype(compact_dtypes)

## Prediction matrix based on collaborative filtering - baseline only

In [3]:
# Global mean rating over every review; the bias terms below are measured against it.
overall_mean = df_sub['rating'].mean()
overall_mean

4.089157753571024

In [4]:
# User bias: each reviewer's mean rating minus the global mean.
mean_rating_per_user = df_sub.groupby('reviewerId')[['rating']].mean()
bu = mean_rating_per_user - overall_mean
bu.head()

Unnamed: 0_level_0,rating
reviewerId,Unnamed: 1_level_1
A100NGGXRQF0AQ,0.156605
A102Z3T7NSM5KC,0.148937
A106016KSI0YQ,-0.417925
A106E1N0ZQ4D9W,0.099248
A10BZSGALQPS0V,-0.145895


In [5]:
# Item bias: each book's mean rating minus the global mean.
mean_rating_per_book = df_sub.groupby('asin')[['rating']].mean()
bi = mean_rating_per_book - overall_mean
bi.head()

Unnamed: 0_level_0,rating
asin,Unnamed: 1_level_1
000100039X,0.437158
0002007770,0.278872
0002051850,0.300673
0002219417,0.680073
000222383X,0.547206


In [6]:
# Baseline estimate for every (user, item) pair: global mean + user bias + item bias.
# Broadcasting a column of user biases against a row of item biases yields the full grid.
user_bias_column = bu.values.reshape(-1, 1)
item_bias_row = bi.values.reshape(1, -1)
baseline_est = pd.DataFrame(overall_mean + user_bias_column + item_bias_row,
                            index=bu.index, columns=bi.index)

# Clamp the estimates into the [1, 5] range.
baseline_est = baseline_est.clip(lower=1, upper=5)

baseline_est.head()

asin,000100039X,0002007770,0002051850,0002219417,000222383X,000224053X,0002242052,0002247399,0002550938,0002726874,...,452289858,452295297,452295386,452295513,452295831,452296099,452296293,452296315,452296366,452296544
reviewerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100NGGXRQF0AQ,4.682921,4.524635,4.546435,4.925836,4.792969,4.008457,4.549462,3.993814,3.61115,4.792969,...,4.823272,4.734918,3.538958,3.77199,5.0,3.448272,3.468293,3.299462,4.562855,4.156605
A102Z3T7NSM5KC,4.675253,4.516967,4.538768,4.918168,4.785301,4.000789,4.541795,3.986147,3.603483,4.785301,...,4.815604,4.727251,3.53129,3.764322,5.0,3.440604,3.460626,3.291795,4.555187,4.148937
A106016KSI0YQ,4.108391,3.950105,3.971906,4.351306,4.218439,3.433927,3.974932,3.419284,3.036621,4.218439,...,4.248742,4.160388,2.964428,3.19746,4.582075,2.873742,2.893763,2.724932,3.988325,3.582075
A106E1N0ZQ4D9W,4.625564,4.467278,4.489079,4.868479,4.735612,3.9511,4.492105,3.936457,3.553793,4.735612,...,4.765915,4.677561,3.481601,3.714633,5.0,3.390915,3.410936,3.242105,4.505498,4.099248
A10BZSGALQPS0V,4.38042,4.222134,4.243935,4.623335,4.490468,3.705957,4.246962,3.691314,3.30865,4.490468,...,4.520771,4.432418,3.236458,3.469489,4.854105,3.145771,3.165793,2.996962,4.260355,3.854105


In [7]:
# Summarise the index, column count, dtypes and memory usage of the dense estimate matrix.
baseline_est.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2647 entries, A100NGGXRQF0AQ to AZY96LU53VWGO
Columns: 10982 entries, 000100039X to 452296544
dtypes: float64(10982)
memory usage: 221.8+ MB


In [5]:
# Persist the dense baseline matrix (with header row and reviewerId index) as gzipped CSV.
# The path matches the file that the later `pd.read_csv(...)` reload of baseline_est expects;
# the previous '/dVolumes/...baseline_est-.csv.gz' spelling pointed somewhere else.
baseline_est.to_csv('/Volumes/external/Sangeetha-Project/baseline_est.csv.gz', header=True, index=True,
                    compression='gzip')

NameError: name 'baseline_est' is not defined

## Running through an example - user 100 ('A151OW029XD9JU') & book 4599 (0312385846)

In [7]:
# Reload the saved baseline estimates; the first CSV column becomes the row index.
baseline_est = pd.read_csv(
    '/Volumes/external/Sangeetha-Project/baseline_est.csv.gz',
    compression='gzip',
    index_col=0,
)

In [9]:
# Load the ratings subset again (same file and dtype casts as the load near the top of the
# notebook), presumably so this example section can be run on its own.
df_sub = (
    pd.read_csv('/Volumes/external/Sangeetha-Project/df_sub.csv.gz', compression='gzip')
    .astype({'rating': 'int8', 'total_votes': 'int32'})
)

In [61]:
# Choose the user for the worked example.
# The random draw is left commented out; a fixed reviewer id is assigned instead.
#user=random.choice(np.unique(df_sub.reviewerId))
user = 'A2S166WSCFIFP5'
user

'A2S166WSCFIFP5'

In [62]:
# Collect the ids of all books that have more than 10 reviews
# (value_counts orders them from most to least reviewed).
review_counts = df_sub['asin'].value_counts()
has_over_10 = review_counts > 10
books_over_10_reviews = np.array(review_counts[has_over_10].index)
books_over_10_reviews[:5]

array(['0439023483', '0143170090', '030758836X', '0439023513',
       '0141039280'], dtype=object)

In [63]:
# Ids of every book the chosen user has reviewed.
reviewed_by_user = df_sub['reviewerId'] == user
books_read_by_user = df_sub.loc[reviewed_by_user, 'asin'].values
books_read_by_user[:5]

array(['000100039X', '0007136838', '0020519001', '0020519109',
       '006000942X'], dtype=object)

In [64]:
# Books that have over 10 reviews but haven't been read by the user.
# A set gives constant-time membership tests; testing against the numpy array
# rescans the whole array for every candidate book.
read_lookup = set(books_read_by_user)
unread_book_by_user = [x for x in books_over_10_reviews if x not in read_lookup]
unread_book_by_user[:5]

['0439023483', '0143170090', '030758836X', '0439023513', '0141039280']

In [79]:
# Draw a random unread book, then override it with a fixed ASIN. The last assignment
# wins, so the cells below always work with '0002007770'.
book_chosen = random.choice(unread_book_by_user)
#book_chosen = '0312385846'
book_chosen = '0002007770'

In [80]:
# Sanity check: an empty result means the user has no review for the chosen book.
same_user = df_sub['reviewerId'] == user
same_book = df_sub['asin'] == book_chosen
df_sub[same_user & same_book]

Unnamed: 0,reviewerId,asin,rating,summary,total_votes


In [81]:
# Ids of every reviewer who has rated the chosen book.
rated_chosen_book = df_sub['asin'] == book_chosen
reviewers_rated_book = df_sub.loc[rated_chosen_book, 'reviewerId'].values
reviewers_rated_book

array(['APBA7Y4SINS0H', 'A22019GDU7OVFG', 'A1HRYC60VTMYC0',
       'A1QZ6MFP9TSQU6', 'A3T4KN9XO3PIXG', 'A19D3N662QS9WD',
       'A26UNIPEVYE9DE', 'A914TQVHI872U', 'A3E02KBFI21TY0',
       'A3BIWTN2DA0YY2', 'A30H2335OM7RD6', 'A21NVBFIEQWDSG',
       'AJQ1S39GZBKUG', 'A2PN65B6BSTIYZ', 'A3KOO6L0NA2PJA',
       'A3GUK4RWXKHS9I', 'ASPABROV8R7M5', 'A75W6T9I2S8BA',
       'A33E0CODNP98PV', 'A3QVI57VT1VGRO', 'A5H86XYSS0OU6',
       'A2PWUIQG57O1AZ', 'AK536TCZST9B', 'A26BVUB2YJMGB7',
       'A2ATWKOFJXRRR1', 'A1PKJUAQFGNLSX', 'A15NZD5WINDQZM',
       'A39650P2CZUUC9', 'A2VUNCELWAV29X', 'A1GIW9VUC7V2WE',
       'A27QB2QCDK90GP', 'AO3M6LS4V3UY4', 'A7SSCDSCM4PXF',
       'A2F6N60Z96CAJI', 'A2R53V98P2T4Y4', 'AGFNLMUZ09UF1',
       'A2HRPFBYBZ4RJ5', 'A1NOBRCN20NGD5', 'AYT4FJYVCHYLE',
       'A1QQEFV51EMD7E', 'A3F0OFSE2M25U2', 'A35ROK8CVWNF62',
       'A1BI8PUEHA5CHW', 'A19KMJBW99ZTZK', 'A2ZATPER188K3J',
       'AG2YXYIQ8TLTA', 'A2SPCMHMXS2QSF', 'A99CILDQPTRTZ',
       'A3JSDV57XPD937', 'A3SUD7WHS9KI

In [82]:
# Cosine distance between the chosen user's baseline row and the row of every reviewer
# who rated the book, in the same order as `reviewers_rated_book`.
# The chosen user's row never changes inside the loop, so it is looked up once.
user_vector = baseline_est.loc[user, :].values

user_distances = [
    scipy.spatial.distance.cosine(user_vector, baseline_est.loc[reviewer, :].values)
    for reviewer in reviewers_rated_book
]
user_distances[:5]

[2.0211375665879494e-05,
 3.555496537110958e-05,
 1.6393689702187686e-05,
 2.3400917009808175e-05,
 4.505484467887122e-05]

In [83]:
# Pair each reviewer with their distance and order the pairs from closest to farthest.
reviewer_distance_pairs = list(zip(reviewers_rated_book, user_distances))
reviewer_distance_pairs.sort(key=lambda pair: pair[1])
sorted_reviewers = pd.DataFrame(reviewer_distance_pairs, columns=['reviewerId', 'cos_distance'])
sorted_reviewers.head()

Unnamed: 0,reviewerId,cos_distance
0,A1PKJUAQFGNLSX,3.806073e-09
1,A7SSCDSCM4PXF,7.656516e-09
2,A75W6T9I2S8BA,1.340006e-08
3,A2ECPW4RA7NWMP,1.453352e-08
4,A21NVBFIEQWDSG,2.817721e-08


In [84]:
# All reviews of the chosen book, with each reviewer's cosine distance attached.
chosen_book_reviews = df_sub.loc[df_sub['asin'] == book_chosen]
reviews_example = chosen_book_reviews.merge(sorted_reviewers, on='reviewerId')

In [50]:
# Inspect row count, dtypes and non-null counts of the merged reviews frame.
reviews_example.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 0 to 13
Data columns (total 6 columns):
reviewerId      14 non-null object
asin            14 non-null object
rating          14 non-null int8
summary         14 non-null object
total_votes     14 non-null int32
cos_distance    14 non-null float64
dtypes: float64(1), int32(1), int8(1), object(3)
memory usage: 630.0+ bytes


I chose the 'average' method for ranking total_votes and the 'dense' method for ranking cos_distance. The cosine distance will always be different for each row, so its ranks increase faster than those for total_votes, where ties are more likely. Using 'average' for total_votes and 'dense' for cos_distance should smooth out these differences.

In [85]:
# Rank reviews by helpful votes (most votes = rank 1, ties share their average rank),
# then weight each rank by the review's rating.
vote_position = reviews_example['total_votes'].rank(method='average', ascending=False)
reviews_example['vote_rank'] = vote_position * reviews_example['rating']
reviews_example.sort_values(by='total_votes', ascending=False)

Unnamed: 0,reviewerId,asin,rating,summary,total_votes,cos_distance,vote_rank
171,A2MF2QVSCUI27G,0002007770,5,"""In seventy years, I've never told a blessed s...",1150,5.920482e-05,5.0
66,A39ABKRS1MKFTW,0002007770,5,One of the best books I've read this year...,394,4.134266e-05,10.0
107,A2PR6NXG0PA3KY,0002007770,2,Basically a Question of Taste,258,4.734389e-06,6.0
221,AN3D3M8MJ07BQ,0002007770,5,YOU CAN GO HOME AGAIN,77,4.577832e-04,20.0
0,APBA7Y4SINS0H,0002007770,1,Now I lay me down to sleep,49,2.021138e-05,5.0
...,...,...,...,...,...,...,...
173,A33N05GF31KN8N,0002007770,4,Entertaining and Well Paced,0,4.793384e-06,994.0
215,A2T5O7MHGONT6S,0002007770,5,"A ""must read""",0,8.938374e-06,1242.5
22,AK536TCZST9B,0002007770,5,the life of Jacob,0,7.627394e-05,1242.5
96,A14ASJYL2MJNUF,0002007770,5,Narration is excellent,0,2.837032e-05,1242.5


In [86]:
# Rank reviews by cosine distance using dense ranks, then weight each rank by the rating.
# NOTE(review): with ascending=False the *largest* distance receives rank 1, i.e. the least
# similar reviewer ranks first - confirm this is the intended direction.
cos_position = reviews_example['cos_distance'].rank(method='dense', ascending=False)
reviews_example['cos_rank'] = cos_position * reviews_example['rating']
reviews_example.sort_values(by='cos_distance', ascending=False)

Unnamed: 0,reviewerId,asin,rating,summary,total_votes,cos_distance,vote_rank,cos_rank
189,A1F3PBBSHAA2Y9,0002007770,3,Where's the romance?,3,3.496928e-03,252.0,3.0
239,A3B6FOHAAOL7G3,0002007770,5,"Circus, Circa, 1930",7,1.654271e-03,172.5,10.0
35,AGFNLMUZ09UF1,0002007770,4,Worth reading,1,1.359798e-03,726.0,12.0
115,A1UFV4HALV2OXW,0002007770,5,Wonderful story.,1,1.356055e-03,907.5,20.0
65,AGZK126DNQ2FN,0002007770,5,A story not to be missed,1,1.243148e-03,907.5,25.0
...,...,...,...,...,...,...,...,...
11,A21NVBFIEQWDSG,0002007770,5,A 5-star read in any season,3,2.817721e-08,420.0,1295.0
136,A2ECPW4RA7NWMP,0002007770,4,"Quick, Fun, But a Little Thin",6,1.453352e-08,166.0,1040.0
17,A75W6T9I2S8BA,0002007770,5,The Perfect Gift for Almost Everyone!,18,1.340006e-08,75.0,1305.0
32,A7SSCDSCM4PXF,0002007770,3,I wish I could say,2,7.656516e-09,345.0,786.0


In [87]:
# Totals of the two rating-weighted rank columns, and their difference (the "impact").
vote_rank_total = reviews_example['vote_rank'].sum()
cos_rank_total = reviews_example['cos_rank'].sum()
print(vote_rank_total)
print(cos_rank_total)
print(vote_rank_total - cos_rank_total)

160657.5
156235.0
4422.5


## Calculating the impact of the model

In [94]:
# Load the saved impact scores; column names are supplied here rather than read from the file.
impact_columns = ['reviewerId', 'book', 'impact']
impact_df = pd.read_csv('/Volumes/external/Sangeetha-Project/impact.csv.gz',
                        names=impact_columns)

In [95]:
# Preview the first rows of the impact table.
impact_df.head()

Unnamed: 0,reviewerId,book,impact
0,A2S166WSCFIFP5,0002007770,4422.5
1,A2S166WSCFIFP5,0002051850,376.0
2,A2S166WSCFIFP5,0002219417,3.5
3,A2S166WSCFIFP5,000222383X,1.0
4,A2S166WSCFIFP5,000224053X,30.5


In [99]:
# Row count, dtypes and memory usage of the impact table.
impact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115000 entries, 0 to 114999
Data columns (total 3 columns):
reviewerId    115000 non-null object
book          115000 non-null object
impact        115000 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.6+ MB


In [98]:
# Mean impact over all stored (user, book) pairs.
impact_df.impact.mean()

35.22292608695652

In [100]:
# Smallest impact value (negative when the cosine-based total exceeds the vote-based total).
impact_df.impact.min()

-2965.0

In [101]:
# Largest impact value.
impact_df.impact.max()

49964.0

In [97]:
# Compute the impact (vote-based rank total minus cosine-based rank total) for every
# (user, unread book) pair and collect the results in `impact`.
impact = []
counter = 0
progress_every = 5000  # print the running pair count only this often

# The set of distinct books is the same for every user, so build it once.
all_books = df_sub.asin.unique()

for user_ in df_sub.reviewerId.unique():

    # get all the books the user hasn't read; a set keeps each membership test O(1)
    read_books = set(df_sub.loc[df_sub.reviewerId == user_, 'asin'])
    unread_books = [x for x in all_books if x not in read_books]

    # the user's baseline row is identical for every book and reviewer below, so look it up once
    user_vector = baseline_est.loc[user_, :].values

    # For each book in this list, calculate the impact
    for book_ in unread_books:

        # throttled progress output (printing on every pair floods the notebook)
        if counter % progress_every == 0:
            print(counter)

        # Periodic flush to disk - currently disabled. When enabled, every 5000 rows it
        # appends the accumulated results to the csv and clears the impact list for
        # memory management:
        # if counter == 5000:
        #     section_df = pd.DataFrame(impact)
        #     section_df.to_csv('/Volumes/external/Sangeetha-Project/impact.csv.gz', compression='gzip',
        #                       header=False, mode='a', index=False)
        #     impact = []
        #     counter = 0
        counter += 1

        # get all the reviewers who have reviewed the book
        relevant_reviews = df_sub[df_sub.asin == book_].copy()

        # cosine distance between the user and each reviewer who rated the book
        relevant_reviews['cos_dist'] = [
            scipy.spatial.distance.cosine(user_vector, baseline_est.loc[reviewer_, :].values)
            for reviewer_ in relevant_reviews.reviewerId
        ]

        # Calculate the rank*rating for total votes
        relevant_reviews['vote_rank'] = relevant_reviews['total_votes'].rank(
            method='average', ascending=False)*relevant_reviews['rating']

        # Calculate the rank*rating for cosine_distances
        # NOTE(review): ascending=False gives rank 1 to the largest distance - confirm intended.
        relevant_reviews['cos_rank'] = relevant_reviews['cos_dist'].rank(
            method='dense', ascending=False)*relevant_reviews['rating']

        # add the result (impact) of each user-item pair to the final list
        impact.append((user_, book_, relevant_reviews['vote_rank'].sum(
        ) - relevant_reviews['cos_rank'].sum()))

## Calculating user similarities - don't run this

In [None]:
# Load the saved prediction matrix; the first CSV column becomes the row index.
predictions = pd.read_csv(
    '/Volumes/external/Sangeetha-Project/pred_cosine_sim.csv.gz',
    compression='gzip',
    index_col=0,
)

In [83]:
# Choose the user by row position (the lookup-by-id alternative is kept for reference).
#user = 'A102Z3T7NSM5KC'
#user_val = predictions.index.get_loc(user)
user_val = 78
user = predictions.index[user_val]

# Raw numpy array of the predictions, used for the row-wise distance computations.
mat = predictions.values

This goes through each row in the matrix, subtracts your row of choice from each row of the matrix (elementwise),
and squares each element in each row, for all rows in the matrix. How is this the cosine distance?

In [84]:
#Calculate the distances

distances = []

for row in mat:
    distancess.append(scipy.spatial.distance.cosine(mat[user_val,:], row))
distances

In [85]:
#Find the closest neighbour
closest_n = np.argpartition(distances, 1)[1]
closest_n

1492

In [86]:
# Translate the neighbour's row position back into its reviewer id.
cloest_n_id = predictions.index[closest_n]
cloest_n_id

'A368INKWDUYHMK'

In [88]:
# Distance stored for the closest neighbour.
distances[closest_n]

192.08513160657645

In [89]:
# Sum of squared elementwise differences between the neighbour's row and the user's row, skipping NaNs.
np.nansum((mat[closest_n,:] - mat[user_val,:])**2)

192.085131606613

In [91]:
#To check if the distances have been calculated correctly
# NOTE(review): this compares the stored distance with a sum of squared differences, so it can
# only hold if `distances` was built with that metric rather than cosine distance - confirm.
np.isclose(distances[closest_n], np.nansum((mat[closest_n,:] - mat[user_val,:])**2))

True