In [1]:
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import mean
from matplotlib import pyplot as plt
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
import sklearn
import numpy as np 
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import CountVectorizer
from scipy.spatial import distance
from pyspark.ml.evaluation import RegressionEvaluator

def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark


from pyspark import SparkFiles
spark = init_spark()
sp = None


# Content Based Book Recommendation

In [2]:
#Load Book and Tag dataset into df
bk = spark.read.csv("book_clean.csv", header=True)
tags = spark.read.csv("book-tag_clean.csv", header=True)
print(bk.count(),tags.count())

9939 111740


In [3]:
bk.show(5)
tags.show(5)

+-------+--------------------+-------------------------+--------------------+-------------+--------------+-------------+---------+---------+---------+---------+---------+
|book_id|             authors|original_publication_year|               title|language_code|average_rating|ratings_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|
+-------+--------------------+-------------------------+--------------------+-------------+--------------+-------------+---------+---------+---------+---------+---------+
|      1|     Suzanne Collins|                     2008|The Hunger Games ...|          eng|          4.34|      4780653|    66715|   127936|   560092|  1481305|  2706317|
|      2|J.K. Rowling, Mar...|                     1997|Harry Potter and ...|          eng|          4.44|      4602479|    75504|   101676|   455024|  1156318|  3011543|
|      3|     Stephenie Meyer|                     2005|Twilight (Twiligh...|        en-US|          3.57|      3866839|   456191|   436802|   79

In [4]:
distinct_authors_df = bk.select('authors').distinct()
distinct_title_df = bk.select('title').distinct()
distinct_lang_df = bk.select('language_code').distinct()
distinct_tag_df = tags.select('tag_id').distinct()

num_authors = distinct_authors_df.count()
num_title = distinct_title_df.count()
num_lang = distinct_lang_df.count()
num_tag = distinct_tag_df.count()

In [5]:
#Categorical features
print("num_authors :",num_authors)
print("num_title :",num_title)
print("num_lang :",num_lang)
print("num_tag :",num_tag)

num_authors : 4617
num_title : 9903
num_lang : 26
num_tag : 763


Load and Split Ratings Dataset

In [6]:
#spliting into Training and Testing dataset 70:30 ratio
ratings = spark.read.csv("ratings.csv", header=True)
ratings = ratings.withColumn("user_id",ratings.user_id.cast('int'))
ratings = ratings.withColumn("rating",ratings.rating.cast('int'))
ratings = ratings.withColumn("book_id",ratings.book_id.cast('int'))

train,test = ratings.randomSplit([0.7, 0.3],0)
print(train.count(),test.count())
train.show(5)

4182526 1793953
+-------+-------+------+
|user_id|book_id|rating|
+-------+-------+------+
|      1|     10|     4|
|      1|     11|     5|
|      1|     13|     4|
|      1|     22|     3|
|      1|     31|     4|
+-------+-------+------+
only showing top 5 rows



In [7]:
train.select('book_id').distinct().count()

10000

In [8]:
test.select('book_id').distinct().count()

##since all the books are present in both the ratings of the testing and training set,The normalization of the 
#book hotVector will be the same and therefore not necessary to be done seperatly. 
#(e.g the max and min of book year in both dataset will be the same since the book is present in both) 

10000

# Hot Vector Encoding(Item Profile)

Merging the book tags from tag dataset to book dataframe

In [9]:
#format tags to get list of categories of each book
rdd1 = tags.rdd
rdd1 = rdd1.map(lambda x: (x['goodreads_book_id'],x['tag_id'])).groupByKey()
tags_ = rdd1.toDF(["book_id","tag_list"])
tags_.show()

+-------+--------------------+
|book_id|            tag_list|
+-------+--------------------+
|      2|[[14064, 8717, 30...|
|   6185|[[14064, 5775, 87...|
|  17245|[[14064, 8717, 30...|
|  30183|[[14064, 8717, 30...|
|  99561|[[14064, 8717, 30...|
| 113436|[[14064, 8717, 30...|
|1656001|[[14064, 8717, 30...|
|6304335|[[14064, 8717, 30...|
|   8852|[[5775, 8717, 305...|
|      1|[[8717, 30574, 11...|
|     13|[[8717, 30574, 11...|
|     24|[[8717, 30574, 11...|
|     33|[[8717, 30574, 11...|
|    275|[[8717, 30574, 11...|
|    304|[[8717, 30574, 11...|
|    359|[[8717, 30574, 11...|
|    446|[[8717, 30574, 11...|
|    447|[[8717, 30574, 11...|
|    621|[[8717, 30574, 21...|
|    656|[[8717, 30574, 11...|
+-------+--------------------+
only showing top 20 rows



In [10]:
#join tag/category list to book dataframe
bk = bk.join(tags_, ['book_id'], 'left_outer')

In [11]:
bk.printSchema()
bk = bk.withColumn("tag_list",bk.tag_list['data'])

root
 |-- book_id: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- ratings_1: string (nullable = true)
 |-- ratings_2: string (nullable = true)
 |-- ratings_3: string (nullable = true)
 |-- ratings_4: string (nullable = true)
 |-- ratings_5: string (nullable = true)
 |-- tag_list: struct (nullable = true)
 |    |-- data: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- index: long (nullable = true)
 |    |-- maxindex: long (nullable = true)



In [12]:
#replace null list of tags with empty list
import pyspark.sql.functions as F
bk = bk.withColumn("tag_list",when(bk.tag_list.isNull(),F.array([])).otherwise(bk.tag_list))

In [13]:
bk.select('book_id','tag_list').orderBy('book_id').show()
bk.printSchema()

+-------+--------------------+
|book_id|            tag_list|
+-------+--------------------+
|      1|[8717, 30574, 112...|
|     10|[8717, 30574, 115...|
|    100|                  []|
|   1000|                  []|
|  10000|                  []|
|   1001|                  []|
|   1002|                  []|
|   1003|                  []|
|   1004|                  []|
|   1005|[8717, 30574, 270...|
|   1006|                  []|
|   1007|                  []|
|   1008|                  []|
|   1009|                  []|
|    101|                  []|
|   1010|                  []|
|   1011|                  []|
|   1012|                  []|
|   1013|                  []|
|   1014|                  []|
+-------+--------------------+
only showing top 20 rows

root
 |-- book_id: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- ave

Non-categorical features

In [14]:
#cast non-categorical feature columns to int/float
bk = bk.withColumn("original_publication_year",bk.original_publication_year.cast('int'))
bk = bk.withColumn("ratings_count",bk.ratings_count.cast('int'))
bk = bk.withColumn("average_rating",bk.ratings_count.cast('float'))
bk = bk.withColumn("ratings_1",bk.ratings_1.cast('int'))
bk = bk.withColumn("ratings_2",bk.ratings_2.cast('int'))
bk = bk.withColumn("ratings_3",bk.ratings_3.cast('int'))
bk = bk.withColumn("ratings_4",bk.ratings_4.cast('int'))
bk = bk.withColumn("ratings_5",bk.ratings_5.cast('int'))

In [15]:
#assemble non-categorical features as vector


assembler = VectorAssembler(
    inputCols=["original_publication_year", "ratings_count","average_rating","ratings_1","ratings_2","ratings_3","ratings_4","ratings_5"],
    outputCol="non-categorical")
bk = assembler.transform(bk)
#bk.show(10)


In [16]:
bk.select("non-categorical").show(5)

+--------------------+
|     non-categorical|
+--------------------+
|[2000.0,75469.0,7...|
|[1994.0,80056.0,8...|
|[2004.0,69007.0,6...|
|[1853.0,39758.0,3...|
|[1991.0,67753.0,6...|
+--------------------+
only showing top 5 rows



Categorical Features

In [17]:
# map categorical features to numeric(vector index) 

indexer = StringIndexer(inputCol='authors', outputCol='authors_ind')
bk=indexer.fit(bk).transform(bk)

indexer = StringIndexer(inputCol= 'title', outputCol='title_ind')
bk=indexer.fit(bk).transform(bk)


indexer = StringIndexer(inputCol='language_code', outputCol='language-code_ind')
bk=indexer.fit(bk).transform(bk)



In [18]:
bk.select("authors_ind","title_ind","language-code_ind").show(5)

+-----------+---------+-----------------+
|authors_ind|title_ind|language-code_ind|
+-----------+---------+-----------------+
|     3443.0|   3177.0|              0.0|
|     4445.0|   5961.0|              3.0|
|       70.0|   5653.0|              0.0|
|     4181.0|   9237.0|              0.0|
|     3651.0|   2043.0|              0.0|
+-----------+---------+-----------------+
only showing top 5 rows



In [19]:
# encode categorical features index to hot vectors
encoder = OneHotEncoder(inputCol='authors_ind', outputCol = 'authors_vec')
bk = encoder.fit(bk).transform(bk)


colorVectorizer = CountVectorizer(inputCol="tag_list", outputCol="tag_vec", vocabSize=763, minDF=1.0)
colorVectorizer_model = colorVectorizer.fit(bk)
bk = colorVectorizer_model.transform(bk)

encoder = OneHotEncoder(inputCol='title_ind', outputCol = 'title_vec')
bk = encoder.fit(bk).transform(bk)

encoder = OneHotEncoder(inputCol='language-code_ind', outputCol = 'language-code_vec')
bk = encoder.fit(bk).transform(bk)



In [20]:
bk.filter(bk.tag_list != F.array([]) ).select('authors_vec','tag_vec').show()

+-------------------+--------------------+
|        authors_vec|             tag_vec|
+-------------------+--------------------+
|(4616,[1375],[1.0])|(537,[0,2,3,8,10,...|
|(4616,[3772],[1.0])|(537,[1,2,62],[1....|
| (4616,[148],[1.0])|(537,[0,1,9,57],[...|
|(4616,[2235],[1.0])|(537,[0,1,9,16,12...|
|   (4616,[3],[1.0])|(537,[0,1,9,132,2...|
|(4616,[4131],[1.0])|(537,[0,1,2,3,4,5...|
|(4616,[4553],[1.0])|(537,[0,1,2,3,4,5...|
|(4616,[1507],[1.0])|(537,[0,1],[1.0,1...|
| (4616,[335],[1.0])|(537,[0,1,2,3,4,5...|
|(4616,[2305],[1.0])|(537,[0,2,3,12],[...|
|(4616,[3978],[1.0])|(537,[0,3,7,12,13...|
| (4616,[165],[1.0])|(537,[0,1,2,3,4,5...|
| (4616,[434],[1.0])|(537,[0,2],[1.0,1...|
| (4616,[967],[1.0])|(537,[0,1,2,3,4,5...|
|(4616,[3773],[1.0])|(537,[0,3,4,6,11,...|
|(4616,[2111],[1.0])|(537,[0,2,3,210,2...|
|(4616,[1420],[1.0])|(537,[0,1,2,3,4,5...|
| (4616,[273],[1.0])|(537,[0,1,2],[1.0...|
|(4616,[1481],[1.0])|(537,[0,1,2,3,4,5...|
|(4616,[3554],[1.0])|(537,[0,1,2,3,243...|
+----------

Assemble all vectors

In [21]:
# Assemble all the vectors as 1 Hot-Vector : 15063 columns of features
assembler = VectorAssembler(
    inputCols=["non-categorical","authors_vec","title_vec","tag_vec"],outputCol="hotVector")
bk = assembler.transform(bk)

In [22]:
bk.select('hotVector').show()

+--------------------+
|           hotVector|
+--------------------+
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
|(15063,[0,1,2,3,4...|
+--------------------+
only showing top 20 rows



Normalization

In [23]:
#Normalize the non-categorical features to scale 0-1
from pyspark.ml.feature import MinMaxScaler

# MinMaxScaler Transformation
scaler = MinMaxScaler(inputCol="hotVector", outputCol="hotVector_scaled")
scalerModel =  scaler.fit(bk.select("hotVector"))
bk = scalerModel.transform(bk)

In [24]:
#1 row hot-vector example
bk.where('book_id=2').select('hotVector_scaled').collect()

[Row(hotVector_scaled=SparseVector(15063, {0: 0.9777, 1: 0.9627, 2: 0.9627, 3: 0.1655, 4: 0.2327, 5: 0.5734, 6: 0.7805, 7: 1.0, 205: 1.0, 7598: 1.0, 14526: 1.0, 14527: 1.0, 14528: 1.0, 14529: 1.0, 14530: 0.5, 14531: 1.0, 14533: 1.0, 14536: 1.0, 14537: 1.0, 14539: 1.0, 14540: 1.0, 14541: 1.0, 14544: 1.0, 14545: 1.0, 14548: 1.0, 14550: 1.0, 14551: 1.0, 14555: 1.0, 14556: 1.0, 14561: 1.0, 14565: 1.0, 14566: 1.0, 14568: 1.0, 14577: 1.0, 14578: 1.0, 14579: 1.0, 14582: 1.0, 14585: 1.0, 14586: 1.0, 14594: 1.0, 14602: 1.0, 14603: 1.0, 14604: 1.0, 14612: 1.0, 14621: 1.0, 14622: 1.0, 14626: 1.0, 14628: 1.0, 14638: 1.0, 14639: 1.0, 14640: 1.0, 14643: 1.0, 14648: 1.0, 14650: 1.0, 14651: 1.0, 14656: 1.0, 14661: 1.0, 14664: 1.0, 14667: 1.0, 14670: 1.0, 14674: 1.0, 14675: 1.0, 14677: 1.0, 14679: 1.0, 14691: 1.0, 14694: 1.0, 14695: 1.0, 14697: 1.0, 14700: 1.0, 14703: 1.0, 14716: 1.0, 14727: 1.0, 14728: 1.0, 14731: 1.0, 14732: 1.0, 14733: 1.0, 14755: 1.0, 14757: 1.0, 14759: 1.0, 14770: 1.0, 14778: 1.0,

In [25]:
#denseVector to vector(array)
from pyspark.ml.functions import vector_to_array
bk_arr = bk.withColumn("hot-vector",vector_to_array(bk.hotVector_scaled))
bk_arr = bk_arr.select('book_id','hot-vector')
bk_arr.show(5)

+-------+--------------------+
|book_id|          hot-vector|
+-------+--------------------+
|   1090|[0.98104793756967...|
|   1159|[0.97435897435897...|
|   1436|[0.98550724637681...|
|   1512|[0.81716833890746...|
|   1572|[0.97101449275362...|
+-------+--------------------+
only showing top 5 rows



In [26]:
bk_arr.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- hot-vector: array (nullable = false)
 |    |-- element: double (containsNull = false)



# Recommenation using Cosine Distance

Functions for recommending books and predicting ratings

In [27]:
#function to build a user profile with user_id as input

#User profile : Aggregate of the profile of books that was rated 4 or 5 by the user from the training set 

def get_user_profile(user_id,ratings,books):
        
        #get high ratings of the user(4 and 5)
        ratings = ratings.filter((ratings.user_id==user_id) & ((ratings.rating== 4) | (ratings.rating==5)))
        items_count = ratings.count()
        
        if items_count > 2:
            
            #get the books that was read and highly rated by the user
            items = books.join(ratings,['book_id'],'inner')
            count = items.count()
            
            #convert to rdd for easy computations 
            rdd_HV = items.select('hot-vector').rdd
            rdd_HV = rdd_HV.map(lambda x: x['hot-vector'])
            arr = np.array(rdd_HV.collect())
            
            #aggregate the highly rated book profile to get user profile
            print(arr.shape)
            return np.add.reduce(arr)/count
        else:
            print("Not enough training sample to build user profile")
            return []
        
#function that return sorted cosine distances of unread books by a user
#cosine distance of user profile and book profiles
def get_cosine_distance(user_id,ratings,bk):
    
    #get the user profile
    print("Getting User Profile..")
    user_profile = get_user_profile(user_id,train,bk)
    if len(user_profile)==0:
        return None
    print("User Profile:",user_profile)
    
    #get books that has not been rated by the user
    print("Getting Book Not Read By User..")
    books = ratings.filter(ratings.user_id == user_id).select('book_id').withColumn('read',lit('True'))
    books = bk.join(books,['book_id'],'left_outer')
    not_read = books.filter(books.read.isNull()).select('book_id','hot-vector').rdd
    print("Book Not Read: ",not_read.count())
    
    #compute the cosine distance of the book profiles and the user profile
    print("Computing Cosine Distances..")
    not_read = not_read.map(lambda x: (x['book_id'],x['hot-vector']))
    not_read = not_read.map(lambda x: (x[0],np.array(x[1])))
    cosine_distance = not_read.map(lambda x: (distance.cosine(x[1] , user_profile ),x[0]))
    
    #sort by cosine distance( smaller cosine distance: better)
    cosine_distance = cosine_distance.sortByKey()
    print("Complete")
    
    return cosine_distance


#function to get top n recommended books based on cosine distance
#recommend top 10 book that has a lower cosine distance to the user's profile
def recommend_top_book(user_id,top=10):
    
    cos_d = get_cosine_distance(user_id,train,bk_arr)    
    if cos_d == None:
        print("Prediction not possible")
        return None
    
    cos_d = cos_d.map(lambda x: (float(x[0]), int(x[1])))
    cos_d = cos_d.toDF(["cosine-distance","book_id"])
    cos_d = cos_d.limit(top)
    cos_d.show(10)
    
    return cos_d

#function that get top items based on rating prediction (cosine similarity scaled to 1-5)
# cosine similarity = 1-cosine distance (range 0->1)
def predict_ratings(user_id,ratings,bk):
    
    #get the user profile
    print("Getting User Profile..")
    user_profile = get_user_profile(user_id,train,bk)
    print("User Profile:",user_profile)
    
    #get books that has not been rated by the user
    print("Getting Book Not Read By User..")
    books = ratings.filter(ratings.user_id == user_id).select('book_id').withColumn('read',lit('True'))
    books = bk.join(books,['book_id'],'left_outer')
    not_read = books.filter(books.read.isNull()).select('book_id','hot-vector').rdd
    print("Book Not Read: ",not_read.count())
    
    #compute the cosine similarity of the book profiles and the user profile
    print("Computing Cosine Similarity..")
    not_read = not_read.map(lambda x: (x['book_id'],x['hot-vector']))
    not_read = not_read.map(lambda x: (x[0],np.array(x[1])))
    similarity = not_read.map(lambda x: (1 - distance.cosine(x[1] , user_profile ),x[0]))
    
    #cosine similarity range: 0-1 -> scaled to range of prediction 1-5 
    #scale to ratings 1-5
    #new_value = ( (old_value - old_min) / (old_max - old_min) ) * (new_max - new_min) + new_min
    print("Converting to rating..")
    rat = similarity.map(lambda x: ( (((x[0] - 0) / (1 - 0)) * (5 - 1) + 1),x[1]))
    
    #Convert back to spark df
    rat = rat.map(lambda x: (float(x[0]), int(x[1])))
    rat = rat.toDF(["prediction","book_id"])
    rat = rat.orderBy('prediction',ascending=False)   
    print("Complete")
    
    rat.show(10)
    return rat

#-----------Evaluation Functions

#function that calculates top k recommendation's precision for a list of user 
def evaluate_recommend_top_book(fraction=0.1,seed=0,top=10):
    
    print("getting random subset for evaluation")
    #getting evaluation sub-set
    t1 = train.groupBy('user_id').count().where('count>100')
    t2 = test.groupBy('user_id').count().where('count>60')
    
    #only include user with >100 book read in training set and >60 book read in testing set
    #sub sample to get a smaller subset based on fraction (too much to compute)
    t1 = t1.join(t2,['user_id'],'inner').sample(withReplacement=False,fraction=fraction,seed=seed)
    
    
    test_list_ = t1.select('user_id').collect()
    test_list= []
    for usr in test_list_:
        test_list.append(usr['user_id'])

    print("test user list:", test_list)
    
    t = ['28343']
    
    #true positive
    tp = 0
    #false positive
    fp = 0
    
    #temporarily testing for only 1 user since the computation time is too long
    for user in t:#test_list:
        #get sorted cosine distance
        cos_d = get_cosine_distance(user,train,bk_arr)
        
        #get top k (default=10)
        cos_d = cos_d.map(lambda x: (float(x[0]), int(x[1])))
        cos_d = cos_d.toDF(["cosine-distance","book_id"])
        cos_d = cos_d.limit(top)
        
        #check number of book in top k that match test set(rating>3) 
        print("Calculating Precision..")
        match = test.filter((test.user_id==user) & (test.rating >= 3)).join(cos_d,['book_id'],'inner').count()
        tp += match
        fp += top - match
        
    #calculate total presision
    precision = tp/(tp+fp)
    print("precision:",precision)
    return precision


In [28]:
#recommending top 10 book for a user of id '28343'
r = recommend_top_book('28343')

Getting User Profile..
(63, 15063)
User Profile: [0.98198581 0.08147422 0.08147422 ... 0.         0.         0.        ]
Getting Book Not Read By User..
Book Not Read:  9831
Computing Cosine Distances..
Complete
+-------------------+-------+
|    cosine-distance|book_id|
+-------------------+-------+
|  0.372320187762202|   5373|
|0.37946318827467695|   4415|
| 0.4246526870308782|    336|
| 0.4250362047136187|    242|
| 0.4254430141234804|    266|
| 0.4287743826194933|   2282|
| 0.4287780681652572|   2974|
| 0.4288884249986521|   5891|
|0.42912548320166477|   1535|
| 0.4291912343345733|    674|
+-------------------+-------+



In [33]:
#check how many of the recommendation was read (in test set)
match = r.join(test.where('user_id=28343'),['book_id'],'inner')
match.show()

+-------+-------------------+-------+------+
|book_id|    cosine-distance|user_id|rating|
+-------+-------------------+-------+------+
|    266| 0.4254430141234804|  28343|     4|
|   1535|0.42912548320166477|  28343|     3|
|   2974| 0.4287780681652572|  28343|     4|
+-------+-------------------+-------+------+



In [29]:
#Example rating prediction for user 28343

u1 = '28343'
rat = predict_ratings(u1,train,bk_arr)

Getting User Profile..
(63, 15063)
User Profile: [0.98198581 0.08147422 0.08147422 ... 0.         0.         0.        ]
Getting Book Not Read By User..
Book Not Read:  9831
Computing Cosine Similarity..
Converting to rating..
Complete
+------------------+-------+
|        prediction|book_id|
+------------------+-------+
| 3.510719248951192|   5373|
| 3.482147246901292|   4415|
| 3.301389251876487|    336|
|3.2998551811455252|    242|
|3.2982279435060784|    266|
| 3.284902469522027|   2282|
| 3.284887727338971|   2974|
|3.2844463000053916|   5891|
| 3.283498067193341|   1535|
| 3.283235062661707|    674|
+------------------+-------+
only showing top 10 rows



In [30]:
#Evaluate rating prediction for user 28343

evaluation = rat.join(test.where('user_id='+u1),['book_id'],'inner')
evaluation.show()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(evaluation)
print("rmse:",rmse)

+-------+------------------+-------+------+
|book_id|        prediction|user_id|rating|
+-------+------------------+-------+------+
|    148| 3.096848711656899|  28343|     3|
|   2488| 3.050374516674024|  28343|     3|
|    133|2.9497537168965007|  28343|     4|
|   1871|3.0199833163907366|  28343|     4|
|     12| 3.141642464346574|  28343|     4|
|   8067| 3.057451790816606|  28343|     4|
|    122|2.6874890249750503|  28343|     3|
|   5025| 3.082004342837952|  28343|     4|
|    232| 3.053569513730684|  28343|     2|
|   1425|3.0584953302345954|  28343|     4|
|    259|3.0955098194704944|  28343|     3|
|     52|3.1501085046167216|  28343|     4|
|    182|3.1395863782494393|  28343|     4|
|    280|3.1265939600025243|  28343|     4|
|     20|3.1418766715950217|  28343|     3|
|     57|3.1273075642940817|  28343|     5|
|   1032| 2.779275134337932|  28343|     4|
|    266|3.2982279435060784|  28343|     4|
|    415|2.5252362217527615|  28343|     3|
|    227|3.2024416133593085|  28

In [31]:
#top 10 precision for user u1 recommendation 
evaluate_recommend_top_book()

getting random subset for evaluation
test user list: [32923, 33207, 18100, 38082, 49297, 32745, 23291, 3662, 12417, 5267, 22496, 9246, 47476, 35111, 35146, 49295, 27127, 52036, 44019, 12771]
Getting User Profile..
(63, 15063)
User Profile: [0.98198581 0.08147422 0.08147422 ... 0.         0.         0.        ]
Getting Book Not Read By User..
Book Not Read:  9831
Computing Cosine Distances..
Complete
Calculating Precision..
precision: 0.3


0.3