In [1]:
%matplotlib inline

from collections import defaultdict
import os
from surprise import Dataset
import os
import pandas as pd 
import sys
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
import seaborn as sns
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
dataset = pd.read_csv('reviews.csv', low_memory=False )
dataset.head()

Unnamed: 0,rating,review_id,user_id,book_id,review
0,4,338670838,7878381,13431841,"""عزازيل الذي صنعناه ،الكامن في أنفسنا"" يذكرني..."
1,4,39428407,1775679,3554772,من أمتع ما قرأت من روايات بلا شك. وحول الشك ت...
2,4,32159373,1304410,3554772,رواية تتخذ من التاريخ ،جوًا لها اختار المؤلف ...
3,1,442326656,11333112,3554772,إني أقدّر هذه الرواية كثيرا، لسبب مختلف عن أس...
4,5,46492258,580165,3554772,الكاهن الذي أطلق على نفسه اسم هيبا تيمنا بالع...


In [3]:
#Create a smaller dataframe with a subset of all features
ratings_df = dataset[[ 'user_id', 'book_id','rating', ]]
#Output only the first 5 rows of small_df
ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,7878381,13431841,4
1,1775679,3554772,4
2,1304410,3554772,4
3,11333112,3554772,1
4,580165,3554772,5


In [4]:
dataset.groupby('user_id')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]

Unnamed: 0,user_id,rating
6273,5890605,138
302,1741426,120
4304,4829290,111
1521,2922700,95
12826,12252292,90
1202,2716355,86
6696,6140307,83
2390,3642565,81
6603,6089735,80
8801,7810481,79


In [5]:
dataset.groupby('book_id')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]

Unnamed: 0,book_id,rating
380,3503947,1216
396,3554772,1046
2027,16031620,956
2045,16081961,761
1555,10706934,737
346,3438000,728
1211,7704143,721
1124,7119070,671
1557,10722079,585
2042,16076787,550


In [6]:
R_df = ratings_df.pivot_table(index = 'user_id', columns ='book_id', values = 'rating').fillna(0)
R_df.head()

book_id,151,152,155,291,320,343,865,968,1420,2517,...,17340924,17346096,17346175,17373702,17376140,17376299,17379499,17403265,17448644,17560408
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
R = R_df.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [8]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)
U.shape, sigma.shape, Vt.shape

((16486, 50), (50,), (50, 2131))

In [9]:
sigma = np.diag(sigma)

In [10]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [11]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
preds_df.head()

book_id,151,152,155,291,320,343,865,968,1420,2517,...,17340924,17346096,17346175,17373702,17376140,17376299,17379499,17403265,17448644,17560408
0,0.000163,0.000583,0.000357,0.000481,7.1e-05,0.001249,0.014924,0.001256,0.000319,0.000492,...,0.000254,0.000263,0.000276,0.000293,0.000255,0.000274,0.000236,0.000257,0.000275,0.000333
1,0.002767,0.002623,0.002902,0.002254,0.002001,0.001767,0.000698,0.002183,0.002683,0.002766,...,0.002929,0.002837,0.002943,0.002916,0.002915,0.002854,0.002895,0.002959,0.002943,0.002902
2,0.002767,0.002623,0.002902,0.002254,0.002,0.001767,0.000698,0.002183,0.002683,0.002766,...,0.002929,0.002837,0.002943,0.002916,0.002916,0.002854,0.002895,0.002959,0.002943,0.002902
3,0.002212,0.002098,0.002318,0.001804,0.001604,0.001419,0.000554,0.001745,0.002144,0.002211,...,0.00234,0.002267,0.002351,0.00233,0.002329,0.00228,0.002313,0.002364,0.002351,0.002319
4,-0.007954,-0.0107,-0.001114,0.015807,-0.011744,0.017615,-0.020394,-0.048031,0.004102,-0.013164,...,-0.001857,-0.005115,-0.002134,-0.001785,-0.002791,-0.00104,-0.005263,-0.002204,-0.002139,-0.000707


In [12]:
books_df = pd.read_csv('books.csv', low_memory=False )
books_df.head()

Unnamed: 0,book_id
0,13431841
1,3554772
2,13608357
3,7784490
4,7829608


In [13]:
def recommend_books(predictions_df, userID, books_df, original_ratings_df, num_recommendations=10):
    
    # Get and sort the user's predictions
    user_row_number = userID -userID  # UserID starts at 1, not 0
    sorted_user_predictions =  preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
    
    # Get the user's data and merge in the book information.
    user_data = original_ratings_df[original_ratings_df.user_id == (userID)]
    user_full = (user_data.merge(books_df, how = 'left', left_on = 'book_id', right_on = 'book_id').
                     sort_values(['rating'], ascending=False) 
                 )

    print ('User {0} has already rated {1} books.'.format(userID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings books not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating books that the user hasn't seen yet.
    recommendations = (books_df[~books_df['book_id'].isin(user_full['book_id'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'book_id',
               right_on = 'book_id').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations-1]
                      )

    return user_full, recommendations

In [14]:
userID=5890605
already_rated, predictions = recommend_books(preds_df, userID, books_df, ratings_df, 10)

User 5890605 has already rated 138 books.
Recommending highest 10 predicted ratings books not already rated.


In [15]:
already_rated.head(10)

Unnamed: 0,user_id,book_id,rating
0,5890605,3438000,5
24,5890605,3127736,5
123,5890605,13562020,5
101,5890605,16282204,5
89,5890605,5954494,5
55,5890605,3018318,5
38,5890605,3614157,5
37,5890605,5954494,5
30,5890605,16163192,5
137,5890605,1499941,5


In [16]:
print( "The Recommendations for user:" ,userID,  "are these book ID's : ")
predictions

The Recommendations for user: 5890605 are these book ID's : 


Unnamed: 0,book_id,Predictions
6,646462,0.019468
314,865,0.014924
224,6494415,0.01226
370,3553395,0.012119
290,2246948,0.011801
484,6136509,0.010674
21,2501458,0.00831
1090,8993964,0.00753
649,2750285,0.007251


In [17]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import pandas as pd 
from surprise import BaselineOnly
from surprise import Dataset

from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import evaluate, print_perf

#Create a smaller dataframe with a subset of all features
ratings = dataset[['user_id','book_id','rating' , ]]
#Output only the first 5 rows of small_df
ratings.head()



Unnamed: 0,user_id,book_id,rating
0,7878381,13431841,4
1,1775679,3554772,4
2,1304410,3554772,4
3,11333112,3554772,1
4,580165,3554772,5


In [18]:
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
data = Dataset.load_from_df(ratings, reader)
#########---------------SVD
print('')
print('---------------SVD result-------------')
data.split(n_folds=5)
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)


---------------SVD result-------------
Evaluating RMSE, MAE of algorithm SVD.





------------
Fold 1
RMSE: 1.0179
MAE:  0.8038
------------
Fold 2
RMSE: 1.0194
MAE:  0.8099
------------
Fold 3
RMSE: 1.0118
MAE:  0.8039
------------
Fold 4
RMSE: 1.0161
MAE:  0.8061
------------
Fold 5
RMSE: 1.0300
MAE:  0.8191
------------
------------
Mean RMSE: 1.0191
Mean MAE : 0.8086
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    1.0179  1.0194  1.0118  1.0161  1.0300  1.0191  
MAE     0.8038  0.8099  0.8039  0.8061  0.8191  0.8086  


In [19]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for user_id, _, true_r, est, _ in predictions:
        user_est_true[user_id].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for user_id, rating in user_est_true.items():

        # Sort user ratings by estimated value
        rating.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in rating)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in rating[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in rating[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [20]:


kf = KFold(n_splits=5)
algo = SVD()
sumP = 0
sumr=0
sumf1_score=0
countF = 0;
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    p=sum(prec for prec in precisions.values()) / len(precisions)
    r=sum(rec for rec in recalls.values()) / len(recalls)
    f1_score = (2*p*r)/ (p + r)
    sumP= sumP+p
    sumr= sumr+r
    sumf1_score= sumf1_score+f1_score
    print('Precision= %f %%'%(p *100))
    print('recall= %f %%'%(r*100))
    print('F1 score=%f %%'%(f1_score*100))
    print('\n')
    countF =countF +1
avgP = (sumP/countF)
avgr = (sumr/countF)
avgf1_score = (sumf1_score/countF)
print('AVG Precision= %f %%'%(avgP *100))
print('AVG recall= %f %%'%(avgr *100))
print('AVG f1_score= %f %%'%(avgf1_score *100))

Precision= 91.686433 %
recall= 63.192273 %
F1 score=74.818214 %


Precision= 91.492454 %
recall= 61.970968 %
F1 score=73.892213 %


Precision= 91.407802 %
recall= 62.669010 %
F1 score=74.358190 %


Precision= 91.150570 %
recall= 63.334754 %
F1 score=74.738478 %


Precision= 91.829057 %
recall= 62.439986 %
F1 score=74.335135 %


AVG Precision= 91.513263 %
AVG recall= 62.721398 %
AVG f1_score= 74.428446 %
