In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from math import sqrt
import csv



In [2]:
############ Popularity based #########################

In [3]:
def popularity(meta,ratings):
    avg = ratings.groupby('book_id')[['rating']].mean()
    count_rating = ratings.groupby('book_id')[['rating']].count()

    avg[avg['rating'].notnull()]['rating'] = avg[avg['rating'].notnull()]['rating'].astype('float')
    vote_averages= avg[avg['rating'].notnull()]['rating'] 
    C = vote_averages.mean()

    count_rating[count_rating['rating'].notnull()]['rating'] = count_rating[count_rating['rating'].notnull()]['rating'].astype('float')
    vote_counts = count_rating[count_rating['rating'].notnull()]['rating']
    m = vote_counts.quantile(0.9)

    meta['ratings_count'] = count_rating['rating']
    meta['average_rating'] = avg['rating']

    content = meta[(meta['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    content['ratings_count'] = content['ratings_count'].astype('float')

    content['average_rating'] = content['average_rating'].astype('float')

    content.shape
    #print("v rating_count : " + str(ratings_count) + "R average_rating : " + str(average_rating)+"m" + str(m)+"C" + str(C))
    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)

    content['popularity_rating'] = content.apply(weighted_rating, axis=1)
    p = content[['book_id','popularity_rating']]
    #print(p)
    #print(pop.shape)
    print("popularity _done")
    return p

In [4]:
######################### COllaborative ############################## 

In [5]:
def collaborative(ratings,user_id):

    reader = Reader()
    #ratings.head()

    new_ratings = ratings
    data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)
    #data.split(n_folds=5)

    ## Training the data ##
    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=2)

    trainset = data.build_full_trainset()

    algo = SVD()
    algo.fit(trainset)

    #svd.train(trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    count = 0
     
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count+1
            new_ratings.loc[len(new_ratings)+1]= [uid,iid,est]
            #print("book_sim " + str(new_ratings))

    cb = new_ratings[(new_ratings['user_id'] == user_id)][['book_id', 'rating']]
    cb.columns=['book_id', 'collab_rating']
    #print(cb)
    print("Collab_done")
    return(cb)

In [6]:
################## content based #########################

In [7]:
def content_based(meta,ratings,user_id):       
    meta['book_id'] = meta['book_id'].astype('int')
    meta['authors'] = meta['authors'].str.replace(' ','')
    meta['authors'] = meta['authors'].str.lower()
    meta['authors'] = meta['authors'].str.replace(',',' ')
    ratings['book_id'] = ratings['book_id'].astype('int')
    ratings['user_id'] = ratings['user_id'].astype('int')
    ratings['rating'] = ratings['rating'].astype('int')


    #print(md.head())

    meta['authors'] = meta['authors'].apply(lambda x: [x,x])
    #print(md['authors'])
    meta['Genres']=meta['Genres'].str.split(';')
    #print(md['Genres'])

    meta['keywords'] = meta['authors'] + meta['Genres']
    #print(md['keywords'])
    meta['keywords'] = meta['keywords'].str.join(' ')

    tfidf = TfidfVectorizer(stop_words='english')

    tfidf_matrix = tfidf.fit_transform(meta['keywords'])
    tfidf_matrix.shape
    #print(tfidf_matrix)


    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    def build_user_profiles():
        
        user_profiles=np.zeros((60001,999))
        #taking only the first 100000 ratings to build user_profile
        length = len(ratings)
        #for i in range(0,79999):
        for i in range(0,length):
            u=ratings.iloc[i]['user_id']
            u = round(u)
            b=ratings.iloc[i]['book_id']
            b = round(b)
            user_profiles[u][b-1]=ratings.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()

    def get_similar_items_to_user_profile(person_id):
            

        user_ratings = np.empty((999,1))
        cnt=0
        
        for i in range(0,998):
            book_sim=cosine_sim[i]
            #print("book_sim " + str(book_sim))
            user_sim=user_profiles[person_id]
            #print("user_sim " + str(user_sim))
            user_ratings[i]=(book_sim.dot(user_sim))/sum(cosine_sim[i])
            #print("user_ratings " + str(user_ratings))
        maxval = max(user_ratings)
        #print(maxval)

        for i in range(0,998):
            user_ratings[i]=((user_ratings[i]*5.0)/(maxval))
            if(user_ratings[i]>3):
                cnt+=1
        #print(cnt)
        return user_ratings

    content_ratings = get_similar_items_to_user_profile(user_id)
    #print(content_ratings)
    #content_ratings = np.nan_to_num(content_ratings,copy = True,nan=0.0,posinf = None,neginf = None)

    book = meta[['book_id']]
    book1 = pd.DataFrame(data=content_ratings[0:,0:])
    frames = [book, book1]


    #content_rating = pd.concat(frames, axis =1,join_axes=[num.index])
    content_rating = pd.concat(frames, axis=1)
    content_rating = content_rating.reindex(book.index)
    content_rating.columns=['book_id', 'content_rating']
    
    
    #print(content_rating)
    print("content_done")
    return(content_rating)

In [8]:
################## Hybrid system #######################

In [63]:
def hybrid(user_id,ratings):
    meta = pd.read_csv('CustomData/FinalData.csv')
    rat = ratings
    rat['rating'] = rat['rating'].astype('int')
    print(ratings[(ratings['user_id'] == user_id)][['user_id','book_id', 'rating']])
    popularity_rating = popularity(meta,ratings)
    collaborative_rating = collaborative(ratings,user_id)
    #print(collaborative_rating)
    content_rating = content_based(meta,ratings,user_id)
     
    
    
    
        
    hyb = meta[['book_id']]
    title = meta[['book_id','title', 'Genres']]

    hyb = hyb.merge(title,on = 'book_id')
    hyb = hyb.merge(collaborative_rating,on = 'book_id')
    hyb = hyb.merge(popularity_rating, on='book_id')
    hyb = hyb.merge(content_rating, on='book_id')

    def weighted_rating(x):
        collab = x['collab_rating']
        pop = x['popularity_rating']
        content = x['content_rating']
        a = 0.4
        res = (1-2*a)*content + a*collab + a*pop
        #res = a*c + a*v + a*R
        res = "{:.4f}".format(res)
        return res

    hyb['hyb_rating'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('hyb_rating', ascending=False).head(999)
    hyb.columns = ['book_id' , 'title', 'genres', 'collaborative_rating', 'popularity_rating' , 'content_rating', 'hybrid_rating']
    
    #    y_true = rat[(rat['user_id'] == user_id)][['user_id','book_id', 'rating']]
    y_pred = hyb[['book_id','hybrid_rating']]

    
    #print(hyb)
    return y_pred
#    return hyb

In [64]:
# x_train = pd.read_csv('x_train.csv')
# y_test = pd.read_csv('x_test.csv')
# ratings = pd.read_csv('CustomData/ratings.csv')
# ur = x_train[['user_id']]
# ur = np.array(ur)
# ur = list(np.unique(ur))
# #print(ur)

# # for u in ur:
# #     #print(ur)
# #     h = hybrid(u,x_train)

# h = hybrid(1,x_train)


In [None]:
y_test = pd.read_csv('x_test.csv')
#rat = pd.read_csv('x_test.csv')
#ratings = pd.read_csv('CustomData/ratings.csv')
ur = y_test[['user_id']]
ur = np.array(ur)
ur = list(np.unique(ur))
#print(ur)

for u in ur:
    #print(ur)
    h = hybrid(u,y_test)
    rat = pd.read_csv('x_test.csv')
    h = hybrid(u,y_test)
    rat = rat[(rat['user_id'] == u)][['user_id','book_id', 'rating']]

    e = rat.merge(h,on = 'book_id')
    e.to_csv(r'C:\Users\sabri\Desktop\hybrid_recommander\prediction_list.csv', header=None, index=None, sep=',', mode='a+')
    #p = np.array(e[['hybrid_rating']])
    #r = np.array(e[['rating']])
    #err = sqrt(mean_squared_error(r, p))
    #print(err)

    #ratings.head(4)
    #e.head()
    
    
    
    
    
    
    
    
    
#     rat = rat[(rat['user_id'] == u)][['user_id','book_id', 'rating']]
#     e = rat.merge(h,on = 'book_id')
#     e.to_csv(r'C:\Users\sabri\Desktop\final hybrid\our\1.30am recom\prediction_list.csv', header=None, index=None, sep=',', mode='a+')
#     p = np.array(e[['hybrid_rating']])
#     r = np.array(e[['rating']])
#     err = sqrt(mean_squared_error(r, p))
#     print(err)
#     file = open("all_user_RMSE.txt", "a+")

    
#     file.write(str(err))
#     file.write("\n")

#     file.close()




    
    
#u=2   
#rat = pd.read_csv('x_test.csv')
#h = hybrid(u,y_test)
# rat = rat[(rat['user_id'] == u)][['user_id','book_id', 'rating']]

# e = rat.merge(h,on = 'book_id')
# e.head()
 
#h.head()



#p = np.array(e[['hybrid_rating']])
#r = np.array(e[['rating']])
# err = sqrt(mean_squared_error(r, p))
# print(err)

#ratings.head(4)
#print(p)
#print(r)
#e.head()

       user_id  book_id  rating
16038        1      867       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
16038        1      867       3
20001        1       63       3
20002        1      353       3
20003        1       38       3
20004        1      343       3
...        ...      ...     ...
20677        1      564       3
20678        1      746       3
20679        1      466       3
20680        1      549       3
20681        1       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
      user_id  book_id  rating
2905        2      260       5
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2905         2      260       5
20682        2       63       3
20683        2      353       3
20684        2       38       3
20685        2      343       3
...        ...      ...     ...
21358        2      564       3
21359        2      746       3
21360        2      466       3
21361        2      

Collab_done
content_done
       user_id  book_id  rating
12332       17      123       4
17129       17      628       4
18220       17      570       5
18871       17       84       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
12332       17      123       4
17129       17      628       4
18220       17      570       5
18871       17       84       3
26725       17       63       3
...        ...      ...     ...
27398       17      564       3
27399       17      746       3
27400       17      466       3
27401       17      549       4
27402       17       56       4

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
1776        18      572       4
4053        18       46       2
5511        18      557       4
11078       18      297       4
11902       18       14       4
16783       18      913       5
16834       18      883       3
19148       18       13       4
19462       18      476       3
195

Collab_done
content_done
       user_id  book_id  rating
232         34      740       4
506         34       62       4
4375        34      391       5
6698        34       10       4
11628       34      172       4
15326       34      371       3
17499       34      408       2
popularity _done
Collab_done
content_done
       user_id  book_id  rating
232         34      740       4
506         34       62       4
4375        34      391       5
6698        34       10       4
11628       34      172       4
...        ...      ...     ...
34821       34      564       3
34822       34      746       3
34823       34      466       3
34824       34      549       3
34825       34       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
5522        36      272       4
5603        36      828       4
6316        36      184       4
6479        36       89       2
7469        36       75       4
11838       36      168       4
121

Collab_done
content_done
       user_id  book_id  rating
4279        58      148       4
5664        58       43       3
8507        58      177       5
8746        58      189       4
10388       58       58       4
13632       58       27       4
popularity _done
Collab_done
content_done
       user_id  book_id  rating
4279        58      148       4
5664        58       43       3
8507        58      177       5
8746        58      189       4
10388       58       58       4
...        ...      ...     ...
42940       58      564       3
42941       58      746       3
42942       58      466       3
42943       58      549       3
42944       58       56       4

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2871        59       66       4
8694        59       43       3
13003       59      489       1
15350       59       33       4
19765       59      629       3
popularity _done
Collab_done
content_done
       user_id  book_id  

Collab_done
content_done
       user_id  book_id  rating
1526        73      618       4
3594        73      820       5
5059        73      468       4
6838        73       13       5
7985        73      171       5
...        ...      ...     ...
50374       73      564       3
50375       73      746       3
50376       73      466       3
50377       73      549       4
50378       73       56       4

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
92          74      125       2
10989       74      582       5
popularity _done
Collab_done
content_done
       user_id  book_id  rating
92          74      125       2
10989       74      582       5
50379       74       63       3
50380       74      353       3
50381       74       38       3
...        ...      ...     ...
51054       74      564       3
51055       74      746       3
51056       74      466       3
51057       74      549       3
51058       74       56       3

[6

Collab_done
content_done
      user_id  book_id  rating
2743       86      590       3
5400       86       65       4
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2743        86      590       3
5400        86       65       4
57119       86       63       3
57120       86      353       3
57121       86       38       4
...        ...      ...     ...
57794       86      564       3
57795       86      746       3
57796       86      466       3
57797       86      549       3
57798       86       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2428        89        8       4
5069        89      323       4
5103        89        4       4
10525       89      194       5
11133       89      574       3
11601       89       21       3
12326       89       45       4
13466       89      468       5
13767       89       35       3
14315       89      353       4
15069       89       38       3
16381 

Collab_done
content_done
       user_id  book_id  rating
5220       110      618       3
5455       110       25       5
9424       110       65       4
10632      110      213       5
12646      110      614       4
14637      110      653       4
18326      110       85       4
19022      110       13       4
popularity _done
Collab_done
content_done
       user_id  book_id  rating
5220       110      618       3
5455       110       25       5
9424       110       65       4
10632      110      213       5
12646      110      614       4
...        ...      ...     ...
65904      110      564       3
65905      110      746       3
65906      110      466       3
65907      110      549       3
65908      110       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2810       112        7       4
4861       112      688       4
5350       112        2       4
6745       112        8       3
7852       112       57       5
955

Collab_done
content_done
       user_id  book_id  rating
3497       125      160       3
5308       125      763       4
7534       125        2       4
7798       125       80       5
9809       125      972       3
11920      125      916       3
14528      125       24       5
14749      125      495       4
17145      125       21       4
19231      125       32       3
19580      125      715       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
3497       125      160       3
5308       125      763       4
7534       125        2       4
7798       125       80       5
9809       125      972       3
...        ...      ...     ...
73323      125      564       3
73324      125      746       3
73325      125      466       3
73326      125      549       3
73327      125       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
431        126      126       4
1036       126       60       3
115

Collab_done
content_done
       user_id  book_id  rating
1576       142       33       3
5601       142       58       4
8706       142      820       5
11105      142      121       5
11472      142      115       5
12249      142       94       5
12269      142      211       4
12281      142        8       5
13543      142      943       4
15722      142      380       4
17114      142       13       5
17994      142      344       4
18619      142      372       3
18709      142      104       5
popularity _done
Collab_done
content_done
       user_id  book_id  rating
1576       142       33       3
5601       142       58       4
8706       142      820       5
11105      142      121       5
11472      142      115       5
...        ...      ...     ...
80743      142      564       3
80744      142      746       3
80745      142      466       3
80746      142      549       3
80747      142       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
   

Collab_done
content_done
       user_id  book_id  rating
7707       159       35       3
19206      159      635       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
7707       159       35       3
19206      159      635       3
86806      159       63       3
86807      159      353       3
86808      159       38       3
...        ...      ...     ...
87481      159      564       3
87482      159      746       3
87483      159      466       3
87484      159      549       3
87485      159       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
      user_id  book_id  rating
2538      162      341       1
3889      162       42       1
4363      162      121       3
4683      162       46       4
8471      162       93       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
2538       162      341       1
3889       162       42       1
4363       162      121       3
4683       162       46       4

Collab_done
content_done
       user_id  book_id  rating
156        177      316       4
527        177       23       3
3561       177       57       3
5018       177      741       2
10502      177      401       3
...        ...      ...     ...
94910      177      564       3
94911      177      746       3
94912      177      466       3
94913      177      549       3
94914      177       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
5391       179      468       2
6856       179      249       4
7832       179       77       2
8038       179      270       3
8451       179      741       3
8992       179       36       5
11230      179       40       5
12179      179      122       3
18809      179      372       3
popularity _done
Collab_done
content_done
       user_id  book_id  rating
5391       179      468       2
6856       179      249       4
7832       179       77       2
8038       179      270       3
845

popularity _done
Collab_done
content_done
       user_id  book_id  rating
4344       196       18       5
4925       196      321       4
7755       196        8       4
8482       196       36       5
9052       196      650       3
13213      196       32       3
17078      196        5       4
19485      196      138       3
19735      196       56       5
popularity _done
Collab_done
content_done
        user_id  book_id  rating
4344        196       18       5
4925        196      321       4
7755        196        8       4
8482        196       36       5
9052        196      650       3
...         ...      ...     ...
102316      196      151       3
102317      196      564       3
102318      196      746       3
102319      196      466       3
102320      196      549       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
8492       197      189       5
14423      197        7       4
15833      197       35       4
popula

Collab_done
content_done
       user_id  book_id  rating
528        212      113       5
926        212      360       3
1693       212       24       4
1780       212        7       3
1848       212      180       3
4443       212      646       3
6229       212      287       5
9084       212       58       4
11484      212        2       2
11550      212      293       3
13229      212      155       4
14095      212      772       4
15980      212       71       4
16468      212      714       5
18231      212      941       4
18721      212      103       4
19183      212      301       4
19454      212      824       4
popularity _done
Collab_done
content_done
        user_id  book_id  rating
528         212      113       5
926         212      360       3
1693        212       24       4
1780        212        7       3
1848        212      180       3
...         ...      ...     ...
109736      212      564       3
109737      212      746       3
109738      212      466    

Collab_done
content_done
        user_id  book_id  rating
2318        228        9       3
2337        228       22       3
3528        228      593       5
5227        228      126       5
7212        228       54       5
...         ...      ...     ...
116479      228      564       3
116480      228      746       3
116481      228      466       3
116482      228      549       3
116483      228       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
958        229       18       4
1973       229      777       4
5884       229      306       4
6156       229      331       3
6193       229      691       4
6965       229      233       3
8208       229      310       5
12095      229       21       4
12997      229      101       5
13869      229       26       4
14656      229      378       4
16866      229      662       5
17717      229      710       3
19983      229       78       3
popularity _done
Collab_done
cont

Collab_done
content_done
       user_id  book_id  rating
87         245      461       4
1008       245      264       4
1513       245      879       4
2607       245      109       4
3796       245       18       4
4144       245      270       4
4888       245      786       4
6024       245       45       4
7307       245       42       5
11284      245      130       4
13120      245       33       4
14822      245       85       5
17835      245      172       4
18434      245       27       4
popularity _done
Collab_done
content_done
        user_id  book_id  rating
87          245      461       4
1008        245      264       4
1513        245      879       4
2607        245      109       4
3796        245       18       4
...         ...      ...     ...
123886      245      564       3
123887      245      746       3
123888      245      466       3
123889      245      549       3
123890      245       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
cont

Collab_done
content_done
        user_id  book_id  rating
2263        257       14       4
8528        257      177       2
17416       257      297       3
129261      257       63       3
129262      257      353       3
...         ...      ...     ...
129935      257      564       2
129936      257      746       3
129937      257      466       3
129938      257      549       3
129939      257       56       3

[682 rows x 3 columns]
popularity _done
Collab_done
content_done
       user_id  book_id  rating
416        258      101       3
1669       258      378       3
2041       258      186       5
6290       258      121       4
11439      258       35       4
12704      258      587       3
13825      258      306       5
16647      258      476       4
17345      258        4       4
19491      258      136       4
19741      258       94       5
19834      258       33       4
popularity _done
Collab_done


In [7]:
pred = pd.read_csv('prediction_list.csv')
pred.head(5)

Unnamed: 0,user_id,book_id,rating,predicted
0,1,867,3,3.1723
1,3,118,3,3.5948
2,3,782,2,2.455
3,3,119,1,2.6252
4,3,738,1,2.2389


In [15]:
p = np.array(pred[['predicted']])
r = np.array(pred[['rating']])
err = sqrt(mean_squared_error(r, p))
print("RMSE :" +str(err))
MAE = mean_absolute_error(r, p)
print("MAE :" +str(MAE))

RMSE :0.6410413197283689
MAE :0.5371685649202733


In [12]:
h['hybrid_rating_round']= h['hybrid_rating']
h['hybrid_rating_round'] = h['hybrid_rating_round'].astype(float)

h['hybrid_rating_round']= h['hybrid_rating_round'].apply(np.floor)

#h.head(5)
rat = pd.read_csv('x_train.csv')
rat = rat[(rat['user_id'] == 1)][['user_id','book_id', 'rating']]

e = rat.merge(h,on = 'book_id')

#ratings.head(4)
e.head()

Unnamed: 0,user_id_x,book_id,rating,hybrid_rating,user_id_y,hybrid_rating_round
0,1,47,3,3.808,15,3.0


In [62]:
rat = pd.read_csv('x_test.csv')
rat = rat[(rat['user_id'] == 3)][['user_id','book_id', 'rating']]
rat.head(4)

Unnamed: 0,user_id,book_id,rating
3345,3,184,1
3690,3,765,3
4735,3,118,3
6496,3,782,2


In [14]:
# y_test = pd.read_csv('x_test.csv')
# cb = collaborative(y_test,3624)

In [15]:
# new = y_test
# new['hybrid']='1'
# new.head(4)