problem statement
---
Given the data of 10000 books and their rating, Build a recommender system by using cosine simillarties score

In [1]:
# import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re

In [2]:
# import Data / read from CSV file, {note that there was trouble reading the file because of the exisistance of special char
#                                    This was an encoding issue while reading we used encodeing parms 'windows-1252'}

rawData = pd.read_csv('book.csv',encoding='windows-1252')
rawData

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [3]:
# Deleting the unnamed column which just has the serial nos

rawData = rawData.iloc[:,1:]
rawData

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
...,...,...,...
9995,162121,American Fried: Adventures of a Happy Eater.,7
9996,162121,Cannibal In Manhattan,9
9997,162121,How to Flirt: A Practical Guide,7
9998,162121,Twilight,8


In [4]:
# defining a function to remove & sign from the Book.Title column

def clean_title(x):
    return re.sub(r'\&amp', '',x).strip()

In [5]:
# calling the clean function to remove the & from the dataset

cleanData = rawData.copy(deep=True)
cleanData['Book.Title'] = rawData['Book.Title'].apply(clean_title)

cleanData['Book.Title']

0                                     Classical Mythology
1                                            Clara Callan
2                                    Decision in Normandy
3       Flu: The Story of the Great Influenza Pandemic...
4                                  The Mummies of Urumchi
                              ...                        
9995         American Fried: Adventures of a Happy Eater.
9996                                Cannibal In Manhattan
9997                      How to Flirt: A Practical Guide
9998                                             Twilight
9999                         Kids Say the Darndest Things
Name: Book.Title, Length: 10000, dtype: object

In [6]:
# verifying the application of cleaning on the dataset

cleanData.loc[4900:4920]

Unnamed: 0,User.ID,Book.Title,Book.Rating
4900,2104,The Trials of Rumpole,8
4901,2105,The Penguin Dictionary of Quotations,7
4902,2106,The Penguin Dictionary of Geology (Penguin Ref...,5
4903,2106,Dictionary of Geography (Reference Books),6
4904,2106,"Jason, Madison",8
4905,2106,Repairing PC Drives,4
4906,2106,Other Stories;Merril;1985;McClelland,6
4907,2108,Michael Strogoff: A Courier of the Czar (Scrib...,9
4908,2109,"Ordinary People, Extraordinary Wealth: The 8 S...",4
4909,2110,Christmas Out West (Double D Western),7


In [7]:
# checking the range of rating. noted that range in 1 to 10 and ther are no 0 rating.

cleanData['Book.Rating'].unique() 

array([ 5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

In [8]:
# Checking the count of the each ratings

cleanData['Book.Rating'].value_counts()

8     2283
7     2076
10    1732
9     1493
5     1007
6      920
4      237
3      146
2       63
1       43
Name: Book.Rating, dtype: int64

In [9]:
# Checking the no of unique values in user id column

len(cleanData['User.ID'].unique())

2182

In [10]:
# checking the no of unique values in Book Title column

len(cleanData['Book.Title'].unique())

9659

In [11]:
# Creating a pivot Talbe using userid, booktitle, and rating

userRatingTble = cleanData.pivot_table(values='Book.Rating', index='User.ID', columns='Book.Title')
userRatingTble

Book.Title,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",1001 Brilliant Ways to Checkmate,101 Bright Ideas: Esl Activities for All Ages,101 Dalmatians,...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,,,,,,,,,,,...,,,,,,,,,,
278849,,,,,,,,,,,...,,,,,,,,,,
278851,,,,,,,,,,,...,,,,,,,,7.0,,
278852,,,,,,,,,,,...,,,,,,,,,,


In [12]:
userRatingTble.fillna(0,inplace=True)
userRatingTble

Book.Title,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",1001 Brilliant Ways to Checkmate,101 Bright Ideas: Esl Activities for All Ages,101 Dalmatians,...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# userRatingTble.mean(axis=1)

Generate user similarity matrix
---

In [14]:
# import cosine_similarity from sklearn pairwise

from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# apply cosine_similarity on cent_useruserDf and round the resulting values to 3 decimal places

similarities = np.round(cosine_similarity(userRatingTble),3)
similarities

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
similarities.shape

(2182, 2182)

In [17]:
# creating useruser similarity matrix

userSimilaritydf = pd.DataFrame(similarities, index=userRatingTble.index, columns=userRatingTble.index)
userSimilaritydf

User.ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# userSimilaritydf.mean(axis=1)[:50]

In [19]:
# Define a function to select n users from the similarity matrix against specific users who are similer to the specified user.

def similar_users(user, n):
    # Find the similarity scores for user_1 with all other users and sort them
    user_similarity_series = userSimilaritydf.loc[user]
    ordered_similarities = user_similarity_series.sort_values(ascending=False)
    nearest_neighbors = ordered_similarities[1:n].index
    return list(nearest_neighbors)
#     nearest_neighbors_score = ordered_similarities[1:n]
#     return pd.DataFrame({'UserId': list(nearest_neighbors),"SimilarityScore" : list(nearest_neighbors_score)})

In [25]:
similar_users(276729,10)

[278854, 2590, 2589, 2584, 2583, 2582, 2580, 2579, 2596]

In [21]:
# Define a function to get the average rating given by the selected no of user for any given book

def avg_neighbor_ratings(Book, user_list):
    # Extract the ratings of the neighbors
    neighbor_ratings = userRatingTble.reindex(user_list)
    avg_ratings = np.round(neighbor_ratings[Book].mean(),2)
    return avg_ratings

In [22]:
# Define a function to check if the perticular book can be recomended to a specific user, by nesting the other two function

def recommend_movie(user, Book):
    num_of_neighbors = 10
    neighbors = similar_users(user,num_of_neighbors)
    ## Get the average ratings given by similar users for a particular movie
    avg_ratings = avg_neighbor_ratings(Book, neighbors)
    print("The average rating given by similar users for the movie '{0}' is {1}".format(Book,avg_ratings), '\n\n','Hence','\n')
    if avg_ratings > 3.5 :
        print("The user-user model recommends '{0}' to user {1}".format(Book,user ))
    else:
        print("The user-user model does not recommend '{0}' to user {1}".format(Book,user )) 

In [27]:
# Test the function by passing the parameters

recommend_movie(user=278854,Book='Decision in Normandy')

The average rating given by similar users for the movie 'Decision in Normandy' is 0.0 

 Hence 

The user-user model does not recommend 'Decision in Normandy' to user 278854
