# COLLABORATIVE FILTERING - Finding Similar Books and Movies

We'll start by loading the Goodreads dataset. Using Pandas, we can very quickly load the rows of the rating and item files that we care about, and link them through the item ID so we can work with book names instead of IDs. (In a real production job, you'd stick with IDs and worry about the names at the display layer to make things more efficient. But this lets us understand what's going on better for now.)

In [None]:
import pandas as pd
import numpy as np

import warnings
# Suppress every Python warning for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation warnings raised by pandas; remove it when debugging.
warnings.simplefilter('ignore')

###  Load the data set of Book Ratings

In [None]:
# Location of the ratings CSV; each row is one rating (userId, itemId, rating).
pathToRatings = 'https://raw.githubusercontent.com/sujitcl/code/main/Data/bookratings.csv'
# Downloads and parses the file. The CSV's first column is an unnamed saved index,
# which pandas loads as the extra column 'Unnamed: 0'.
ratings = pd.read_csv(pathToRatings)
# Preview the first five rows.
ratings.head(n=5)

Unnamed: 0,userId,itemId,rating
0,22,264,2
1,1138,264,5
2,1160,264,3
3,1217,264,3
4,1572,264,3


In [None]:
# Dataset size at a glance: rating rows, distinct users and distinct books.
print("Number of ratings:", len(ratings))
print("Unique users:", len(ratings['userId'].unique()))
print("Unique books:", len(ratings['itemId'].unique()))

Number of ratings: 212395
Unique users: 3000
Unique books: 1891


###  Load the item/book details.

In [None]:
# Location of the book metadata CSV (itemId, title, details).
pathToDetails = 'https://raw.githubusercontent.com/sujitcl/code/main/Data/bookInfo.csv'
# Downloads and parses the file; used later to translate item IDs into titles.
items=pd.read_csv(pathToDetails)

In [None]:
# Show five randomly chosen books (the rows differ on every run).
items.sample(n=5)

Unnamed: 0,itemId,title,details
307,718,The Sound and the Fury,William Faulkner
55,1,The Hunger Games,Suzanne Collins
775,770,Julius Caesar,"William Shakespeare, Roma Gill"
1159,1279,Hoot,Carl Hiaasen
1884,2533,"The Winner's Curse (The Winner's Trilogy, #1)",Marie Rutkoski


# Build the Pivot Table of ratings

In [None]:
# Build the user x item rating matrix: one row per user, one column per book,
# each cell holding that user's rating. Books a user has not rated are NaN.
# (pivot_table averages the values if a user/item pair occurs more than once.)
pivotTable = ratings.pivot_table(index=['userId'],columns=['itemId'],values='rating')
# (number of users, number of books)
pivotTable.shape

(3000, 1891)

In [None]:
# Inspect five random users' rows; most cells are NaN because each user rates few books.
pivotTable.sample(n=5)

itemId,1,2,3,4,5,6,7,8,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,...,2550,2558,2562,2571,2576,2577,2587,2591,2605,2608,2614,2623,2626,2629,2642,2659,2665,2672,2684,2692,2697,2748,2762,2795,2801,2803,2821,2826,2868,2971,2998,3105,3132,3150,3231,3345,3384,3422,3436,7373
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
10121,5.0,1.0,5.0,3.0,,,1.0,5.0,3.0,,5.0,,,4.0,,,1.0,,,,,,,1.0,,,4.0,,3.0,,,,,5.0,,,,,,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
34837,2.0,5.0,2.0,,,,4.0,3.0,,,5.0,,,5.0,,5.0,4.0,,5.0,,5.0,5.0,5.0,,5.0,4.0,,,,,,,,5.0,5.0,,3.0,3.0,,5.0,...,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
48712,4.0,5.0,4.0,,,5.0,4.0,,,,,,,,,5.0,,,5.0,,5.0,5.0,5.0,,5.0,,,,,,,,,,,,,,5.0,,...,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,
7531,5.0,,,,,5.0,,5.0,,5.0,3.0,,,,4.0,,,4.0,,4.0,,,,,,,,5.0,5.0,,,1.0,,3.0,,,,1.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
47948,4.0,4.0,,,3.0,,4.0,,,,3.0,4.0,,,,,3.0,,5.0,,,,,,,,,,,,,,,,3.0,,3.0,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


##  Compute the correlation matrix between every pair of books (pairs rated together by too few users are left as NaN)

In [None]:
# Pairwise correlation between the rating columns of every two books
# (Pearson, the pandas default). min_periods=250 requires at least 250 users
# to have rated both books; otherwise the pair's correlation is NaN.
corrTable = pivotTable.corr(min_periods=250)

In [None]:
# View the first rows of the item-item correlation table.
# The diagonal is 1.0 (each book against itself); NaN marks pairs that did not
# reach the minimum number of common raters.
corrTable.head()

itemId,1,2,3,4,5,6,7,8,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,...,2550,2558,2562,2571,2576,2577,2587,2591,2605,2608,2614,2623,2626,2629,2642,2659,2665,2672,2684,2692,2697,2748,2762,2795,2801,2803,2821,2826,2868,2971,2998,3105,3132,3150,3231,3345,3384,3422,3436,7373
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.0,0.234702,0.388496,0.031843,-0.015047,0.18635,-0.008045,0.066032,0.090484,0.245843,0.384684,0.109162,0.014685,0.061459,0.682536,0.152241,0.122538,0.527494,0.172085,0.218628,0.135668,0.198064,0.219148,0.243497,0.174467,0.093906,0.097844,0.208511,0.170955,0.028036,0.211826,0.273488,,0.163273,0.268511,0.218483,0.143506,,0.209821,0.196025,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.234702,1.0,0.135253,0.119011,0.068083,0.150993,0.278121,0.01233,0.251825,0.165599,0.183266,-0.001523,0.105501,0.249506,0.211764,0.653078,0.194636,0.141231,0.532025,0.030516,0.695142,0.617719,0.542171,0.304024,0.514092,0.030403,0.112215,0.217848,0.247543,0.133582,0.290638,,,0.213961,0.25795,0.187808,0.20835,,0.275625,0.314672,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0.388496,0.135253,1.0,0.008118,-0.090979,0.170944,0.02874,0.116441,0.174361,0.278934,0.34759,-0.052954,-0.019727,0.198119,0.316605,0.060908,0.090555,0.326567,0.145388,0.211613,0.128916,0.148787,0.155429,0.364606,0.149404,0.032779,0.174381,,0.197473,,0.255155,,,0.038957,0.172047,0.171621,,,,0.235742,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.031843,0.119011,0.008118,1.0,0.333149,,0.123229,0.273784,0.296222,0.172932,,0.198485,0.243274,0.331845,-0.058143,0.131703,0.20416,-0.049918,0.083256,0.107921,0.125219,0.112736,0.156964,-0.021664,0.159219,0.261256,0.225786,-0.023124,0.13813,0.327692,0.087934,,,0.202585,0.166316,0.157277,,,,0.355883,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,-0.015047,0.068083,-0.090979,0.333149,1.0,,0.164837,0.397155,0.15106,0.048222,,0.291779,0.333838,0.261198,0.028706,0.120853,0.131523,0.061092,0.177705,0.160011,0.125764,0.095833,0.118941,0.036947,0.158515,0.301325,0.244788,,0.118751,0.215074,0.153285,,,0.046164,0.074694,0.135273,,,,0.209742,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### For a given item, find other items whose ratings are highly correlated. 

In [None]:
def itemsFromIDs(items, IDlist):
    """Return the rows of ``items`` for each ID in ``IDlist``, in list order.

    Parameters
    ----------
    items : pandas.DataFrame
        Item table with an ``itemId`` column.
    IDlist : iterable
        Item IDs to look up. The order is preserved, and an ID listed more
        than once yields its row(s) more than once. IDs that are not present
        in ``items`` contribute no rows.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the same columns as ``items`` and a fresh
        0..n-1 index. The frame is empty when nothing matches.
    """
    # Gather every match first and concatenate once: calling pd.concat inside
    # the loop re-copies the accumulated frame on each iteration, and seeding
    # it with an empty object-dtype frame degrades the column dtypes.
    matches = [items[items.itemId == itemId] for itemId in IDlist]

    if not matches:
        # pd.concat refuses an empty list; return an empty frame that keeps
        # the columns of ``items``.
        return items.iloc[0:0].reset_index(drop=True)

    return pd.concat(matches, axis=0).reset_index(drop=True)

In [None]:
# Sanity check: look up the rows for item IDs 1 and 2.
itemsFromIDs(items, [1,2])

Unnamed: 0,itemId,title,details
0,1,The Hunger Games,Suzanne Collins
1,2,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré"


In [None]:
def relatedRecos(itemName, topN=10):
    """Return the items whose ratings correlate most with ``itemName``.

    Uses the module-level ``items`` table to resolve the title to an item ID
    and the module-level ``corrTable`` for the item-item correlations.

    Parameters
    ----------
    itemName : str
        Exact title to look up in ``items.title``. If several rows share the
        title, the first one is used.
    topN : int, optional
        Maximum number of items to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Rows of ``items`` ordered from highest to lowest correlation. Items
        whose correlation with ``itemName`` is NaN are excluded. When the
        queried item has a defined self-correlation it is part of the result
        (it correlates perfectly with itself).

    Raises
    ------
    IndexError
        If no row of ``items`` has the title ``itemName``.
    KeyError
        If the resolved item ID has no row in ``corrTable``.
    """
    matchingIDs = items.loc[items.title == itemName, "itemId"]
    if matchingIDs.empty:
        # Fail with a message naming the title instead of pandas' generic
        # "single positional indexer is out-of-bounds".
        raise IndexError("No item titled {!r} found in items".format(itemName))
    itemID = matchingIDs.iloc[0]

    # Correlation of the chosen item with every other item, best first.
    ranked = corrTable.loc[itemID].dropna().sort_values(ascending=False)

    # Positional slice on the index: take the first topN item IDs.
    topItemIDs = list(ranked.index[:topN])

    return itemsFromIDs(items, topItemIDs)

In [None]:
# Title to query; it must match an entry of items.title exactly.
itemName = 'Harry Potter and the Deathly Hallows' 

# Books whose ratings correlate most strongly with the chosen title.
top10Recos = relatedRecos(itemName)
top10Recos

Unnamed: 0,itemId,title,details
0,25,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré"
1,27,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré"
2,24,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré"
3,21,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré"
4,18,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck"
5,23,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré"
6,2,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré"
7,10,Pride and Prejudice,Jane Austen
8,17,"Catching Fire (The Hunger Games, #2)",Suzanne Collins
9,26,"The Da Vinci Code (Robert Langdon, #2)",Dan Brown


In [None]:
# Same lookup for a different title (exact match against items.title).
itemName = 'Of Mice and Men'

# Books whose ratings correlate most strongly with the chosen title.
top10Recos = relatedRecos(itemName)
top10Recos

Unnamed: 0,itemId,title,details
0,32,Of Mice and Men,John Steinbeck
1,14,Animal Farm,George Orwell
2,58,The Adventures of Huckleberry Finn,"Mark Twain, John Seelye, Guy Cardwell"
3,15,The Diary of a Young Girl,"Anne Frank, Eleanor Roosevelt, B.M. Mooyaart-D..."
4,29,Romeo and Juliet,"William Shakespeare, Robert Jackson"
5,28,Lord of the Flies,William Golding
6,4,To Kill a Mockingbird,Harper Lee
7,8,The Catcher in the Rye,J.D. Salinger
8,7,The Hobbit,J.R.R. Tolkien
9,5,The Great Gatsby,F. Scott Fitzgerald


In [None]:
# Same lookup for a different title (exact match against items.title).
itemName = 'The Kite Runner'

# Books whose ratings correlate most strongly with the chosen title.
top10Recos = relatedRecos(itemName)
top10Recos

Unnamed: 0,itemId,title,details
0,11,The Kite Runner,Khaled Hosseini
1,67,A Thousand Splendid Suns,Khaled Hosseini
2,31,The Help,Kathryn Stockett
3,33,Memoirs of a Geisha,Arthur Golden
4,57,The Secret Life of Bees,Sue Monk Kidd
5,3,"Twilight (Twilight, #1)",Stephenie Meyer
6,46,Water for Elephants,Sara Gruen
7,1,The Hunger Games,Suzanne Collins
8,15,The Diary of a Young Girl,"Anne Frank, Eleanor Roosevelt, B.M. Mooyaart-D..."
9,26,"The Da Vinci Code (Robert Langdon, #2)",Dan Brown


In [None]:
# Same lookup for a different title (exact match against items.title).
itemName = 'The Hunger Games'

# Books whose ratings correlate most strongly with the chosen title.
top10Recos = relatedRecos(itemName)
top10Recos

Unnamed: 0,itemId,title,details
0,1,The Hunger Games,Suzanne Collins
1,17,"Catching Fire (The Hunger Games, #2)",Suzanne Collins
2,20,"Mockingjay (The Hunger Games, #3)",Suzanne Collins
3,3,"Twilight (Twilight, #1)",Stephenie Meyer
4,12,"Divergent (Divergent, #1)",Veronica Roth
5,73,"The Host (The Host, #1)",Stephenie Meyer
6,64,My Sister's Keeper,Jodi Picoult
7,52,"Eclipse (Twilight, #3)",Stephenie Meyer
8,69,"Insurgent (Divergent, #2)",Veronica Roth
9,53,"Eragon (The Inheritance Cycle, #1)",Christopher Paolini


### Try to find recommendations based on your favorite books

In [None]:
def searchForItems(items, searchStr, regex=True):
    """Return the titles in ``items`` that contain ``searchStr``, ignoring case.

    Parameters
    ----------
    items : pandas.DataFrame
        Item table with a ``title`` column.
    searchStr : str
        Text to look for. By default it is interpreted as a regular
        expression, so characters such as ``+``, ``(`` or ``.`` have their
        regex meaning.
    regex : bool, optional
        Pass ``False`` to treat ``searchStr`` as literal text (useful for
        titles containing regex metacharacters). Defaults to ``True``.

    Returns
    -------
    list
        Matching titles in the order they appear in ``items``.
    """
    # na=False counts rows with a missing title as "no match"; without it the
    # mask contains NaN and boolean indexing raises ValueError.
    mask = items['title'].str.contains(searchStr, case=False, regex=regex, na=False)
    return list(items.loc[mask, 'title'])

In [None]:
# Case-insensitive search for titles containing "Lost"; use a returned title with relatedRecos.
searchForItems(items, "Lost")

['Wild: From Lost to Found on the Pacific Crest Trail',
 'The Lost Symbol (Robert Langdon, #3)',
 'Paradise Lost',
 'The Lost World (Jurassic Park, #2)',
 'The Lost Colony (Artemis Fowl, #5)',
 'The Lost Hero (The Heroes of Olympus, #1)',
 'City of Lost Souls (The Mortal Instruments, #5)']