# Collabrative Filtering
**We build a recommendation system based on ratings dataset. We extract information from ratings dataset and use that model to make recommendations without using all the other datasets**


To build this model we use a library called Surprise and we will be using singular value Decomposition(SVD) to recommned movies. It is a matrix factorization method in which it decomposes a matrix into three other matrices and extracts the features from the factorization of high-level matrix. In this case it would be user-item-rating matrix.

SVD can be written as A = USV

U - latent feature of users
S - Diagonal matrix representing the strength of each feature

Importing Libraries

In [1]:
!pip install pandas --upgrade --quiet
!pip install scikit-surprise
import os
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy



In [2]:
ratings = pd.read_csv("ratings_small.csv")

movies_df = pd.read_csv("movies_metadata.csv",low_memory=False)

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# movie dataframe with votes more than 55
# IDs of movies with count more than 55 and movie ratings more than 55 counts
movies_df = movies_df[movies_df['vote_count']>55][['id','title']]
movie_ids = [int(x) for x in movies_df['id'].values]
ratings = ratings[ratings['movieId'].isin(movie_ids)]
ratings.reset_index(inplace=True, drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1371,2.5,1260759135
1,1,2105,4.0,1260759139
2,1,2294,2.0,1260759108
3,2,17,5.0,835355681
4,2,62,3.0,835355749


In [5]:
ratings.shape

(29965, 4)

In [6]:
# Initializing a surprise reader object and loading the data
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)


In [7]:
# Build trainset object
trainset = data.build_full_trainset()
# Initialize model
svd = SVD()
# cross-validate
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9844011af0>

In [8]:
#We pass three aruguments:userID, itemID and true rating
svd.predict(uid= 5,iid=2959,r_ui=5.0)
#est is estimated rating

Prediction(uid=5, iid=2959, r_ui=5.0, est=4.442924213033041, details={'was_impossible': False})

In [9]:
svd.predict(uid=15,iid=2678,r_ui=1.0)

Prediction(uid=15, iid=2678, r_ui=1.0, est=2.8212401766899706, details={'was_impossible': False})

In [10]:
def get_recommendations(data, movies_df, user_id, top_n, algo):
    
    # creating an empty list to store the recommended movie ids
    recommendations = []
    
    # creating an user item interactions matrix 
    user_movie_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    
    # extracting those ids which the user_id has not interacted yet
    non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # looping through each of the product ids which user_id has not interacted yet
    for item_id in non_interacted_movies:
        
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_id).est
        
        # appending ans sorting the predicted ratings 
        movie_name = movies_df[movies_df['id']==str(item_id)]['title'].values[0]
        recommendations.append((movie_name, est))

    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n] # returing top n highest predicted rating products for this user
get_recommendations(data=ratings,movies_df=movies_df, user_id=655, top_n=10, algo=svd)


[('The Thomas Crown Affair', 4.810533959999681),
 ("Monsieur  Hulot's Holiday", 4.780764607854124),
 ('Sleepless in Seattle', 4.778932746484243),
 ('My Darling Clementine', 4.719832961142699),
 ('While You Were Sleeping', 4.718105349617916),
 ('The Thomas Crown Affair', 4.716560387947052),
 ('Bullitt', 4.7068904018555235),
 ('Once Were Warriors', 4.677181718106535),
 ('Solaris', 4.676452856872593),
 ("Don't Worry, I'm Fine", 4.660611994028044)]

In [11]:
get_recommendations(data=ratings,movies_df =movies_df, user_id=24, top_n=10, algo=svd)


[('The Million Dollar Hotel', 4.675048861638252),
 ("Don't Worry, I'm Fine", 4.595908614066184),
 ('While You Were Sleeping', 4.581910881907406),
 ('Birdman of Alcatraz', 4.563605615673539),
 ('Galaxy Quest', 4.549937751665292),
 ('Space Jam', 4.549506207753927),
 ('Sleepless in Seattle', 4.548600983082241),
 ('Lonely Hearts', 4.5131278422515715),
 ('Dead Man', 4.5092765522544696),
 ('Shriek If You Know What I Did Last Friday the Thirteenth',
  4.50822455593712)]

In [12]:
get_recommendations(data=ratings,movies_df =movies_df, user_id=78, top_n=10, algo=svd)


[('Nell', 4.953971378719272),
 ('While You Were Sleeping', 4.87346159602718),
 ('Hannibal Rising', 4.835457419054885),
 ('Dead Man', 4.826109184889111),
 ('Straw Dogs', 4.820969082613195),
 ('Galaxy Quest', 4.782611325449837),
 ('In Time', 4.7691248958087495),
 ("Don't Worry, I'm Fine", 4.767134448085589),
 ('Birdman of Alcatraz', 4.764334744279678),
 ('Terminator Salvation', 4.72315793839273)]