# ▶ **MOVIE Recommendation (Collaborative Filtering)**



In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#  **Importing Dataset**

In [5]:
# Absolute locations of the two input CSV files
# (paths live under /content/drive — presumably a mounted Google Drive in Colab; confirm before running elsewhere)
ratings_link = "/content/drive/MyDrive/Colab Notebooks/Recommender_system/ratings.csv"
movies_link = "/content/drive/MyDrive/Colab Notebooks/Recommender_system/movies.csv"

In [6]:
# Read the movies CSV and then the ratings CSV into pandas DataFrames
movies, ratings = map(pd.read_csv, (movies_link, ratings_link))

In [6]:
# Quick dataset summary.
# Note: the movie count is the number of distinct *titles*, not distinct movieIds.
print("Total movies - ", len(movies["title"].unique()))
print("Total users - ", len(ratings["userId"].unique()))

Total movies -  9737
Total users -  610


In [11]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# **Pivoting & finding Correlation**

In [13]:
# Join movies with ratings on movieId (default inner join), then discard
# the 'genres' and 'timestamp' columns, which are not used afterwards.
data = (
    movies
    .merge(ratings, on="movieId")
    .drop(columns=["genres", "timestamp"])
)

In [14]:
# User x movie rating matrix: one row per userId, one column per title.
# Columns with fewer than 10 non-missing ratings are dropped, and the
# remaining missing entries (user did not rate the movie) are filled with 0.
user_ratings = (
    data
    .pivot_table(values="rating", index=["userId"], columns=["title"])
    .dropna(axis=1, thresh=10)
    .fillna(0)
)

In [15]:
# Preview the first rows of the user x movie rating matrix
user_ratings.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Movie-to-movie similarity: pairwise Pearson correlation between the
# columns (titles) of user_ratings, giving a title x title matrix.
# A higher value means the two movies' rating columns vary together more strongly.
# Note: unrated entries were filled with 0 upstream, so they take part in the correlation.
item_similarity_df = user_ratings.corr(method = 'pearson')

# **Helper functions**

In [24]:
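# Helper functions for producing the top 5 recommendations
# get_similar_movies() : correlation scores for one movie, weighted by the user's rating, in descending order
# recommend() : movies ranked in descending order by their summed scores over the liked movies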

def get_similar_movies(movie_name,user_rating):
  """Score every movie by its correlation with `movie_name`, weighted by a rating.

  The correlation column for `movie_name` is taken from the global
  `item_similarity_df` and multiplied by `user_rating - 2.5`. A rating above
  2.5 therefore gives positively correlated movies a positive score, while a
  rating below 2.5 flips the sign of every score.
  (2.5 is presumably the midpoint of the rating scale; confirm.)

  Args:
    movie_name: column label in `item_similarity_df` (a movie title).
    user_rating: numeric rating the user gave to `movie_name`.

  Returns:
    pandas Series of scores indexed by title, sorted in descending order.

  Raises:
    KeyError: if `movie_name` is not a column of `item_similarity_df`.
  """
  rating_weight = user_rating - 2.5
  weighted_scores = item_similarity_df[movie_name] * rating_weight
  return weighted_scores.sort_values(ascending=False)

def recommend(l, top_n=5):
  """Recommend movies similar to a list of liked movies.

  Each title in `l` is treated as rated 5/5. The per-movie similarity scores
  from `get_similar_movies` are summed per title, the liked titles themselves
  are removed from the candidates, and the highest-scoring titles are returned.

  Args:
    l: iterable of movie titles the user likes; each must be a label known to
      `get_similar_movies`.
    top_n: number of recommendations to return (default 5).

  Returns:
    pandas Series of summed scores indexed by title, sorted in descending
    order, with at most `top_n` entries. Empty if `l` is empty.

  Raises:
    KeyError: propagated from `get_similar_movies` for an unknown title.
  """
  liked = list(l)
  if not liked:
    # Nothing to base a recommendation on; pd.concat would raise on an empty list.
    return pd.Series(dtype="float64")

  # One score Series per liked movie, combined in a single concat.
  # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
  per_movie_scores = [get_similar_movies(movie, 5) for movie in liked]
  total_scores = pd.concat(per_movie_scores, axis=1).sum(axis=1)

  # Exclude the liked movies explicitly instead of skipping a fixed number of
  # top rows, which was only right when exactly three movies were passed
  # and they happened to rank first.
  candidates = total_scores.drop(labels=liked, errors="ignore")
  return candidates.sort_values(ascending=False).head(top_n)

# **Generate recommendations**

In [25]:
# Sample input for trying out the recommender:
# titles the user is assumed to like, each treated as rated 5/5
action_lover = [
    "2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",
    "12 Years a Slave (2013)",
    "2012 (2009)",
]

In [26]:
# Titles recommended for the sample user, best match first
recommend(action_lover).index.tolist()

['Crank (2006)',
 'Fast & Furious (Fast and the Furious 4, The) (2009)',
 'A-Team, The (2010)',
 'Mission: Impossible III (2006)',
 'Hancock (2008)']

In [27]:
# Persist the movie-to-movie correlation matrix as 'file1.csv'
# (relative path, so it is written to the current working directory)
item_similarity_df.to_csv('file1.csv')