# Collaborative Filtering

This notebook demonstrates collaborative filtering: recommending movies to users based on their ratings, and finding movies that are similar to a given movie.

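Download the MovieLens 100K dataset from [kaggle.com](https://www.kaggle.com/prajitdatta/movielens-100k-dataset) and extract it into a folder named `ml-100k` in the notebook's working directory. The demo section below reads its small sample files (`users.csv`, `movies.csv`, `ratings.csv`) from the folder `collab-demo`.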

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import linregress
from sklearn.metrics.pairwise import cosine_similarity
sns.set(style="darkgrid")

## Demo data

### Importing data

In [None]:
users = pd.read_csv("collab-demo/users.csv")
users

In [None]:
movies = pd.read_csv("collab-demo/movies.csv")
movies

In [None]:
ratings = pd.read_csv("collab-demo/ratings.csv")
ratings = ratings.rename(columns = {'Item':'Movie'})
ratings.sample(n=5)

### Exploring our data

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="Movie", y="Rating", hue="User", palette=sns.color_palette("bright",4), data=ratings);

#### Linear Regression between two Users

In [None]:
# rvalue is the correlation coefficient of the fitted line. The two rating vectors are
# paired by row position (not aligned by movie), so the result is only meaningful when
# both users rated the same movies in the same order.
linregress(ratings.query('User==1')['Rating'],ratings.query('User==3')['Rating']).rvalue

In [None]:
# Expected to fail: linregress needs two samples of equal length, and User 2 did not
# rate the same movies as User 1 (so the two rating vectors presumably differ in length)
linregress(ratings.query('User==1')['Rating'],ratings.query('User==2')['Rating']).rvalue

#### Playing with the movie reviews

In [None]:
ratings_movies = pd.merge(ratings,movies,on='Movie')
ratings_movies.sample(n=5)

In [None]:
# Most rated movies
# ratings_movies.groupby('Title').size().sort_values(ascending=False)
ratings_movies.Title.value_counts()

In [None]:
# Number of ratings and mean rating per movie title.
# The aggregations are given by name ('size', 'mean') instead of NumPy callables:
# passing np.mean to agg() is deprecated in recent pandas and emits a FutureWarning.
# The resulting column labels stay ('Rating', 'size') and ('Rating', 'mean').
movie_stats = ratings_movies.groupby('Title').agg({'Rating': ['size', 'mean']})
movie_stats.sample(n=6)

In [None]:
# Sort movies by Rating
movie_stats.sort_values([('Rating', 'mean')], ascending=False).head()

In [None]:
# Keep only movies with at least 4 reviews, then rank them by mean rating
movies4 = movie_stats[('Rating', 'size')] >= 4
movie_stats.loc[movies4].sort_values(('Rating', 'mean'), ascending=False)

#### Finding similar users

In [None]:
# One row per user, one column per movie title; movies a user did not rate become 0
user_ratings = ratings_movies.pivot_table(values='Rating', index='User', columns='Title')
user_ratings = user_ratings.fillna(0)
user_ratings.head()

In [None]:
cosine_similarity(user_ratings)

#### Recommendations for User 3

In [None]:
# User 1's ratings, best-rated movies first.
# NOTE(review): the section heading mentions User 3 while this query selects User 1 —
# confirm which user is intended.
(
    ratings_movies
    .query('User == 1')
    .sort_values('Rating', ascending=False)
)

## Movielens Reviews

In [None]:
users = pd.read_csv("ml-100k/u.user", sep='|', encoding='latin1', names=['User','Age','Gender','Occupation','ZIP'])
users.info()

In [None]:
users.head()

In [None]:
# Load the movie list; column names are supplied here: the movie metadata columns
# first, then one column per genre label.
movies = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    encoding='latin1',
    names=[
        'Movie', 'Title', 'release date', 'video release date', 'IMDb URL',
        'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

In [None]:
movies.head()

In [None]:
ratings = pd.read_csv("ml-100k/u.data", sep='\t',encoding='latin1', names=['User','Movie','Rating','Timestamp'])
# The columns are named directly via `names`, so no 'Item' -> 'Movie' rename is needed here

In [None]:
ratings.head()

In [None]:
ratings_movies = pd.merge(ratings,movies,on='Movie')
ratings_movies.sample(n=5)

In [None]:
# One row per movie id, one column per user; missing ratings become 0
movie_ratings = ratings_movies.pivot_table(values='Rating', index='Movie', columns='User')
movie_ratings = movie_ratings.fillna(0)
movie_ratings.head()

In [None]:
cosine_similarity(movie_ratings)

In [None]:
def find_sim_movies(index, n=10):
    """
    Finds the movies most similar to the given movie.

    Similarity is the cosine similarity between the rating rows of movie_ratings.

    index: 0-based row position of the movie in movie_ratings
    n: maximum number of results to return (default 10)
    Returns: list of (row position, similarity, title) tuples, most similar first.
             The queried movie itself is not excluded from the results.
    Globals: movies, movie_ratings
    """
    # Compare only the requested row against all rows instead of building the
    # full N x N similarity matrix on every call.
    sim = cosine_similarity(movie_ratings.iloc[[index]], movie_ratings)[0]
    # sorted() is stable, so rows with equal similarity keep their original order.
    ranked = sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)[:n]
    # Resolve titles through the movie id stored in the pivot index rather than by
    # row position, and return the plain title string instead of a one-element Series.
    titles = movies.set_index('Movie')['Title']
    return [(i, score, titles[movie_ratings.index[i]]) for i, score in ranked]

In [None]:
# GoldenEye
find_sim_movies(1)

In [None]:
# Casablanca
find_sim_movies(482)

In [None]:
# Four Weddings and a Funeral
find_sim_movies(69)

In [None]:
find_sim_movies(95)

In [None]:
movies[movies['Title'].str.contains("Terminator")]