# Memory Based Collaborative Model

In [None]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the ratings table. Later cells join it to the movie table on
# `movieId` and aggregate its `rating` values.
df = pd.read_csv(filepath_or_buffer='ml-latest-small/ratings.csv')

In [None]:
# Load the movie lookup table (joined to the ratings on `movieId` in the
# next cell) and preview its first five rows.
movie_titles = pd.read_csv(filepath_or_buffer='ml-latest-small/movies.csv')
movie_titles.head(5)

In [None]:
# Attach the movie metadata (e.g. `title`) to every rating row.
#
# `df` is reassigned here, so a naive `pd.merge(df, movie_titles, ...)` is not
# idempotent: running the cell a second time would merge the metadata onto a
# frame that already has it, producing `title_x` / `title_y` columns and
# breaking every later `groupby('title')`. Dropping any metadata columns that
# are already present makes the cell safe to re-run.
#
# `validate='many_to_one'` makes pandas raise a MergeError if `movieId` is not
# unique in `movie_titles`, instead of silently duplicating rating rows.
df = pd.merge(
    df.drop(columns=movie_titles.columns.difference(['movieId']), errors='ignore'),
    movie_titles,
    on='movieId',
    validate='many_to_one',
)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the merged
# ratings frame, as a quick sanity check of the value ranges.
df.describe()

In [None]:
# Mean rating per movie title, kept as a one-column DataFrame (`rating`)
# so that further per-title statistics can be added alongside it.
ratings = df.groupby('title')['rating'].mean().to_frame()
ratings.head(5)

In [None]:
# Add how many ratings each title received; the counts are aligned to
# `ratings` by its `title` index.
ratings = ratings.assign(
    number_of_ratings=df.groupby('title')['rating'].count()
)
ratings.head(5)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the per-movie mean rating. The figure is given a title and axis
# labels so it can be read on its own; the trailing `;` suppresses the
# text repr that would otherwise be echoed below the plot.
ratings['rating'].hist(bins=50).set(
    title='Distribution of mean movie ratings',
    xlabel='Mean rating',
    ylabel='Number of movies',
);

In [None]:
# Histogram of how many ratings each movie received. The figure is given a
# title and axis labels so it can be read on its own; the trailing `;`
# suppresses the text repr that would otherwise be echoed below the plot.
ratings['number_of_ratings'].hist(bins=60).set(
    title='Distribution of the number of ratings per movie',
    xlabel='Number of ratings',
    ylabel='Number of movies',
);

In [None]:
import seaborn as sns
# Joint plot of each movie's mean rating against the number of ratings it
# received, to see how the two relate.
sns.jointplot(data=ratings, x='rating', y='number_of_ratings')

In [None]:
# User x title matrix of ratings: one row per `userId`, one column per
# `title`. A user/title pair with no rating is NaN; if a pair occurs more
# than once, pivot_table's default aggregation (mean) is applied.
movie_matrix = pd.pivot_table(
    df,
    values='rating',
    index='userId',
    columns='title',
)

In [None]:
# Per-user rating vectors (indexed by userId, NaN where the user did not rate
# the movie) for the two reference movies used in the rest of the notebook.
AFO_user_rating = movie_matrix['Air Force One (1997)']
# This column previously selected 'Harry Potter and the Prisoner of Azkaban
# (2004)', which contradicted the variable name and the surrounding text;
# both refer to Contact (1997).
contact_user_rating = movie_matrix['Contact (1997)']

In [None]:
# Correlate every movie column with the Air Force One rating vector; the
# result is one correlation value per title (NaN where it cannot be computed).
similar_to_air_force_one = movie_matrix.corrwith(other=AFO_user_rating)

In [None]:
# Preview the first five correlation values.
similar_to_air_force_one.head(5)

In [None]:
# Correlate every movie column with the rating vector held in
# `contact_user_rating`, then preview the first five values.
similar_to_contact = movie_matrix.corrwith(other=contact_user_rating)
similar_to_contact.head(5)

Earlier we saw that our matrix has many missing values, since not every movie was rated by every user. We therefore transform the correlation results into DataFrames and drop the null values, which makes the results easier to read.

In [None]:
# Turn the correlation Series into a one-column DataFrame named
# 'Correlation' and discard titles whose correlation is NaN.
corr_contact = similar_to_contact.to_frame(name='Correlation').dropna()
corr_contact.head(5)


In [None]:
# Turn the correlation Series into a one-column DataFrame named
# 'correlation' and discard titles whose correlation is NaN.
corr_AFO = similar_to_air_force_one.to_frame(name='correlation').dropna()
corr_AFO.head(5)

The two dataframes above show us the movies that are most similar to Contact (1997) and Air Force One (1997), respectively. However, we have a challenge: some of the movies have very few ratings and may end up being recommended simply because one or two people gave them a 5-star rating. We can fix this by setting a threshold for the number of ratings. From the histogram earlier, we saw a sharp decline in the number of ratings from 100 onwards. We shall therefore set this as the threshold; however, this is a number you can play around with until you find a suitable value. In order to do this, we need to join the two dataframes with the number_of_ratings column in the ratings dataframe.

In [None]:
# Attach the per-title rating count to each correlation table (aligned on the
# `title` index).
#
# Both names are reassigned, so a plain `.join(...)` fails on a second run of
# this cell with "columns overlap but no suffix specified". Dropping an
# already-present `number_of_ratings` column first keeps the cell re-runnable.
corr_AFO = (
    corr_AFO
    .drop(columns='number_of_ratings', errors='ignore')
    .join(ratings['number_of_ratings'])
)
corr_contact = (
    corr_contact
    .drop(columns='number_of_ratings', errors='ignore')
    .join(ratings['number_of_ratings'])
)

In [None]:
# Preview the first five rows of the Air Force One correlation table.
corr_AFO.head(5)

In [None]:
# Preview the first five rows of the `corr_contact` correlation table.
corr_contact.head(5)

We shall now obtain the movies that are most similar to Air Force One (1997), limiting them to movies that have at least 100 ratings. We then sort them by the correlation column and view the first 10.

In [None]:
# Ten movies most correlated with Air Force One (1997), restricted to titles
# with at least 100 ratings so that sparsely rated movies cannot dominate.
# (The filter was `> 100` and the cut-off `.head(100)`, which did not match
# the stated "at least 100" / "first 10".)
(
    corr_AFO[corr_AFO['number_of_ratings'] >= 100]
    .sort_values(by='correlation', ascending=False)
    .head(10)
)

In [None]:
# Ten movies most correlated with the `corr_contact` reference movie,
# restricted to titles with at least 100 ratings so that sparsely rated
# movies cannot dominate. (The filter was `> 100` and the cut-off
# `.head(100)`; both are aligned here with the 100-rating threshold and
# top-10 view used for the other reference movie.)
(
    corr_contact[corr_contact['number_of_ratings'] >= 100]
    .sort_values(by='Correlation', ascending=False)
    .head(10)
)