# 1. Item-Based Movie Recommendation System

### 1. Import Required Packages

### Download the MovieLens 100K dataset and extract it to the folder recommendations/data/
    http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation
import warnings
warnings.filterwarnings('ignore')
import os

In [None]:
# List the contents of the extracted dataset folder to confirm the files
# read by the cells below are present.
print(os.listdir("./data/ml-100k"))

### 2. Read and Merge Dataset

In [None]:

# Column names for each raw file. `names=` is passed to every read below, so
# pandas treats the first line of each file as data rather than as a header.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']

# Users: pipe-separated, one row per user.
users = pd.read_csv(
    './data/ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    parse_dates=True,
)

# Ratings: tab-separated, one row per (user, movie) rating.
ratings = pd.read_csv(
    './data/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Movies: pipe-separated. Only the first five columns are read
# (usecols=range(5)); any further columns in the file are skipped.
movies = pd.read_csv(
    './data/ml-100k/u.item',
    sep='|',
    names=m_cols,
    usecols=range(5),
    encoding='latin-1',
)

In [None]:
# Preview the first three rating rows to check the parsed columns.
ratings.head(3)

In [None]:
# Preview the first three movie rows to check the parsed columns.
movies.head(3)

In [None]:
# Preview the first three user rows to check the parsed columns.
users.head(3)

In [None]:
# Combine the three tables into a single frame with one row per rating.
# movies and ratings share only the 'movie_id' column, and the intermediate
# result shares only 'user_id' with users, so the join keys are spelled out.
movie_ratings = movies.merge(ratings, on='movie_id')
df = movie_ratings.merge(users, on='user_id')
df.head(5)

### 3. Data Preprocessing

In [None]:
# Drop the columns that no later cell uses, from the merged frame and from
# the individual ratings/movies frames.
#
# Columns are removed by name rather than by position, and missing names are
# ignored. With positional indices, running this cell a second time removed
# a different set of columns from df (user_id, rating, occupation) and raised
# KeyError on ratings; by-name drops make the cell safe to re-run.
df.drop(['video_release_date', 'imdb_url', 'unix_timestamp'],
        axis=1, errors='ignore', inplace=True)
ratings.drop(['unix_timestamp'], axis=1, errors='ignore', inplace=True)
movies.drop(['video_release_date', 'imdb_url'],
            axis=1, errors='ignore', inplace=True)

# Summarise the remaining columns and dtypes of the merged frame.
df.info()

### 4. Movie Rating  

In [None]:
# Aggregate ratings per movie title: number of ratings and mean rating.
# The result has MultiIndex columns ('rating', 'size') and ('rating', 'mean').
#
# String aliases are used instead of np.size / np.mean: recent pandas
# versions deprecate passing NumPy callables to groupby aggregation, while
# the aliases produce the same 'size' and 'mean' column labels.
movie_stats = df.groupby('title').agg({'rating': ['size', 'mean']})
movie_stats.head(10)

In [None]:
# Restrict the ranking to titles with at least 50 ratings, then show the
# five with the highest mean rating.
rating_counts = movie_stats['rating']['size']
# Boolean mask with one True/False entry per movie title.
min_50 = rating_counts >= 50
popular_movies = movie_stats[min_50]
popular_movies.sort_values([('rating', 'mean')], ascending=False).head()

### 4.1 Movie's Rating Individual Count

In [None]:
# Histogram of all rating values, followed by a one-row table of how many
# times each rating value occurs.
plt.figure(figsize=(20, 8))
ratings.rating.plot.hist(bins=10)
plt.title("Distribution of Users' Ratings")
plt.ylabel('Number of Ratings')
plt.xlabel('Rating (Out of 5)')

# Name the counts Series 'Ratings' before turning it into a frame, so the
# row label is correct regardless of what name value_counts() assigns.
# (Renaming the transposed row from 'rating' is a silent no-op on pandas
# versions where value_counts() names its result 'count'.)
rating_count = ratings.rating.value_counts().rename('Ratings').to_frame().T
rating_count

### 4.2 Movie Rating Count By user age

In [None]:
# Histogram of user ages, followed by a one-row table of how many users
# there are at each age.
plt.figure(figsize=(20, 8))
users.age.plot.hist(bins=300)
plt.title("User Ratings By age")
plt.ylabel('Number of Users')
plt.xlabel('Age')

# Name the counts Series before turning it into a frame, so the row label
# is 'Total Users by Age' regardless of what name value_counts() assigns.
# (Renaming the transposed row from 'age' is a silent no-op on pandas
# versions where value_counts() names its result 'count'.)
user_age_count = users.age.value_counts().rename('Total Users by Age').to_frame().T
user_age_count

### 5. Rating Pivot table

In [None]:
# Reshape the long ratings table into a matrix with one row per movie_id
# and one column per user_id, holding the rating value.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index=['movie_id'],
    columns=['user_id'],
)
# Discard the movie_id row labels so rows are numbered 0..n-1 by position.
ratings_matrix = ratings_matrix.reset_index(drop=True)
# A missing (movie, user) pair means "not rated"; store it as 0.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head()

In [None]:
# Movie-to-movie cosine similarity computed from the rows of the rating
# matrix. pairwise_distances returns cosine *distance*, hence the `1 - ...`.
# Other metric names that can be passed here include
# 'euclidean', 'l1', 'l2' and 'manhattan'.
#
# DataFrame.as_matrix() no longer exists in current pandas; `.values`
# returns the same underlying NumPy array on old and new versions alike.
movie_similarity = 1 - pairwise_distances(ratings_matrix.values, metric="cosine")

# Zero the diagonal so a movie is never reported as similar to itself.
np.fill_diagonal(movie_similarity, 0)

# NOTE: from here on `ratings_matrix` holds the movie x movie similarity
# matrix, not the movie x user ratings.
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head()

### 6. Recommendation Engine

In [None]:
# Recommendation System of movies
# Look up the chosen title and attach, to every movie, its similarity to it.
user_inp = "Speed (1994)"

# Row labels of every movie whose title matches exactly (empty if none).
inp = movies[movies['title']==user_inp].index.tolist()

# Only a missing title is reported as "not in the database". Any other
# failure (wrong column count, similarity matrix not built yet, ...) now
# surfaces as its own error instead of being hidden by a bare `except:`.
if not inp:
    print("Sorry, the movie is not in the database!")
else:
    # Use the first match if the title occurs more than once.
    inp = inp[0]
    # inp is used as a row position in the similarity matrix; this assumes
    # movies keeps its default 0..n-1 index in the same order as the
    # similarity rows -- TODO confirm if movies is ever re-indexed.
    movies['similarity'] = ratings_matrix.iloc[inp]
    # Re-assert the expected column names; raises ValueError if movies does
    # not have exactly these four columns.
    movies.columns = ['movie_id', 'title', 'release_date','similarity']
    # Distribution of similarity scores against the chosen movie.
    movies['similarity'].plot(kind = "hist")

In [None]:
# Show the ten movies most similar to the chosen one.
# The similarity matrix has its diagonal set to 0 when it is built, so the
# chosen movie never sorts to the top; the first row is already the best
# match and must not be skipped (the previous [1:10] slice dropped it).
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False ).head(10))

In [None]:
# We looked only at user ratings to calculate the similarity between movies. This can be extended to use other information
# about the movie or the user, for example by converting categorical variables such as movie genre into a one-hot encoding.