# <center> Recommendation System on Movie Lens Dataset

In [1]:
# Necessary imports

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings; warnings.simplefilter('ignore')

from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from math import sqrt
from ast import literal_eval



In [2]:
# Set height, width, maximum rows and columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

**Note:** In this project, due to limited computational resources, I use the full Movielens dataset (4 Million records), extract the metadata and keywords, merge with the smaller Movielens dataset (100k records) to build a recommendation system.

### (i) Reading the full dataset

In [3]:
# Credits dataset [contains cast and crew details] 
credits = pd.read_csv('data_full/credits.csv')

# Keywords dataset [contains important keywords of the movie] 
keywords = pd.read_csv('data_full/keywords.csv')

In [4]:
# Links dataset [contains ids such as moviedId, imdbId, tmdbId]
# links = pd.read_csv('data_full/links.csv')

# Sample version of the full dataset
links_small = pd.read_csv('data_full/links_small.csv')

In [5]:
links_small.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [6]:
# Metadata dataset [contains genre and movie description details]
metadata = pd.read_csv('data_full/movies_metadata.csv')

In [24]:
# Ratings dataset [contains ratings of each user]
# new_ratings = pd.read_csv('data_full/ratings.csv')
ratings_small = pd.read_csv('data_full/ratings_small.csv')

### (ii) Reading the sample dataset

In [31]:
# Columns for sample dataset 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url','unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [32]:
# Users, Ratings and Movies datasets
users_1 = pd.read_csv("data/u.user", sep = '|', names = u_cols)
ratings = pd.read_csv('data/u.data', sep = '\t', names = r_cols)
movies = pd.read_csv('data/u.item', sep = '|', names = m_cols, encoding = 'latin-1')

In [10]:
# Users, Ratings and Movies merged into Movielens dataset
movielens = pd.merge(users_1, ratings)
movielens = pd.merge(movielens, movies)
movielens.head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,timestamp,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,M,technician,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,13,47,M,educator,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,18,35,F,other,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [11]:
# 'zz' is a copy of Movielens data (sample) to build a basic recommendation system
zz = movielens.copy(deep = True)

In [12]:
# Dropping unimportant columns
zz.drop(['sex', 'zip_code', 'timestamp', 'video_release_date', 'imdb_url'], axis = 1, inplace = True)

In [13]:
zz.head(1)

Unnamed: 0,user_id,age,occupation,movie_id,rating,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,technician,61,4,Three Colors: White (1994),01-Jan-1994,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


**Note:** Genre columns is dummied already.

### (iii) Transformations

**Transformation 1:** Format `title`

In [14]:
# Format 'title' i.e. remove 'year' from title
zz['title'] = zz['title'].astype(str).str[:-7]

In [40]:
# Format 'title' i.e. remove 'year' from title
movies['title'] = movies['title'].astype(str).str[:-7]

**Transformation 2:** Categorize `rating`

In [15]:
# Categorize 'rating'
zz['rating_cat'] = zz['rating']

In [16]:
# Function to categorize 'rating'
def transformation_1(df):
    df['rating_cat'].replace([1, 2, 3, 4, 5],
                      ['below_avg', 'below_avg', 'avg', 'above_avg', 'above_avg'], 
                      inplace = True)

In [17]:
# Apply transformation_1
transformation_1(zz)

In [18]:
# Updated column
zz.rating_cat.value_counts()

above_avg    55375
avg          27145
below_avg    17480
Name: rating_cat, dtype: int64

**Transformation 3:** Categorize `occupation`

In [19]:
# Categorize 'occupation'
zz['occupation_cat'] = zz['occupation']

In [20]:
# Function to categorize 'occupation'
def transformation_3(df):
    df['occupation_cat'].replace(['student', 'other', 'educator', 'engineer', 'programmer', 'administrator', 'writer', 'librarian', 'technician', 'executive', 'healthcare', 'artist', 'entertainment', 'scientist', 'marketing', 'retired', 'lawyer', 'none', 'salesman', 'doctor', 'homemaker'],
                         ['category_1', 'category_2', 'category_2', 'category_2', 'category_2', 'category_2', 'category_3', 'category_3', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5'], 
                      inplace = True)

In [21]:
# Apply transformation_3
transformation_3(zz)

In [22]:
# Updated column
zz.occupation_cat.value_counts()

category_2    43560
category_1    21957
category_4    16174
category_3    10809
category_5     7500
Name: occupation_cat, dtype: int64

## 1. Simple Recommendation System (Popularity based - Ratings)

In [34]:
ratings.drop('timestamp', axis = 1, inplace = True)

In [35]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [36]:
# Ratings matrix with movie_id as columns and user_id as rows and ratings as values
ratings_matrix = ratings.pivot_table(index = ['movie_id'], columns = ['user_id'], values = 'rating').reset_index(drop = True)

# Fill nans with 0
ratings_matrix.fillna(0, inplace = True)
ratings_matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,...,694,695,696,697,698,699,700,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,876,877,878,879,880,881,882,883,884,885,886,887,888,889,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,1.0,5.0,4.0,5.0,0.0,3.0,5.0,0.0,5.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,5.0,5.0,4.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,2.0,0.0,0.0,2.0,3.0,4.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,2.0,0.0,4.0,0.0,5.0,0.0,4.0,0.0,4.0,4.0,4.0,2.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,5.0,4.0,5.0,5.0,4.0,0.0,4.0,0.0,3.0,3.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,3.0,4.0,0.0,0.0,4.0,0.0,5.0,4.0,0.0,0.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,3.0,0.0,0.0,4.0,3.0,0.0,0.0,4.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,4.0,0.0,0.0,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,1.0,5.0,3.0,3.0,3.0,2.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,4.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,0.0,4.0,4.0,3.0,4.0,4.0,...,0.0,0.0,0.0,5.0,4.0,3.0,0.0,4.0,0.0,4.0,0.0,5.0,4.0,0.0,5.0,4.0,4.0,0.0,0.0,0.0,3.0,5.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,0.0,4.0,3.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,4.0,2.0,4.0,5.0,4.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,5.0,0.0,1.0,0.0,4.0,4.0,0.0,0.0,5.0,5.0,4.0,5.0,5.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,3.0,3.0,0.0,4.0,4.0,4.0,4.0,2.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,5.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,5.0,4.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,4.0,4.0,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,5.0,3.0,0.0,5.0,4.0,5.0,0.0,3.0,4.0,0.0,5.0,5.0,4.0,4.0,4.0,5.0,0.0,3.0,0.0,5.0,5.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,4.0,3.0,3.0,4.0,0.0,3.0,5.0,3.0,5.0,0.0,0.0,5.0,0.0,3.0,3.0,0.0,4.0,3.0,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,5.0,5.0,0.0,0.0,5.0,0.0,3.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,4.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,4.0,4.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def pop_rec_system_new(user_input, metricc):
    
    if metricc == "cosine":
        movie_similarity = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric = "cosine")
    elif metricc == "euclidean":
        movie_similarity = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric = "euclidean")
    elif metricc == "manhattan":
        movie_similarity = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric = "manhattan")
    elif metricc == "correlation":
        movie_similarity = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric = "correlation")
        
    np.fill_diagonal(movie_similarity, 0) 
    cosine_similarity_matrix = pd.DataFrame(movie_similarity)
        
    if (any(movies.title == user_input)):
            
            inp = movies[movies['title']==user_input].index.tolist() # Index of the user imput (movie)
            inp = inp[0]                                           # Index of the user imput (movie)

            similar_movies = movies[['movie_id', 'title']]         # similar Movies [dataframe with id, title]
            # 'similarity' column contains cosine values of each movie with user input
            similar_movies['similarity'] = cosine_similarity_matrix.iloc[inp] 
            similar_movies.columns = ['movie_id', 'title', 'similarity'] # rename columns

            # Reccommended Movies 
            print("Reccommended movies")
            print("------------------")
            print(similar_movies.sort_values( ["similarity"], ascending = False )[1:10])
     
    # If movie is not in existing dataframe
    else:
        print("Movie doesn't exist in the database")

In [43]:
pop_rec_system_new('GoldenEye', 'cosine')

Reccommended movies
------------------
     movie_id                       title  similarity
160       161                     Top Gun    0.623544
384       385                   True Lies    0.617274
402       403                      Batman    0.616143
61         62                    Stargate    0.604969
575       576                 Cliffhanger    0.601960
225       226                  Die Hard 2    0.597083
230       231              Batman Returns    0.595684
549       550  Die Hard: With a Vengeance    0.590124
95         96  Terminator 2: Judgment Day    0.584100


In [44]:
pop_rec_system_new('GoldenEye', 'euclidean')

Reccommended movies
------------------
      movie_id                          title  similarity
575        576                    Cliffhanger  -30.543621
232        233                    Under Siege  -30.591138
28          29                 Batman Forever  -31.249031
577        578                 Demolition Man  -31.649655
1227      1228  Under Siege 2: Dark Territory  -31.741411
230        231                 Batman Returns  -31.756679
553        554                     Waterworld  -32.555923
61          62                       Stargate  -32.570821
801        802                    Hard Target  -32.882149


In [45]:
pop_rec_system_new('GoldenEye', 'manhattan')

Reccommended movies
------------------
      movie_id                          title  similarity
232        233                    Under Siege      -315.0
575        576                    Cliffhanger      -326.0
577        578                 Demolition Man      -333.0
1227      1228  Under Siege 2: Dark Territory      -341.0
801        802                    Hard Target      -353.0
28          29                 Batman Forever      -353.0
230        231                 Batman Returns      -362.0
778        779                      Drop Zone      -362.0
61          62                       Stargate      -362.0


In [46]:
pop_rec_system_new('GoldenEye', 'correlation')

Reccommended movies
------------------
     movie_id           title  similarity
575       576     Cliffhanger    0.555861
160       161         Top Gun    0.553483
61         62        Stargate    0.548701
384       385       True Lies    0.547434
402       403          Batman    0.546641
230       231  Batman Returns    0.535239
577       578  Demolition Man    0.531821
225       226      Die Hard 2    0.530296
28         29  Batman Forever    0.526570


**Note:** This recommendation system is solely based on popularity. The movies returned with `cosine`, `euclidean` and `manhattan distance` are quite similar to each other. However, they are not so much when the recommendation system uses `pearson correlation`.

**Limitation:** This recommendation system suggests movies IRRSPECTIVE OF USER PREFERENCES. 

## Potential Next Steps:

**Suggestions for Content-Based filtering from other data scietists I met during the meet-up:**

1. Use weighted average on each movie:
    - How about multiplying `rating count` and `average rating`.

2. Since, dataset is with 100k records; work on Top 25%, Middle 50% and Bottom 25% of the records on the weighted scale.

3. Use metadata td-idf matrix (cosine similarity) rather than just the movies.

4. Also you can scrap the description of the movies from IMDB and use that as part of the 'Corpus'.

5. Try 'Movie Cast', 'Director' and 'Production'

6. For collaborative filtering - try 'movie-movie' similarity and 'user-user' similarity (Computationally Expensive)

7. Try to build a Hybrid Recommender

1 - Try (If you are using a linear column, there can be huge variance in the feature) [Try normalize and standardize]
2 - Try 
3 - tf-idf is used unstructured data (use 'word2vec').
4 - Use 'word2vec' and 'tf-idf'
5 - Google for imdb scraped data - someone miught have already done it (tmdb, omdb)
6 - Do not do User-User (Potential next step - In the write-up)
7 - Same as above