In [1]:
#importing required libraries 
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings; warnings.simplefilter('ignore')

** Idea: ** Use the full dataset to extract the metadata and keywords; and merge wit the sample dataset (100k records) and build a recommendation system.

### (i) Reading the full dataset

In [2]:
# Reading the full dataset
credits = pd.read_csv('data_full/credits.csv')
keywords = pd.read_csv('data_full/keywords.csv')
links = pd.read_csv('data_full/links.csv')
links_small = pd.read_csv('data_full/links_small.csv')

In [3]:
metadata = pd.read_csv('data_full/movies_metadata.csv')

In [None]:
new_ratings = pd.read_csv('data_full/ratings.csv')
new_ratings_small = pd.read_csv('data_full/ratings_small.csv')

### (ii) Reading the sample dataset

In [4]:
# Reading the sample dataset
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url','unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [9]:
users_1 = pd.read_csv("data/u.user",sep='|',names=u_cols)
ratings_1 = pd.read_csv('data/u.data',sep='\t', names=r_cols)
movies_1 = pd.read_csv('data/u.item', sep='|', names=m_cols, encoding='latin-1')
movielens=pd.merge(users_1 , ratings_1)
movielens=pd.merge(movielens,movies_1)
movielens.head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,timestamp,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,M,technician,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,13,47,M,educator,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,18,35,F,other,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Trzy%20kolory...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [10]:
zz = movielens.copy(deep = True)

In [12]:
zz.drop(['sex', 'zip_code', 'timestamp', 'video_release_date', 'imdb_url'], axis = 1, inplace = True)

In [13]:
zz.head(1)

Unnamed: 0,user_id,age,occupation,movie_id,rating,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,technician,61,4,Three Colors: White (1994),01-Jan-1994,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


### (iii) Transformations

In [14]:
# Format 'title' i.e. remove 'year' from title
zz['title'] = zz['title'].astype(str).str[:-7]

In [15]:
# Categorize 'rating'
zz['rating_cat'] = zz['rating']

In [16]:
# Function to categorize 'rating'
def transformation_1(df):
    df['rating_cat'].replace([1, 2, 3, 4, 5],
                      ['below_avg', 'below_avg', 'avg', 'above_avg', 'above_avg'], 
                      inplace = True)

In [17]:
transformation_1(zz)

In [19]:
zz.rating_cat.value_counts()

above_avg    55375
avg          27145
below_avg    17480
Name: rating_cat, dtype: int64

In [20]:
# Categorize 'rating'
zz['occupation_cat'] = zz['occupation']

In [21]:
# Function to categorize 'occupation'
def transformation_3(df):
    df['occupation_cat'].replace(['student', 'other', 'educator', 'engineer', 'programmer', 'administrator', 'writer', 'librarian', 'technician', 'executive', 'healthcare', 'artist', 'entertainment', 'scientist', 'marketing', 'retired', 'lawyer', 'none', 'salesman', 'doctor', 'homemaker'],
                         ['category_1', 'category_2', 'category_2', 'category_2', 'category_2', 'category_2', 'category_3', 'category_3', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5'], 
                      inplace = True)

In [22]:
transformation_3(zz)

In [23]:
zz.occupation_cat.value_counts()

category_2    43560
category_1    21957
category_4    16174
category_3    10809
category_5     7500
Name: occupation_cat, dtype: int64

## Content Based Recommendation System

### (i) Removing 'NaN' values and Converting tmdbId to numeric

In [202]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [203]:
links_small.tmdbId.isnull().sum()

0

In [204]:
# Removing records with null 'tmdbid'
links_small = links_small[links_small['tmdbId'].notnull()]

In [207]:
# Converting float tmdbid to int 
links_small['tmdbId'] = links_small['tmdbId'].astype(int)

In [208]:
links_small.tmdbId.isnull().sum()

0

In [209]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


### (ii) Converting genre dictionary to list

In [28]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [29]:
metadata.genres.isnull().sum()

0

In [37]:
metadata.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [36]:
literal_eval(metadata.genres[0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [57]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [65]:
metadata['genres'] = metadata['genres'].apply(dict_to_list)

In [66]:
metadata.genres.head(2)

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
Name: genres, dtype: object

In [70]:
# Format 'title' i.e. remove 'year' from title
metadata['release_date'] = metadata['release_date'].astype(str).str[:4]

In [72]:
metadata['release_date'].value_counts().head()

2014    1974
2015    1905
2013    1889
2012    1722
2011    1667
Name: release_date, dtype: int64

In [74]:
type(metadata.id[0])

str

In [75]:
metadata.id.isnull().values.any()

False

In [76]:
# Converting string to int
def str_to_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [77]:
metadata['id'] = metadata['id'].apply(str_to_int)

In [81]:
metadata[metadata['id'].isnull()]['id']

19730   NaN
29503   NaN
35587   NaN
Name: id, dtype: float64

In [82]:
list(metadata[metadata['id'].isnull()]['id'].index)

[19730, 29503, 35587]

In [84]:
metadata = metadata.drop(list(metadata[metadata['id'].isnull()]['id'].index))

In [85]:
metadata['id'] = metadata['id'].astype('int')

In [88]:
len(metadata[metadata['id'].isnull()]['id'])

0

In [89]:
metadata.shape

(45463, 24)

In [102]:
zz_metadata = metadata[metadata['id'].isin(zz['movie_id'])]

In [103]:
zz_metadata.shape

(1068, 24)

In [111]:
sm_df = metadata[metadata['id'].isin(links_small['tmdbId'])]
sm_df.shape

(9099, 24)

In [112]:
sm_df['tagline'] = sm_df['tagline'].fillna('')
sm_df['description'] = sm_df['overview'] + sm_df['tagline']
sm_df['description'] = sm_df['description'].fillna('')

In [114]:
sm_df.shape

(9099, 25)

## Description Based Recommendation

We use three columns for our description based recommendation:
- overview
- tagline
- description (overview + tagline)

### (i) metadata['overview'] 

In [123]:
dfs = [zz_metadata['overview'], zz_metadata['tagline'], zz_metadata['description']]

In [125]:
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(zz_metadata['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
zz_metadata_1 = zz_metadata.reset_index()
titles = zz_metadata_1['title']
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])

In [126]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [128]:
get_recommendations('The Dark Knight')

21                   Batman Forever
233                  Batman Returns
71                           Batman
427                             JFK
843                   Batman Begins
248                  Batman & Robin
324                  A Few Good Men
435    Teenage Mutant Ninja Turtles
261             Tomorrow Never Dies
Name: title, dtype: object

** Interpretation:** This model provides robust recommendations using metadata['overview']. However, there are some exceptions (possibly) with some of the recommendations provided by the model (Teenage Mutant Ninja Turtles is recommended for The Dark Knight) 

### (ii) metadata['tagline'] 

In [129]:
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(zz_metadata['tagline'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
zz_metadata_1 = zz_metadata.reset_index()
titles = zz_metadata_1['title']
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])

In [130]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [131]:
get_recommendations('The Dark Knight')

1                         Heat
2                    GoldenEye
3             Cutthroat Island
4                       Casino
5                   Four Rooms
6            Leaving Las Vegas
7    The City of Lost Children
8               Twelve Monkeys
9             Dead Man Walking
Name: title, dtype: object

** Interpretation:** The model built with respect to 'tagline' is not as robust as the previous model. It is apparent that the first model (using metadata['overview']) provides highly similar movies than the model using 'taglines'. 

### (iii) metadata['overview'] + metadata['tagline']

In [136]:
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(zz_metadata['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
zz_metadata_1 = zz_metadata.reset_index()
titles = zz_metadata_1['title']
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])

In [137]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [138]:
get_recommendations('The Dark Knight')

21                   Batman Forever
233                  Batman Returns
71                           Batman
427                             JFK
843                   Batman Begins
248                  Batman & Robin
324                  A Few Good Men
435    Teenage Mutant Ninja Turtles
261             Tomorrow Never Dies
Name: title, dtype: object

** Interpretation:** This model provides similar recommendations to that of the initial model (using metadata['overview']). We can infer that 'tagline' is not the best feature to consider to build a recommendation system.

In [142]:
metadata.shape

(45463, 24)

In [140]:
zz_metadata.shape

(1068, 25)

## Metadata Based Recommendation

In [143]:
credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [144]:
keywords.head(2)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [187]:
# Merging credits and keywords to metadata_full 
metadata_full = metadata.merge(credits, on = 'id')
metadata_full = metadata_full.merge(keywords, on = 'id')

In [151]:
metadata_full.shape

(46628, 27)

In [156]:
metadata_full.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [57]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [185]:
def director(x):
    for i in literal_eval(x):
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [189]:
metadata_full['director'] = metadata_full['crew'].apply(director)

In [190]:
for col in ['cast', 'crew', 'keywords']:
    metadata_full[col] = metadata_full[col].apply(dict_to_list)

In [193]:
metadata_full['keywords'] = metadata_full['keywords'].apply(dict_to_list)

In [194]:
metadata_full.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[jealousy, toy, boy, friendship, friends, riva...",John Lasseter
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[board game, disappearance, based on children'...",Joe Johnston


In [211]:
zz_metadata_full = metadata_full[metadata_full['id'].isin(links_small['tmdbId'])]

In [212]:
zz_metadata_full.shape

(9219, 28)

In [214]:
zz_metadata_full['cast'][0]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [225]:
# Remove spaces between first and last name and convert the string to lower case
test = zz_metadata_full['cast'].astype('str').apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [226]:
test = zz_metadata_full['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [228]:
test = zz_metadata_full['director'].apply(lambda x: [x,x])

#### Questions:

Shall I add more weight to lead actor and director by duplicatign their entry?

<div class="alert alert-warning">
  <strong> Still Pending </strong> 
</div>

<div class="alert alert-success">
  <strong> New - Collaborative Filtering (4) </strong> 
</div>

In [3]:
# Read 'ratings' data
# ratings = pd.read_csv('data_full/ratings.csv')
ratings_small = pd.read_csv('data_full/ratings_small.csv')

**Note**: Below code snippet pertains to 'ratings' dataset (2 Mil). For this project, I consider only the smaller dataset.  

ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = ratings.drop(['timestamp'], axis = 1)
ratings.head()

In [4]:
ratings_small.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_small = ratings_small.drop(['timestamp'], axis = 1)
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [5]:
# Fill NaN values in user_id and movie_id column with 0
ratings_small['user_id'] = ratings_small['user_id'].fillna(0)
ratings_small['movie_id'] = ratings_small['movie_id'].fillna(0)

In [6]:
# Replace NaN values in rating column with average of all values
ratings_small['rating'] = ratings_small['rating'].fillna(ratings_small['rating'].mean())

In [8]:
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
print(ratings_small.shape)

(100004, 3)


**Note:** Python throws 'Memory' Error when I use the full dataset. Hence, I pick 25% of the dataset and perform collborative filtering on it.

In [10]:
# Randomly sample 25% of the ratings dataset
small_data = ratings_small.sample(frac=0.25)

In [11]:
# Check the sample info
print(small_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25001 entries, 571 to 64450
Data columns (total 3 columns):
user_id     25001 non-null int64
movie_id    25001 non-null int64
rating      25001 non-null float64
dtypes: float64(1), int64(2)
memory usage: 781.3 KB
None


In [13]:
train_data, test_data = train_test_split(small_data, test_size=0.2)

In [14]:
# Test and Train data matrix
train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

**Idea behind user and item similarity:**

User similarity can be calculated by measuring 'pairwise distances' between ratings datset.

However, if you have to calculate the 'item similarity', we have to transpose the 'ratings' data and then calculate the pairwise distances.

In [16]:
# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.70454029 0.78791522 0.68269215]
 [0.70454029 1.         0.99211756 0.99953994]
 [0.78791522 0.99211756 1.         0.98786048]
 [0.68269215 0.99953994 0.98786048 1.        ]]


In [26]:
# Item Similarity Matrix (Train_data_matrix.Transpose)
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.          0.0136781   0.00742387]
 [ 0.0136781   1.         -0.0216893 ]
 [ 0.00742387 -0.0216893   1.        ]]


In [37]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
90716,602,783,4.0
3655,21,466,3.0
69220,480,2797,4.0
52058,380,81847,4.0
3430,19,1225,5.0


In [36]:
train_data.T.head()

Unnamed: 0,90716,3655,69220,52058,3430,87698,35374,921,17022,94911,...,90928,4487,33998,43512,13969,66920,96030,15203,91589,66064
user_id,602.0,21.0,480.0,380.0,19.0,584.0,253.0,13.0,111.0,624.0,...,605.0,23.0,243.0,311.0,91.0,468.0,638.0,99.0,607.0,468.0
movie_id,783.0,466.0,2797.0,81847.0,1225.0,4699.0,3745.0,3396.0,457.0,94777.0,...,608.0,4641.0,442.0,647.0,54286.0,74545.0,1378.0,2085.0,2174.0,2053.0
rating,4.0,3.0,4.0,4.0,5.0,4.0,2.0,3.5,4.0,2.5,...,3.0,4.0,3.0,4.0,5.0,3.0,4.0,3.0,3.5,1.5


In [24]:
user_correlation[:1]

array([[1.        , 0.70454029, 0.78791522, ..., 0.70915885, 0.85219278,
        0.82267086]])

In [27]:
item_correlation[:1]

array([[1.        , 0.0136781 , 0.00742387]])

In [52]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
    return pred

In [53]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

In [54]:
# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [55]:
# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 17596.17284084236
Item-based CF RMSE: 20855.26385843227
