In [1]:
import re
import numpy as np
import pandas as pd
from numpy import linalg as LA
from sklearn.metrics import mean_squared_error

In [2]:
# Constants

# Column-name lists, one per MovieLens CSV file.
names_rating = ['user_id', 'movieId', 'rating', 'timestamp']
names_tags = ['user_id', 'movieId', 'tag', 'timestamp']
names_movies = ['movieId', 'title', 'genres']
names_links = ['movieId', 'imdb_id', 'tmdb_id']

# Dataset directory and the full path of every CSV file inside it.
DATA_DIR = 'data/ml-latest-small/'
FILE_RATINGS, FILE_TAGS, FILE_MOVIES, FILE_LINKS = (
    DATA_DIR + file_name
    for file_name in ('ratings.csv', 'tags.csv', 'movies.csv', 'links.csv')
)

In [3]:
# Read movies.csv for genre information and drop the movies that have no
# genres listed: the item profile built below consists only of genre flags.
movies_df = pd.read_csv(FILE_MOVIES)
no_genres = '(no genres listed)'
movies_df = movies_df[movies_df.genres != no_genres]
movies_df = movies_df.sort_values('movieId', ascending=True)

# Movie Count
movies_size = movies_df['movieId'].unique().shape[0]
print("Number of distinct movies : %d" % movies_size)

# Different types of Genres
categories = ['Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy',
                'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','IMAX']

genre_size = len(categories)

# genre name -> column index, following the order of `categories`
genre_idx = {cat: ind for ind, cat in enumerate(categories)}

print("Number of distinct Genres %d" % genre_size)

# Extracting different genres into a list from a string and mapping them to an index value
def getGenres(line, genre_idx):
    """Convert a '|'-separated genre string into a list of genre indices.

    Parameters
    ----------
    line : str
        Genre names joined by '|'.
    genre_idx : dict
        Mapping from genre name to index.

    Returns
    -------
    list or None
        The index of every genre in ``line``, in order of appearance, or
        ``None`` when the first entry contains 'no genres listed'.

    Raises
    ------
    KeyError
        If a genre name is not a key of ``genre_idx``.
    """
    names = line.split('|')

    # checking for no genres
    if 'no genres listed' in names[0]:
        return None

    return [genre_idx[name] for name in names]

# Containers for the movie list and the movie -> index mapping.
# They start out empty and are filled in by getitemprofile():
#   movie_ID_list : movie IDs
#   movies_idx    : movie ID -> index
#   movie_map     : movie ID -> title
movie_ID_list, movies_idx, movie_map = [], {}, {}

# Returns item profile
def getitemprofile(df, movie_ID_list, movie_idx, movie_map, movies_size, genre_size, genre_idx):
    """Build the binary movie-by-genre item profile matrix.

    Every accepted movie receives the next free row of the matrix; the
    columns of that row are set to 1 for each genre of the movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'movieId', 'title' and 'genres'.
    movie_ID_list : list
        Filled in place with the movie IDs, in row order.
    movie_idx : dict
        Filled in place with movie ID -> row index.
    movie_map : dict
        Filled in place with movie ID -> title.
    movies_size : int
        Number of rows to allocate.
    genre_size : int
        Number of columns to allocate.
    genre_idx : dict
        Mapping from genre name to column index.

    Returns
    -------
    numpy.ndarray
        Array of shape (movies_size, genre_size). Rows that were allocated
        but never assigned to a movie stay all-zero.
    """
    item_matrix = np.zeros((movies_size, genre_size))
    index = 0

    for _, row in df.iterrows():
        movieid = row['movieId']
        title = row['title']

        genres = getGenres(row['genres'], genre_idx)
        if genres is None:
            continue

        # Check for duplicates BEFORE writing to the matrix: a duplicate does
        # not advance `index`, so writing first would leak its genres into the
        # row of the next movie. The lookup uses the `movie_idx` argument so
        # the function works with whatever mapping the caller passes in.
        if movieid in movie_idx:
            print("Duplicate Movie: ", title)
            continue

        for genre in genres:
            item_matrix[index, genre] = 1

        # Filling the movie_ID_list, movie_idx, movie_map
        movie_ID_list.append(movieid)
        movie_idx[movieid] = index
        movie_map[movieid] = title
        index += 1
    return item_matrix
    

item_profile = getitemprofile(
    movies_df, movie_ID_list, movies_idx, movie_map,
    movies_size, genre_size, genre_idx,
)
print("Item profile generated")
print("Number of movies: %d" % len(movies_idx))

# Debugging check: report the size of every container filled by getitemprofile
for label, container in (
    ("movie_map size: ", movie_map),
    ("movies_idx size: ", movies_idx),
    ("movie_ID_list size: ", movie_ID_list),
):
    print(label, len(container))

Number of distinct movies : 9107
Number of distinct Genres 19
Item profile generated
Number of movies: 9107
movie_map size:  9107
movies_idx size:  9107
movie_ID_list size:  9107


In [4]:
# Reading Ratings file
ratings_df = pd.read_csv(FILE_RATINGS)
user_ids = np.sort(ratings_df['userId'].unique())
n_users = user_ids.shape[0]
n_items = len(movies_idx) #ratings_df['movieId'].unique().shape[0]
print("Number of unique users: %d" % n_users)
print("Number of unique movies: %d" % n_items)

# Reuse the movieId -> index mapping and the movie list built with the item profile
movie_dict = movies_idx
movie_list = movie_ID_list

# Map every user ID to a row index explicitly instead of assuming that the IDs
# are exactly 1..n_users; for such contiguous IDs the mapping is ID -> ID - 1.
user_idx = {uid: row_no for row_no, uid in enumerate(user_ids.tolist())}

# Create user-item ratings matrix from the csv file data
ratings = np.zeros((n_users, n_items))

# df.itertuples() yields one namedtuple per row, with the column names as fields
for row in ratings_df.itertuples():
    if row.movieId not in movie_dict:
        print("Error not present in database: ",row)
        continue
    ratings[user_idx[row.userId], movie_dict[row.movieId]] = row.rating
    

# Split data into training and test sets by removing 10 ratings per user from the training set and adding to test set 
# All selected users had rated at least 20 movies. There are a total of 100004 ratings in this version of the dataset
# 10 ratings per user means a test set comprising of 6710 ratings which is around 6.7% of the total data
def train_test_split(ratings, test_size=10, seed=None):
    """Split a user-item rating matrix into disjoint train and test matrices.

    For every user (row), ``test_size`` of the non-zero ratings are picked at
    random, zeroed in the training matrix and copied to the test matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D user-by-item matrix in which 0 means "not rated".
    test_size : int, optional
        Number of ratings held out per user (default 10).
    seed : int or None, optional
        Seed for a private random generator, making the split reproducible.
        With ``None`` (default) NumPy's global generator is used.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train`` and ``test``, both with the shape of ``ratings``.

    Raises
    ------
    ValueError
        If a user has fewer than ``test_size`` non-zero ratings.
    """
    # np.random and RandomState both expose .choice, so one code path serves
    # the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated = ratings[user, :].nonzero()[0]
        if rated.size < test_size:
            raise ValueError(
                "user row %d has only %d ratings, cannot hold out %d"
                % (user, rated.size, test_size)
            )
        test_ratings = rng.choice(rated, size=test_size, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test

# Seed NumPy's global random generator so that the random hold-out, and with it
# every number computed from it, is the same on each run of the notebook.
np.random.seed(42)
train, test = train_test_split(ratings)

def row_normalize(mat):
    """Subtract from every non-zero entry the mean of the non-zero entries of its row.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D matrix in which 0 marks a missing entry.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The centred matrix (entries that were 0 stay 0) and the per-row means
        as a column vector of shape (n_rows, 1).
    """
    rated = mat != 0
    counts = rated.sum(axis=1)
    totals = mat.sum(axis=1)

    # The small constant keeps the division defined for rows without any entry.
    avg_rat = (totals / (counts + 1e-9))[:, np.newaxis]

    # Multiplying by the 0/1 mask puts the missing entries back to zero.
    centred = (mat - avg_rat) * rated.astype(int)
    return centred, avg_rat

# Mean-centre the training ratings per user; `train` is replaced by the centred
# matrix and `user_mean` keeps the per-user averages as a column vector.
train, user_mean = row_normalize(train)

Number of unique users: 671
Number of unique movies: 9107
Error not present in database:  Pandas(Index=8762, userId=56, movieId=128620, rating=5.0, timestamp=1467003913)
Error not present in database:  Pandas(Index=8781, userId=56, movieId=160590, rating=5.0, timestamp=1467095789)
Error not present in database:  Pandas(Index=11812, userId=73, movieId=141866, rating=4.0, timestamp=1469772876)
Error not present in database:  Pandas(Index=27677, userId=200, movieId=136592, rating=1.5, timestamp=1438020227)
Error not present in database:  Pandas(Index=39521, userId=287, movieId=117192, rating=5.0, timestamp=1473445036)
Error not present in database:  Pandas(Index=41926, userId=299, movieId=83829, rating=4.5, timestamp=1344180332)
Error not present in database:  Pandas(Index=45613, userId=324, movieId=149532, rating=3.0, timestamp=1451519751)
Error not present in database:  Pandas(Index=50521, userId=371, movieId=122888, rating=5.0, timestamp=1473624419)
Error not present in database:  Pand

In [5]:
def getuserprof(user_rat, genre_size, item_profile):
    """Compute the genre profile of a single user.

    Parameters
    ----------
    user_rat : numpy.ndarray
        1-D vector of the user's ratings, one per item; 0 means "not rated".
    genre_size : int
        Number of genre columns.
    item_profile : numpy.ndarray
        Item-by-genre matrix.

    Returns
    -------
    numpy.ndarray
        One value per genre: the sum of the rating-weighted item profiles of
        the rated items, divided by the number of non-zero contributions to
        that genre.
    """
    rated_movie_idx = np.nonzero(user_rat)[0]

    # One row per rated item: its genre vector scaled by the user's rating.
    user_mat = np.zeros((rated_movie_idx.size, genre_size))
    user_mat[...] = item_profile[rated_movie_idx, :] * user_rat[rated_movie_idx][:, np.newaxis]

    contributions = np.count_nonzero(user_mat, axis=0)
    totals = np.sum(user_mat, axis=0)

    # The small constant avoids a division by zero for genres nobody contributed to.
    return totals / (contributions + 1e-9)

def getProfiles(train, n_users, genre_size, item_profile):
    """Stack the genre profile of every row of ``train`` into one matrix.

    Parameters
    ----------
    train : numpy.ndarray
        2-D rating matrix, one row per user.
    n_users : int
        Number of rows of the result.
    genre_size : int
        Number of columns of the result.
    item_profile : numpy.ndarray
        Item-by-genre matrix handed on to getuserprof.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_users, genre_size) whose i-th row is the profile
        computed by getuserprof for the i-th row of ``train``.
    """
    user_prof = np.zeros((n_users, genre_size))
    for row_no, user_ratings in enumerate(train):
        user_prof[row_no, :] = getuserprof(user_ratings, genre_size, item_profile)
    return user_prof

user_rat = getProfiles(train, n_users, genre_size, item_profile)

# Cosine similarity between every user profile and every item profile:
# the dot products, divided by the norms of both vectors.
t_item_prof = item_profile.transpose()
content_mat = np.dot(user_rat, t_item_prof)

user_norm = LA.norm(user_rat, axis=1)
user_norm = user_norm[:, np.newaxis]

t_item_prof_norm = LA.norm(t_item_prof, axis=0)

# An all-zero profile has norm 0 and would turn the division into 0/0 = NaN.
# Its dot products are already 0, so dividing by 1 keeps the similarity at 0.
user_norm[user_norm == 0] = 1.0
t_item_prof_norm[t_item_prof_norm == 0] = 1.0

content_mat = content_mat/user_norm
content_mat = content_mat/t_item_prof_norm

# Add the per-user mean (returned by row_normalize) to the similarity and keep
# only the entries that belong to the test set.
test_mask = (test !=0).astype(int)
content_mat = content_mat + user_mean
content_mat = content_mat * test_mask

# Mean squared error over the held-out ratings only. Selecting those entries
# directly replaces the former rescaling of the full-matrix error by a
# hard-coded 9107/10, which was only right for exactly 9107 movies and
# 10 test ratings per user.
held_out = test != 0
mse = mean_squared_error(test[held_out], content_mat[held_out])

print("Mean Squared Error: ",mse)

Content based Recommendation Algorithm:

This algorithm takes into account the likes and dislikes of the user and generates a user profile. To generate a user profile, we take into account the item profiles (vectors describing each item) and the corresponding user ratings.
The user profile is the weighted sum of the item profiles, with the weights being the ratings given by the user. Once the user profile is generated, we calculate the similarity of the user profile with all the items in the dataset, using the cosine similarity between the user profile and each item profile. The advantages of the content-based approach are that data from other users is not required and that the recommender engine can recommend new items which have not been rated yet; its drawback is that the algorithm doesn't recommend items outside the categories of items the user has already rated.

For the current movie recommendation algorithm, we use the genres of the movie as the description vector for the item profile. However, genre is not an optimal criterion for movie recommendation, as users are generally not loyal to a particular genre, actor, or director. We need better descriptors of a movie for better prediction.