In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

%matplotlib inline

In [3]:
# Read in the movie id and title
df_movies = pd.read_csv('data/movies_metadata.csv', usecols=['id', 'title'])

In [4]:
# Some bad ids with date. Remove them.
# Drop rows with bad id and change dtype
df_movies.drop(df_movies[df_movies.id.str.isnumeric() == False].index, inplace=True)
df_movies.id = df_movies.id.astype(np.int32)

# Drop rows with null values
df_movies.dropna(inplace=True)

# Remove duplicates in id and title
df_movies.drop_duplicates(subset=['id'], inplace=True)
df_movies.drop_duplicates(subset=['title'], inplace=True)

# The movie dataset is clean now

In [3]:
# Read in the ratings file
df_ratings = pd.read_csv('data/ratings_small.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': np.int32, 'movieId': np.int32, 'rating': np.float32})

In [6]:
df_interaction = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [7]:
user_thres = 25
movie_thres = 25

df_interaction.dropna(axis=0, thresh=user_thres, inplace=True)
df_interaction.dropna(axis=1, thresh=movie_thres, inplace=True)
df_interaction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 595 entries, 2 to 671
Columns: 1013 entries, 1 to 134130
dtypes: float32(1013)
memory usage: 2.3 MB


In [8]:
df_interaction.fillna(0, inplace=True)

In [9]:
# Link between movieId and tmdbId
df_links = pd.read_csv("data/links_small.csv", usecols=['movieId', 'tmdbId'])
df_links.dropna(inplace=True)
df_links.movieId = df_links.movieId.astype(np.int32)
df_links.tmdbId = df_links.tmdbId.astype(np.int32)

In [10]:
orig_ind = df_interaction.index
orig_cols = df_interaction.columns
df_shape = df_interaction.shape

enc_user = dict(zip(orig_ind, np.arange(df_shape[0])))
enc_mov = dict(zip(orig_cols, np.arange(df_shape[1])))

dec_user = dict(zip(np.arange(df_shape[0]), orig_ind))
dec_mov = dict(zip(np.arange(df_shape[1]), orig_cols))

In [11]:
df_interaction.rename(index=enc_user, columns=enc_mov, inplace=True)

In [12]:
def movieId_to_title(movieId):
    tmdbId = df_links[df_links.movieId == movieId].tmdbId
    movie_title = df_movies.set_index('id').loc[tmdbId]
    return movie_title.iloc[0]['title']

def title_to_movieId(movie_title):
    tmdbId = df_movies[df_movies.title == movie_title].id
    movieId = df_links[df_links.tmdbId == tmdbId].movieId
    return movie_idx.index

In [13]:
def get_nearest_ids(idx, interaction_matrix, n_neighbors=5):
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
    model_knn.fit(interaction_matrix)
    distances, indices = model_knn.kneighbors(interaction_matrix[idx], n_neighbors=n_neighbors+1)
    return distances[0,1:], indices[0,1:]

In [4]:
def get_user_top_movie_ratings(user_id, weight=1, num_top_movies=5):
    df_user_top_movies = df_ratings[df_ratings.userId == user_id].sort_values('rating', ascending=False)[:num_top_movies]
    df_user_top_movies.drop(['userId'], axis=1, inplace=True)
    df_user_top_movies.rating *= weight
    return df_user_top_movies

In [15]:
def recommend_movie_user_user(user_id, df_interaction, num_movies=10):
    num_user_neighbors = 5
    interaction_matrix = csr_matrix(df_interaction)
    dist_list, user_id_list = get_nearest_ids(user_id, interaction_matrix, num_user_neighbors)
    df_rec_movies = pd.DataFrame(columns=['movieId', 'rating'])
    for dist, user_id in zip(dist_list, user_id_list):
        df_rec_movies = df_rec_movies.append(get_user_top_movie_ratings(dec_user[user_id], weight=1.0 - dist), ignore_index=True)
    top_recommended_movie_idx = df_rec_movies.sort_values('rating', ascending=False).movieId.unique()[:num_movies]
    return top_recommended_movie_idx

In [16]:
reco_movieId = recommend_movie_user_user(enc_user[2], df_interaction)

In [17]:
i = 0
for movieId in reco_movieId:
    i += 1
    print(f"{i}: {movieId_to_title(movieId)}")

1: Schindler's List
2: What's Eating Gilbert Grape
3: The Silence of the Lambs
4: Terminator 2: Judgment Day
5: Forrest Gump
6: The Shawshank Redemption
7: Dances with Wolves
8: Braveheart
9: Ghost
10: In the Line of Fire


In [18]:
def recommend_movie_item_item(user_id, df_interaction, num_movies=10):
    num_movie_neighbors = 5
    interaction_matrix_sparse = csr_matrix(df_interaction).transpose()
    user_top_movies = get_user_top_movie_ratings(dec_user[user_id], num_top_movies=5)
    df_rec_movies = pd.DataFrame(columns=['movieId', 'rating'])
    for i in range(5):
        movieId, rating = user_top_movies.iloc[i]['movieId'], user_top_movies.iloc[i]['rating']
        dist_list, movieId_list = get_nearest_ids(enc_mov[movieId], interaction_matrix_sparse, num_movie_neighbors)
        df_rec_movies = df_rec_movies.append(pd.DataFrame.from_dict({'movieId': movieId_list, 'rating': dist_list*rating}), ignore_index=True)
    top_recommended_movie_idx = df_rec_movies.sort_values('rating', ascending=False).movieId.unique()[:num_movies]
    return top_recommended_movie_idx

In [19]:
reco_movieId = recommend_movie_user_user(enc_user[2], df_interaction)
i = 0
for movieId in reco_movieId:
    i += 1
    print(f"{i}: {movieId_to_title(dec_mov[movieId])}")

1: Rounders
2: The Shining
3: Run Lola Run
4: Notting Hill
5: Ben-Hur
6: A Grand Day Out
7: Austin Powers: The Spy Who Shagged Me
8: Forrest Gump
9: Superman II
10: Labyrinth


In [5]:
utp = get_user_top_movie_ratings(2)

In [6]:
utp

Unnamed: 0,movieId,rating
83,551,5.0
22,39,5.0
45,266,5.0
91,592,5.0
90,590,5.0


In [8]:
utp.movieId.map(lambda mov: mov*2)

83    1102
22      78
45     532
91    1184
90    1180
Name: movieId, dtype: int64