In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

%matplotlib inline

In [2]:
# Load only the id and title columns of the movie metadata file.
df_movies = pd.read_csv(
    'movies_metadata.csv',
    usecols=['id', 'title'],
)

In [3]:
# A few rows carry a malformed id (a date string instead of a number).
# Keep only the rows whose id consists purely of digits. The id is cast to
# str before the check so that
#   * a missing id becomes the non-numeric text 'nan' and is dropped here
#     instead of slipping through and breaking the integer cast, and
#   * ids that are already integers (e.g. when this cell is run a second
#     time) are still recognised, which keeps the cell re-runnable.
df_movies = (
    df_movies
    .loc[lambda frame: frame.id.astype(str).str.isnumeric()]
    .astype({'id': np.int32})            # every remaining id is a valid integer
    .dropna()                            # drop rows with null values
    .drop_duplicates(subset=['id'])      # keep the first row for each id ...
    .drop_duplicates(subset=['title'])   # ... and the first row for each title
)

# The movie dataset is clean now

In [4]:
# Load the ratings, keeping only the three columns used below and fixing
# their dtypes to 32-bit types while parsing.
df_ratings = pd.read_csv(
    'ratings_small.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float32,
    },
)

In [5]:
# Reshape the ratings into a user x movie table: one row per userId, one
# column per movieId, the rating as the cell value (NaN where there is none).
df_interaction = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

In [9]:
# Minimum number of non-missing ratings a user (row) / a movie (column)
# must have in order to be kept.
user_thres, movie_thres = 25, 25

# Rows are filtered first; the column filter then runs on what is left.
df_interaction = (
    df_interaction
    .dropna(axis='index', thresh=user_thres)
    .dropna(axis='columns', thresh=movie_thres)
)
df_interaction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529 entries, 2 to 671
Columns: 994 entries, 1 to 134130
dtypes: float32(994)
memory usage: 2.0 MB


In [10]:
# Replace every remaining missing rating with 0.
df_interaction = df_interaction.fillna(0)

In [11]:
# Mapping between movieId and tmdbId. Rows with a missing value in either
# column are dropped before both columns are stored as int32.
df_links = (
    pd.read_csv("links_small.csv", usecols=['movieId', 'tmdbId'])
    .dropna()
    .astype({'movieId': np.int32, 'tmdbId': np.int32})
)

In [12]:
# Remember the current row/column labels and the table's dimensions.
orig_ind, orig_cols = df_interaction.index, df_interaction.columns
df_shape = df_interaction.shape

# Encoders: original label -> consecutive position 0..n-1.
enc_user = {label: pos for label, pos in zip(orig_ind, np.arange(df_shape[0]))}
enc_mov = {label: pos for label, pos in zip(orig_cols, np.arange(df_shape[1]))}

# Decoders: consecutive position -> original label.
dec_user = {pos: label for pos, label in zip(np.arange(df_shape[0]), orig_ind)}
dec_mov = {pos: label for pos, label in zip(np.arange(df_shape[1]), orig_cols)}

In [13]:
# Swap the original row/column labels for their encoded positions.
df_interaction = df_interaction.rename(index=enc_user, columns=enc_mov)

In [14]:
def movieId_to_title(movieId):
    """Return the title of the movie identified by ``movieId``.

    The movieId is first translated to its tmdbId via ``df_links``; the
    tmdbId is then looked up among the ids of ``df_movies`` and the title of
    the first matching row is returned. Failed lookups surface as the
    underlying pandas exceptions.
    """
    matching_tmdb_ids = df_links.loc[df_links.movieId == movieId, 'tmdbId']
    movies_by_id = df_movies.set_index('id')
    matching_movies = movies_by_id.loc[matching_tmdb_ids]
    return matching_movies['title'].iloc[0]

def title_to_movieId(movie_title):
    """Return the movieId of the movie whose title equals ``movie_title``.

    This is the reverse of ``movieId_to_title``: the title is matched
    against ``df_movies`` to obtain its id (a tmdbId), which is then
    translated to a movieId through ``df_links``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up.

    Returns
    -------
    int
        The first movieId linked to the title.

    Raises
    ------
    KeyError
        If the title is not in ``df_movies`` or has no entry in ``df_links``.
    """
    tmdb_ids = df_movies.loc[df_movies.title == movie_title, 'id']
    # isin() performs a membership test; comparing the two Series with ==
    # would fail because they are labelled differently.
    movie_ids = df_links.loc[df_links.tmdbId.isin(tmdb_ids), 'movieId']
    if movie_ids.empty:
        raise KeyError(f"no movieId found for title {movie_title!r}")
    return int(movie_ids.iloc[0])