In [3]:
!pip install -U --quiet mlfoundry
!pip install --quiet implicit

## Let's load the datasets

In [4]:
# load our datasets as dataframes
import pandas as pd

# All three CSV files live in the same GitHub folder.
DATA_BASE_URL = 'https://raw.githubusercontent.com/srihari-tf/recommender-system-tfy/master'

# low_memory=False makes pandas parse each file in a single pass, so every
# column gets one inferred dtype instead of chunk-dependent mixed types.
movie_meta_df = pd.read_csv(DATA_BASE_URL + '/movies_metadata.csv', low_memory=False)
keywords_df = pd.read_csv(DATA_BASE_URL + '/keywords.csv', low_memory=False)
ratings_df = pd.read_csv(DATA_BASE_URL + '/ratings_small.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Download local copies of the movie metadata and ratings CSVs into the
# current working directory (`-O` keeps the remote file names). The same file
# names are logged as run artifacts at the end of the notebook.
!curl -O https://raw.githubusercontent.com/srihari-tf/recommender-system-tfy/master/movies_metadata.csv
!curl -O https://raw.githubusercontent.com/srihari-tf/recommender-system-tfy/master/ratings_small.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 32.8M  100 32.8M    0     0   116M      0 --:--:-- --:--:-- --:--:--  116M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2381k  100 2381k    0     0  16.4M      0 --:--:-- --:--:-- --:--:-- 16.4M


## Let's take a look at the movie metadata

In [6]:
# Keep only the columns needed for the description-based lookups and replace
# missing overviews with empty strings.
movie_desc_df = (
  movie_meta_df[['overview', 'original_title', 'id']]
  .copy()
  .assign(overview=lambda frame: frame['overview'].fillna(''))
)

In [7]:
# helper functions
def get_movie_row_by_index(n):
  """Return the row of `movie_desc_df` at integer position `n`."""
  row = movie_desc_df.iloc[n]
  return row

def get_movie_row_by_movie_id(id):
  """Return the first row of `movie_desc_df` whose `id` column equals `id`.

  Args:
    id: Movie id as an int or a str; it is compared as a string.

  Returns:
    pandas.Series: the first matching row.

  Raises:
    KeyError: if no row has the requested id.
  """
  wanted_id = str(id)
  # Compare strings on both sides so ids stored as ints and ids stored as
  # strings are matched the same way.
  matches = movie_desc_df[movie_desc_df['id'].astype(str) == wanted_id]
  if matches.empty:
    raise KeyError('no movie with id %r' % wanted_id)
  # `.iloc[0]` picks the first row; plain `[0]` would look up a column
  # labelled 0, which this frame does not have.
  return matches.iloc[0]

In [8]:
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Keep the encoder under its own name: a later cell rebinds `model` to the
# collaborative-filtering model, which would otherwise break `embed`.
sentence_encoder = hub.load(module_url)
model = sentence_encoder  # original binding, kept so existing references still work
print ("module %s loaded" % module_url)

def embed(input):
  """Encode `input` with the sentence encoder loaded above.

  Args:
    input: Passed to the encoder unchanged; this notebook passes the
      `overview` column of `movie_desc_df`.

  Returns:
    Whatever the encoder returns; this notebook calls `.numpy()` on it.
  """
  return sentence_encoder(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [9]:
import numpy as np
from scipy.spatial import distance

def get_most_similar_movie_index(embeddings, n, n_movies = 1):
  """Return the indices of the rows of `embeddings` closest to row `n`.

  Closeness is measured with cosine distance. Row `n` itself is never part
  of the result.

  Args:
    embeddings: 2-D array-like with one embedding vector per row.
    n: Row position of the query; negative positions count from the end.
    n_movies: Maximum number of neighbours to return.

  Returns:
    numpy.ndarray with up to `n_movies` row indices, nearest first.
  """
  query_index = n if n >= 0 else n + len(embeddings)
  distances = distance.cdist([embeddings[n]], embeddings, 'cosine')[0]
  # A stable sort keeps equal-distance rows in index order, so the result is
  # reproducible when several rows share an identical embedding.
  nearest_first = np.argsort(distances, kind='stable')
  # Drop the query by index rather than by position: with duplicate
  # embeddings the query is not guaranteed to be sorted first.
  neighbours = nearest_first[nearest_first != query_index]
  return neighbours[:max(n_movies, 0)]

In [10]:
# Embed every movie overview in a single call and convert the result with
# `.numpy()`; row i of `embeddings` corresponds to row i of `movie_desc_df`.
embeddings = embed((movie_desc_df['overview'])).numpy()

In [11]:
# Ten nearest neighbours (by overview embedding) of the movie at row 0.
get_most_similar_movie_index(embeddings, 0, 10)

array([ 2997, 15348, 26211, 18263, 17290,  5214, 42721, 10491, 19976,
       17189])

## Train a collaborative filtering model using `implicit`

In [12]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [13]:
# Convert both id columns to pandas categoricals; their integer category
# codes are used further down as row/column positions of the ratings matrix.
ratings_df['userId'] = ratings_df['userId'].astype("category")
ratings_df['movieId'] = ratings_df['movieId'].astype("category")

In [14]:
#helper functions
def get_movie_id_from_cat_code(cat_code):
  """Map a movie category code back to the original `movieId` value."""
  movie_ids = ratings_df['movieId'].cat.categories
  return movie_ids[cat_code]

def get_user_id_from_cat_code(cat_code):
  """Map a user category code back to the original `userId` value."""
  user_ids = ratings_df['userId'].cat.categories
  return user_ids[cat_code]

def get_cat_code_from_user_id(user_id):
  """Return the category code assigned to the original `userId` value."""
  user_ids = ratings_df['userId'].cat.categories
  return user_ids.get_loc(user_id)

def get_cat_code_from_movie_id(movie_id):
  """Return the category code assigned to the original `movieId` value."""
  movie_ids = ratings_df['movieId'].cat.categories
  return movie_ids.get_loc(movie_id)

In [30]:
# Spot-check the helpers in both directions: movieId -> category code, and
# category code -> movieId.
print(get_cat_code_from_movie_id(949))
print(get_movie_id_from_cat_code(6892))

769
58559


In [16]:
# Preview the ratings table again after the id columns were converted to
# categoricals.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [18]:
# Pieces of the sparse ratings matrix: the rating values, and for each rating
# the user category code (row) and movie category code (column).
ratings = ratings_df['rating']
rows = ratings_df['userId'].cat.codes
cols = ratings_df['movieId'].cat.codes

In [19]:
# Show the first few values, row codes and column codes side by side.
print(ratings.head())
print(rows.head())
print(cols.head())

0    2.5
1    3.0
2    3.0
3    2.0
4    4.0
Name: rating, dtype: float64
0    0
1    0
2    0
3    0
4    0
dtype: int16
0     30
1    833
2    859
3    906
4    931
dtype: int16


In [20]:
from scipy.sparse import coo_matrix

# Sparse matrix in COO format: entry (user code, movie code) holds the rating.
r = coo_matrix((ratings, (rows, cols)))

In [21]:
# Spot-check one entry: user code 0 / movie code 906 should hold the rating
# 2.0 (entry 3 of the values/rows/cols printed above). Index the CSR form so
# the full dense matrix is never materialised.
r.tocsr()[0, 906]

2.0

In [22]:
import implicit

# Fixed seed so the factor initialisation, and with it the recommendations,
# can be reproduced on a re-run.
model = implicit.als.AlternatingLeastSquares(factors=25, random_state=42)
# Fit on the CSR form of the user x movie ratings matrix.
model.fit(r.tocsr())

  f"CUDA extension is built, but disabling GPU support because of '{e}'",
  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/15 [00:00<?, ?it/s]

In [23]:
def get_recommendation_for_user(user_id, n_movies=10):
  """Return recommended movie ids for `user_id`.

  Args:
    user_id: Original `userId` value from `ratings_df`.
    n_movies: Number of recommendations requested from the model.

  Returns:
    list of original `movieId` values, in the order returned by the model.

  Raises:
    KeyError: if `user_id` is not one of the `userId` categories.
  """
  user_cat_code = get_cat_code_from_user_id(user_id)
  # The model works on category codes, so hand it this user's row of the
  # ratings matrix.
  user_ratings = r.tocsr().getrow(user_cat_code)
  # The first element of the result holds the recommended movie codes.
  recommended_codes = model.recommend(user_cat_code, user_ratings, N=n_movies)[0]
  return [get_movie_id_from_cat_code(cat_code) for cat_code in recommended_codes]

In [24]:
# Recommended movie ids for the user whose original userId is 1.
get_recommendation_for_user(1)

[1127, 1262, 1204, 2406, 1965, 1272, 1674, 1242, 3479, 2019]

## Save the model and log it to MLFoundry

In [25]:
import mlfoundry
# Log in to MLFoundry, then create a run in the "movie-recommendation"
# project; `run` is used in the following cells to log the model and the data
# files.
mlfoundry.login()
run = mlfoundry.get_client().create_run(project_name="movie-recommendation")

API key is already configured.
Please use `mlfoundry login --relogin` or `mlfoundry.login(relogin=True)`to force relogin
[mlfoundry] 2022-09-22T13:07:25+0000 INFO No run_name given. Using a randomly generated name burgundy-emu. You can pass your own using the `run_name` argument
Link to the dashboard for the run: https://app.truefoundry.com/mlfoundry/306/4a78dc07f6874f9cb740445ea0675298/
[mlfoundry] 2022-09-22T13:07:27+0000 INFO Run 'truefoundry/srihari/movie-recommendation/burgundy-emu' has started.


In [26]:
# Log the trained `model` to the run under the name "reco-implicit".
# NOTE(review): `model` is an `implicit` ALS model but is logged with
# framework="sklearn" - presumably used as a generic serializer; confirm this
# is the intended framework value.
model_version = run.log_model(
    name="reco-implicit",
    model=model,
    framework="sklearn",
    description="model trained for movie recommendation"
)

[mlfoundry] 2022-09-22T13:07:29+0000 INFO Logging model and additional files, this might take a while ...
[mlfoundry] 2022-09-22T13:07:29+0000 INFO Serializing model files to model version contents
[mlfoundry] 2022-09-22T13:07:33+0000 INFO Packaging and uploading files to remote ...
[mlfoundry] 2022-09-22T13:07:39+0000 INFO Logged model successfully with fqn 'model:truefoundry/srihari/movie-recommendation/reco-implicit:1'


In [27]:
import os

# Log the raw data files alongside the model. The paths are resolved against
# the current working directory (where the `curl -O` cell saved the files)
# instead of a hard-coded `/content/` prefix.
run.log_artifact(os.path.abspath('ratings_small.csv'))
run.log_artifact(os.path.abspath('movies_metadata.csv'))

[mlfoundry] 2022-09-22T13:07:39+0000 INFO Logging '/content/ratings_small.csv' file as artifact to 'ratings_small.csv', this might take a while ...
[mlfoundry] 2022-09-22T13:07:41+0000 INFO Logging '/content/movies_metadata.csv' file as artifact to 'movies_metadata.csv', this might take a while ...
