In [26]:
import sys
import numpy as np
import pandas as pd
import pickle

sys.path.append('../')

from config.paths import RAW_DATA_PATH, ARTIFACTS_PATH, MODELS_PATH

from utils.files_management import load_model, load_netflix_data
from utils.data_processing import convert_columns_to_string, filter_unseen


In [27]:
# Read the qualifying (test) set; it is loaded with has_ratings=False.
# save_path is defined here but not used by any of the cells shown in this section.
save_path = RAW_DATA_PATH / "qualifying_df.csv"
data_path = RAW_DATA_PATH / "qualifying.txt"
df = load_netflix_data(file_path=data_path, has_ratings=False, verbose=True)

[INFO] Loaded 2,817,131 rows from /Users/robertogarces/data-science/projects/netflix/data/raw/qualifying.txt
   movie_id  customer_id       date
0         1      1046323 2005-12-19
1         1      1080030 2005-12-23
2         1      1830096 2005-03-14
3         1       368059 2005-05-26
4         1       802003 2005-11-07


In [28]:
# Movie metadata: a ';'-separated, latin1-encoded file without a header row.
# Column names are supplied explicitly.
movie_titles_path = RAW_DATA_PATH / "movie_titles_fixed.csv"
movie_titles = pd.read_csv(
    movie_titles_path,
    sep=';',
    encoding='latin1',
    header=None,
    names=['id', 'year', 'title'],
)

In [30]:
# Cast both identifier columns to strings before filtering and prediction
id_columns = ['customer_id', 'movie_id']
df = convert_columns_to_string(df, id_columns)

In [31]:
# Load the pickled collections of user and movie IDs considered valid.
# NOTE: pickle.load executes arbitrary code from the file; only use trusted artifacts.
with open(ARTIFACTS_PATH / "valid_users.pkl", "rb") as users_file:
    valid_users = pickle.load(users_file)

with open(ARTIFACTS_PATH / "valid_movies.pkl", "rb") as movies_file:
    valid_movies = pickle.load(movies_file)

In [32]:
# Keep only rows whose user and movie are both in the valid sets loaded above
df = filter_unseen(
    df,
    valid_users=valid_users,
    valid_movies=valid_movies,
)

[INFO] Dropped 43966 rows with unknown users or movies.


# Predictions

In [33]:
# Load the persisted SVD model used for the predictions below
model_path = MODELS_PATH / "svd_model.pkl"
model = load_model(model_path)

In [34]:
# Predict a rating for every (customer, movie) pair in the filtered qualifying set.
#
# The two ID columns are iterated directly with zip() instead of DataFrame.iterrows():
# iterrows() materialises a full Series for every row, which is extremely slow on a
# frame with millions of rows, while the result (one estimate per row, in row order)
# is the same.
#
# IDs are passed through str() because, per the original author's note, surprise
# requires string ids.
predictions = [
    model.predict(uid, iid).est  # .est is the predicted rating value
    for uid, iid in zip(map(str, df['customer_id']), map(str, df['movie_id']))
]

df["pred_rating"] = predictions

In [40]:
# Cast the title table's id to str so it can be matched against the string movie_id column
movie_titles = movie_titles.assign(id=movie_titles['id'].astype(str))

In [41]:
# Left join: keep every prediction row and attach the movie's year and title where available
final_df = df.merge(
    movie_titles,
    how='left',
    left_on='movie_id',
    right_on='id',
)

In [42]:
# Display the merged frame: predictions joined with movie year and title
final_df

Unnamed: 0,movie_id,customer_id,date,pred_rating,id,year,title
0,1,1046323,2005-12-19,3.775027,1,2003.0,Dinosaur Planet
1,1,1080030,2005-12-23,3.681262,1,2003.0,Dinosaur Planet
2,1,1830096,2005-03-14,4.019554,1,2003.0,Dinosaur Planet
3,1,368059,2005-05-26,3.765078,1,2003.0,Dinosaur Planet
4,1,802003,2005-11-07,3.765078,1,2003.0,Dinosaur Planet
...,...,...,...,...,...,...,...
2773160,9998,1521720,2005-12-07,3.451368,9998,1995.0,The Show
2773161,9998,1363704,2005-10-01,3.689404,9998,1995.0,The Show
2773162,9999,2153561,2005-11-15,3.389409,9999,1986.0,Breeders
2773163,9999,1490137,2005-12-22,3.796265,9999,1986.0,Breeders
