In [23]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import numpy.ma as ma
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Lambda, Concatenate, Add
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error
import pickle
import tabulate
from utility import *
from IPython.display import Math

2025-11-20 14:34:01.968254: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-20 14:34:02.785240: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-20 14:34:08.437606: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


## Data Preprocessing and Features engineerings

### load data

In [114]:
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")


In [132]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,3273,5.0,964983536
1,1,3578,5.0,964980668
2,1,3617,4.0,964980683
3,1,3744,4.0,964980694
4,1,3793,5.0,964981855
...,...,...,...,...
39221,610,166534,4.0,1493848402
39222,610,168248,5.0,1493850091
39223,610,168250,5.0,1494273047
39224,610,168252,5.0,1493846352


In [74]:
movies

Unnamed: 0,movieId,title,genres,year
0,2769,"Yards, The (2000)",Crime|Drama,2000.0
1,3177,Next Friday (2000),Comedy,2000.0
2,3190,Supernova (2000),Adventure|Sci-Fi|Thriller,2000.0
3,3225,Down to You (2000),Comedy|Romance,2000.0
4,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000.0
...,...,...,...,...
4759,193579,Jon Stewart Has Left the Building (2015),Documentary,2015.0
4760,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,2017.0
4761,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,2017.0
4762,193585,Flint (2017),Drama,2017.0


In [None]:
user_to_movies = {}
for row in ratings.itertuples():
    movieId = row.movieId
    userId = row.userId
    rating = row.rating
    if userId not in user_to_movies:
        movies ={}
        movie = {movieId:rating}
        movies["movies"] = movie
        user_to_movies[userId] = movies
    else:
        user_to_movies[userId]["movies"][movieId] = rating
with open("user_to_movies.pkl", "wb") as f:
    pickle.dump(user_to_movies, f)

In [37]:
# 1. Identify all movies with "(no genres listed)"
#    These movies have no usable genre information, so we remove them from both
#    the movies  and the ratings dataframe.
no_genre_movies = movies[movies["genres"] == "(no genres listed)"]
to_remove = no_genre_movies["movieId"].tolist()

# Create masks to filter out ratings and movies that correspond to those IDs
movies_mask = movies["movieId"].isin(to_remove)
ratings_mask = ratings["movieId"].isin(to_remove)
movies = movies[~movies_mask].copy()
ratings = ratings[~ratings_mask].copy()

# 2. Remove the token "IMAX" from genres.
#    IMAX is a projection format, not a real genre, so we strip it out.
has_imax = movies[movies["genres"].str.contains("IMAX")]
imax_id = has_imax["movieId"].tolist()
movies["genres"] = movies["genres"].str.replace("|IMAX", "", regex=False)
movies.to_csv("movies.csv", index=False)
ratings.to_csv("ratings.csv", index=False)

In [38]:
# one hot encoding of genres
genre_dummies = movies["genres"].str.get_dummies(sep="|")
genre_cols = genre_dummies.columns.tolist()
# concatinate one hot encoded to movies
movies = pd.concat([movies[["movieId", "year"]] , genre_dummies], axis=1)

In [39]:
#compute AVG rating per movie and merge it with movies (movieId, year, one hot genres) to create movies features
movie_average = ratings.groupby("movieId")["rating"].mean().round(2).reset_index().rename(columns={"rating":"ave_rating"})
movies_features = movies.merge(movie_average, on="movieId", how="left").fillna(0)
desired_order = ["movieId", "year", "ave_rating"] + genre_cols
movies_features = movies_features[desired_order]


In [41]:
# compute num of ratings and AVG rating per user
user_stats = ratings.groupby("userId")["rating"].agg(["count", "mean"]).round(2).reset_index()
user_stats.rename(columns={"count":"rating_count", "mean":"rating_ave"}, inplace=True)


In [42]:
# merge ratigs with movies to compute user AVR rating per genre
merged = ratings.merge(movies, on="movieId", how="left")

#compute user AVG rating per genre
user_genre_averages = merged.groupby("userId")[genre_cols + ["rating"]].apply(
    lambda df: pd.Series({
        g: df.loc[df[g] == 1, "rating"].mean().round(2) if (df[g] == 1).any() else 0
        for g in genre_cols
    })
).reset_index()


In [43]:
# merge user_stat and user AVG rating per genre to create user features
user_features = user_stats.merge(user_genre_averages, on="userId", how="left").fillna(0)


In [44]:
user_features

Unnamed: 0,userId,rating_count,rating_ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.00,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
1,2,26,3.96,3.95,4.17,0.00,0.00,4.00,3.89,4.33,...,0.00,0.00,3.00,0.00,4.00,0.00,3.88,3.70,4.50,3.50
2,3,3,0.50,0.50,0.50,0.00,0.50,0.50,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.50,0.50,0.00,0.00
3,4,33,3.39,3.50,3.50,0.00,4.00,3.18,4.00,0.00,...,4.00,0.00,4.00,1.00,2.00,2.57,0.00,3.60,4.33,0.00
4,7,96,2.97,2.92,3.00,3.20,2.95,3.02,3.03,0.00,...,2.92,3.50,2.75,3.67,2.89,2.38,2.68,3.19,3.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,605,109,3.08,3.05,3.19,3.28,3.11,2.87,2.70,3.00,...,3.13,0.00,2.70,3.40,4.00,3.24,2.92,2.60,2.60,3.00
439,606,317,3.60,3.15,3.53,3.73,3.63,3.40,3.78,3.75,...,3.54,3.75,2.69,3.88,3.63,3.65,3.15,3.35,3.72,0.00
440,607,11,3.00,2.50,3.00,0.00,0.00,2.60,0.00,0.00,...,0.00,0.00,3.00,0.00,4.00,2.75,2.33,4.50,0.00,0.00
441,608,280,3.69,3.92,3.77,3.62,3.50,3.40,4.01,3.00,...,3.85,5.00,3.87,3.21,3.76,3.46,3.88,3.96,4.17,0.00


In [45]:
movies_features

Unnamed: 0,movieId,year,ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,2769,2000.0,3.40,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3177,2000.0,3.25,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3190,2000.0,2.00,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,3225,2000.0,2.00,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,3273,2000.0,2.67,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,193579,2015.0,3.50,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4760,193581,2017.0,4.00,1,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4761,193583,2017.0,3.50,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4762,193585,2017.0,3.50,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# merge user features with ratings and movies features to create the data for training and testing the model
df = ratings.merge(user_features, on="userId", suffixes=("", "_user"))
df = df.merge(movies_features, on="movieId", suffixes=("_user", "_movie"))
df.to_csv("user_movie_features.csv", index=False)

In [48]:
df

Unnamed: 0,userId,movieId,rating,timestamp,rating_count,rating_ave,Action_user,Adventure_user,Animation_user,Children_user,...,Fantasy_movie,Film-Noir_movie,Horror_movie,Musical_movie,Mystery_movie,Romance_movie,Sci-Fi_movie,Thriller_movie,War_movie,Western_movie
0,1,3273,5.0,964983536,5,4.60,4.67,5.00,0.00,0.00,...,0,0,1,0,1,0,0,1,0,0
1,1,3578,5.0,964980668,5,4.60,4.67,5.00,0.00,0.00,...,0,0,0,0,0,0,0,0,0,0
2,1,3617,4.0,964980683,5,4.60,4.67,5.00,0.00,0.00,...,0,0,0,0,0,0,0,0,0,0
3,1,3744,4.0,964980694,5,4.60,4.67,5.00,0.00,0.00,...,0,0,0,0,0,0,0,1,0,0
4,1,3793,5.0,964981855,5,4.60,4.67,5.00,0.00,0.00,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39221,610,166534,4.0,1493848402,1013,3.56,3.42,3.57,3.84,3.59,...,0,0,1,0,0,0,0,1,0,0
39222,610,168248,5.0,1493850091,1013,3.56,3.42,3.57,3.84,3.59,...,0,0,0,0,0,0,0,1,0,0
39223,610,168250,5.0,1494273047,1013,3.56,3.42,3.57,3.84,3.59,...,0,0,1,0,0,0,0,0,0,0
39224,610,168252,5.0,1493846352,1013,3.56,3.42,3.57,3.84,3.59,...,0,0,0,0,0,0,1,0,0,0


In [49]:
# select the columns for movies X train
user_cols = [col + "_user" for col in genre_cols]
X_user_cols = ["userId", "rating_count", "rating_ave"] + user_cols

# select the columns for movies X train
movie_cols = [col + "_movie" for col in genre_cols]
X_movie_cols = ["movieId", "year", "ave_rating"] + movie_cols
# users vectors
X_users = df[X_user_cols].copy()
X_users["rating_ave"] = X_users["rating_ave"].round(2)

# movies vectors
X_movies = df[X_movie_cols].copy()

# remove "_user" from user genres and "_movie" from movies genres they were added while merging
user_columns_map = {col: col.replace("_user", "") for col in user_cols}
movie_columns_map = {col:col.replace("_movie", "") for col in movie_cols}
X_users = X_users.rename(columns=user_columns_map)
X_movies = X_movies.rename(columns=movie_columns_map)


In [56]:
# save all the processed data, so that next we don't have to through all the process above
X_movies.to_csv("X_movies.csv", index=False)
X_users.to_csv("X_users.csv", index=False)
movies_features.to_csv("movie_features.csv", index=False)
user_features.to_csv("user_features.csv", index=False)

Y= df["rating"].values
np.save("Y.npy", Y)
np.save("X_u.npy", X_users.values)
np.save("X_m.npy", X_movies.values)

In [52]:
X_users

Unnamed: 0,userId,rating_count,rating_ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.0,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
1,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.0,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
2,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.0,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
3,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.0,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
4,1,5,4.60,4.67,5.00,0.00,0.00,4.50,4.00,0.0,...,0.00,0.00,5.00,0.00,5.00,0.00,5.00,4.50,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39221,610,1013,3.56,3.42,3.57,3.84,3.59,3.58,3.66,4.2,...,3.43,4.29,3.39,3.95,3.68,3.68,3.51,3.45,3.63,3.59
39222,610,1013,3.56,3.42,3.57,3.84,3.59,3.58,3.66,4.2,...,3.43,4.29,3.39,3.95,3.68,3.68,3.51,3.45,3.63,3.59
39223,610,1013,3.56,3.42,3.57,3.84,3.59,3.58,3.66,4.2,...,3.43,4.29,3.39,3.95,3.68,3.68,3.51,3.45,3.63,3.59
39224,610,1013,3.56,3.42,3.57,3.84,3.59,3.58,3.66,4.2,...,3.43,4.29,3.39,3.95,3.68,3.68,3.51,3.45,3.63,3.59


In [41]:
X_movies

Unnamed: 0,movieId,year,ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,3273,2000.0,2.67,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,3578,2000.0,3.94,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3617,2000.0,3.09,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3744,2000.0,2.66,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,3793,2000.0,3.70,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39221,166534,2017.0,3.33,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
39222,168248,2017.0,4.14,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
39223,168250,2017.0,3.63,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
39224,168252,2017.0,4.28,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## BUILD AND TRAIN THE MODEL

In [5]:
# we load the processed data
user_vecs_df = pd.read_csv("user_features.csv")
movie_vecs_df = pd.read_csv("movie_features.csv")
movies_lists = pd.read_csv("movies.csv")
users_vecs = user_vecs_df.values
movie_vecs = movie_vecs_df.values

In [6]:
movies = pd.read_csv("movies.csv")

with open("user_to_movies.pkl", "rb") as f:
    user_to_movies = pickle.load(f)

In [7]:
Y = np.load("Y.npy")
X_u = np.load("X_u.npy")
X_m = np.load("X_m.npy")


In [8]:
user_train, user_temp, movie_train, movie_temp, Y_train, Y_temp = train_test_split(X_u, X_m, Y, test_size=0.2, shuffle=True, random_state=42)

user_test, user_cv, movie_test, movie_cv, Y_test, Y_cv = train_test_split(user_temp, movie_temp, Y_temp, test_size=0.5, shuffle=True, random_state=42)

In [9]:
print(f"User Train shape {user_train.shape}, movie Train shape {movie_train.shape}, target length {len(Y_train)}")
print(f"User test shape {user_test.shape}, movie Test shape {movie_test.shape}, target length {len(Y_test)}")
print(f"User CV shape {user_cv.shape}, movie Cv shape {movie_cv.shape}, target length {len(Y_cv)}")



User Train shape (31380, 21), movie Train shape (31380, 21), target length 31380
User test shape (3923, 21), movie Test shape (3923, 21), target length 3923
User CV shape (3923, 21), movie Cv shape (3923, 21), target length 3923


In [10]:
# scale training data

user_unscaled = user_train
movie_unscaled  = movie_train
unscaled_target = Y_train


userScaler  = StandardScaler()
userScaler.fit(user_train)
user_train = userScaler.transform(user_train)

movieScaler = StandardScaler()
movieScaler.fit(movie_train)
movie_train = movieScaler.transform(movie_train)

targetScaler = MinMaxScaler((-1, 1))
targetScaler.fit(Y_train.reshape(-1, 1))
Y_train = targetScaler.transform(Y_train.reshape(-1, 1))

In [11]:
u_s = 3 # where user data start on training, removing columns like userId
m_s = 1 # Where movie data start removing movieId
uvs = 3 # user genres start
ivs = 3 # movie genre start


In [12]:
# Build the model using the keras functinal Api
outputs = 32
num_user_features = 18
num_movie_features = 20


user_NN = Sequential([
        Input(shape=(num_user_features,)),
        Dense(256, activation="relu"),
        Dense(128, activation="relu"),
        Dense(outputs, kernel_regularizer=tf.keras.regularizers.l2(1e-4))
])

movie_NN = Sequential([
    Input(shape=(num_movie_features,)),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(outputs, kernel_regularizer=tf.keras.regularizers.l2(1e-4))

])
    
user_input = Input(shape=(num_user_features,))
vu = user_NN(user_input)
vu = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

movie_input = Input(shape=(num_movie_features, ))
vm = movie_NN(movie_input)
vm = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

output = tf.keras.layers.Dot(axes=1)([vu, vm]) 

model = tf.keras.Model([user_input, movie_input], output)

model.summary()

In [14]:
tf.random.set_seed(1)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss = tf.keras.losses.MeanSquaredError()
)

In [15]:
tf.random.set_seed(1)
model.fit([user_train[:, u_s:], movie_train[:, m_s:]], Y_train, epochs=50)


Epoch 1/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - loss: 0.1275
Epoch 2/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.1163
Epoch 3/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1120
Epoch 4/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1093
Epoch 5/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1072
Epoch 6/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1056
Epoch 7/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1042
Epoch 8/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1030
Epoch 9/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.1020
Epoch 10/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - l

<keras.src.callbacks.history.History at 0x79a7b45e6ea0>

In [16]:
# scale the test and cross validation datasets
unscaled_test_target = Y_test
unscaled_cv_target = Y_cv

user_test = userScaler.transform(user_test)
user_cv = userScaler.transform(user_cv)

movie_test = movieScaler.transform(movie_test)
movie_cv = movieScaler.transform(movie_cv)

Y_test = targetScaler.transform(Y_test.reshape(-1, 1))
Y_cv = targetScaler.transform(Y_cv.reshape(-1, 1))


In [17]:
model.evaluate([user_cv[:, u_s:], movie_cv[:, m_s:]], Y_cv)


[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1234


0.12335417419672012

In [18]:
model.evaluate([user_test[:, u_s:], movie_test[:, m_s:]], Y_test)

[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.1225


0.12247933447360992

In [19]:
train_pred = model.predict([user_train[:, u_s:], movie_train[:, m_s:]])
train_pred_pu = targetScaler.inverse_transform(train_pred)

mse = mean_squared_error(unscaled_target, train_pred_pu)

print(F"Mse on training Data : {mse}")

[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step
Mse on training Data : 0.4117745644035565


In [20]:
# mse on CV and test data
cv_pred = model.predict([user_cv[:, u_s:], movie_cv[:, m_s:]])
cv_pred_pu = targetScaler.inverse_transform(cv_pred)
cv_mse = mean_squared_error(unscaled_cv_target, cv_pred_pu)

test_pred = model.predict([user_test[:, u_s:], movie_test[:, m_s:]])
test_pred_pu = targetScaler.inverse_transform(test_pred)
test_mse = mean_squared_error(unscaled_test_target, test_pred_pu)

print(f"Cross validation mse :{cv_mse}, test mse {test_mse}")

[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Cross validation mse :0.6097486816634949, test mse 0.6053201231771124


## Prediction For New User

In [21]:
new_id = 611
new_count = 4
rating_ave = 0
new_action = 0
new_adventure = 0
new_animation = 4.5
new_children = 5
new_comedy = 4
new_crime = 0
new_documentary = 0
new_drama = 0
new_fantasy = 0
new_film_noir = 0
new_horror = 0
new_musical = 0
new_mystery = 0
new_romance = 0
new_sci = 0
new_thriller =0
new_war = 0
western = 0

user_vec = np.array([[new_id, new_count, rating_ave, new_action, new_adventure, new_animation, new_children, new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_film_noir, new_horror, new_musical, new_mystery, new_romance, new_sci, new_thriller, 
                      new_war, western]])



In [27]:
# duplicate user vector to matrix match the num of movies
user_vecs = generate_user_vecs(user_vec, len(movie_vecs))

scaled_movie_vecs = movieScaler.transform(movie_vecs)
scaled_user_vecs = userScaler.transform(user_vecs)
pred = model.predict([user_vecs[:, u_s:], scaled_movie_vecs[:, m_s:]])
y = targetScaler.inverse_transform(pred)
sorted_index = np.argsort(-y, axis=0).reshape(-1).tolist()
sorted_y = y[sorted_index]
sorted_movies = movie_vecs[sorted_index]

predict_movies(sorted_y, sorted_movies, movies, 10)

[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


y_p,movie id,rating ave,title,genres
4.7,168418,3.8,The Boss Baby (2017),Animation|Children|Comedy
4.7,167036,3.75,Sing (2016),Animation|Children|Comedy
4.7,84944,3.62,Rango (2011),Action|Adventure|Animation|Children|Comedy|Western
4.6,79091,3.68,Despicable Me (2010),Animation|Children|Comedy|Crime
4.6,193573,4.0,Love Live! The School Idol Movie (2015),Animation
4.6,163639,4.0,DC Super Hero Girls: Hero of the Year (2016),Animation
4.6,74791,3.75,"Town Called Panic, A (Panique au village) (2009)",Animation
4.6,57522,3.5,First Sunday (2008),Comedy|Crime
4.6,69122,3.63,"Hangover, The (2009)",Comedy|Crime
4.6,164200,4.0,Storks (2016),Animation|Children|Comedy


## Predict for existing user

In [29]:
#get user vector and predictions history
user_vecs, real_y = get_user_vecs(1, users_vecs, movie_vecs, user_to_movies)
real_y = real_y.reshape(-1, 1)
#scale
user_scaled_vecs = userScaler.transform(user_vecs)
movie_scaled_vecs = movieScaler.transform(movie_vecs)

# prediction
y_p = model.predict([user_scaled_vecs[:, u_s:], movie_scaled_vecs[:, m_s:]])
y_pu = targetScaler.inverse_transform(y_p)

sorted_indexes = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
sorted_ypu = y_pu[sorted_indexes]
sorted_movies_vecs = movie_vecs[sorted_indexes]
sorted_user = user_vecs[sorted_indexes]
sorted_y = real_y[sorted_indexes]
print_existing_user_pred(np.round(sorted_ypu, 2), sorted_y, sorted_user, sorted_movies_vecs, uvs, ivs, movies, 10)

[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.15,5,1,"[4.7,5.0,5.0]",3.94,3578,Gladiator (2000),Action|Adventure|Drama
4.05,5,1,"[4.5,5.0,5.0,4.5]",2.67,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller
4.01,4,1,[4.5],3.09,3617,Road Trip (2000),Comedy
3.91,5,1,"[4.7,5.0,5.0]",3.7,3793,X-Men (2000),Action|Adventure|Sci-Fi
3.74,4,1,"[4.7,4.0,4.5]",2.66,3744,Shaft (2000),Action|Crime|Thriller


## Simalarities Between Items

'''
- We build a model that outputs the movie embedding vector \(v_m\).

- Later, we compute the full movie–movie distance matrix using the squared Euclidean distance:

$$
\mathrm{dist}[i,j] = \| v_m^{(i)} - v_m^{(j)} \|^2
$$

- This allows us to find, for each movie, the closest movie in the embedding space.
'''

In [30]:
input_movies = Input(shape=(num_movie_features,))
vm_m = movie_NN(input_movies)
vm_m = Lambda(lambda x : tf.linalg.l2_normalize(x, axis=1))(vm_m)
model_m = tf.keras.Model(input_movies, vm_m)
model_m.summary()

In [31]:
scaled_movies_vecs = movieScaler.transform(movie_vecs)
vms = model_m.predict(scaled_movies_vecs[:, m_s:])
dim = len(vms)
dist = np.zeros((dim, dim))
for i in range(dim):
    for j in range(dim):
        dist[i, j] = sq_dist(vms[i, :], vms[j, :])


[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step



 We select 20 movies, and for each movie we identify its most similar movie.

In [32]:
count = 20

m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0])) 
disp = [["movie1", "genre", "movie2", "genre"]]

for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1Id = movie_vecs[i, 0]
    movie2Id = movie_vecs[min_idx, 0]
    movie1 = movies[movies["movieId"] == movie1Id]
    movie2 = movies[movies["movieId"] == movie2Id]

    disp.append([movie1["title"].iloc[0], movie1["genres"].iloc[0], movie2["title"].iloc[0], movie2["genres"].iloc[0]])

table = tabulate.tabulate(disp, tablefmt="html", headers="firstrow")
table

movie1,genre,movie2,genre.1
"Yards, The (2000)",Crime|Drama,Heist (2001),Crime|Drama
Next Friday (2000),Comedy,Pootie Tang (2001),Comedy
Supernova (2000),Adventure|Sci-Fi|Thriller,"Deep End, The (2001)",Drama
Down to You (2000),Comedy|Romance,Boys and Girls (2000),Comedy|Romance
Scream 3 (2000),Comedy|Horror|Mystery|Thriller,Hide and Seek (2005),Horror|Mystery|Thriller
"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller,Paid in Full (2002),Action|Drama
Gun Shy (2000),Comedy,Screwed (2000),Comedy
"Beach, The (2000)",Adventure|Drama,In This World (2002),Adventure|Drama
Snow Day (2000),Comedy,Nutty Professor II: The Klumps (2000),Comedy
"Tigger Movie, The (2000)",Animation|Children,"Road to El Dorado, The (2000)",Animation|Children
