In [117]:
import numpy as np
import numpy.ma as ma
# numpy.ma - Masked arrays are arrays that may have missing or invalid entries
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate # for printing tabular data in nicely formatted tables
from recsysNN_utils import *
# Render floats in pandas output with one digit after the decimal point.
pd.set_option("display.precision", 1)


In [118]:
# 9000 movies rated by 600 users
# nu - 397, nm - 847
# NOTE(review): nu / nm presumably are the number of users / movies kept in
# this reduced data set -- confirm against the data preparation step.
# Read the two CSV tables stored under ./data.
top10_df = pd.read_csv("./data/content_top10_df.csv")
bygenre_df = pd.read_csv("./data/content_bygenre_df.csv")
# Bare last expression so the notebook renders the table.
top10_df

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.1,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.0,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.1,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.9,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.2,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.8,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.1,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,4.0,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.9,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.2,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [119]:
# Render the per-genre table (bare last expression => rich display).
bygenre_df

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.4,10377
1,Adventure,234,3.4,8785
2,Animation,76,3.6,2588
3,Children,69,3.4,2472
4,Comedy,326,3.4,8911
5,Crime,139,3.5,4671
6,Documentary,13,3.8,280
7,Drama,342,3.6,10201
8,Fantasy,124,3.4,4468
9,Horror,56,3.2,1345


In [120]:
# Column offsets into the user / item matrices.
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

# Training matrices, feature names, candidate item vectors and lookup tables.
(
    item_train, user_train, y_train,
    item_features, user_features,
    item_vecs, movie_dict, user_to_genre,
) = load_data()
# print(f"item feature: {item_features[:2]}")
# print(f"item train: {item_train[:2]}")
# print(f"item vecs: {item_vecs[:2]}")

# Only the columns from the training offsets onward are fed to the networks,
# so the feature counts are the matrix widths minus those offsets.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 50884


In [121]:
# Preview the user training matrix (maxcount=5 limits the number of rows shown).
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
# Features shown in [brackets] are not used for training.

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9


In [122]:
# Preview the item (movie) training matrix (maxcount=5 limits the rows shown).
# NOTE(review): user=True is passed although this is item data -- presumably it
# only affects number formatting in the helper; confirm against recsysNN_utils.
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=True)

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8798,2004,3.8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
46970,2006,3.2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48516,2006,4.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
58559,2008,4.2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
# y_train holds the rating each user gave to the corresponding movie.
# Next, we prepare the training data.
# y_train.reshape(-1,1).shape

In [124]:
# Keep handles on the raw arrays: they are needed for the round-trip check
# below and the unscaled user matrix is used again for per-user lookups.
item_tr_unscaled = item_train
user_tr_unscaled = user_train
y_tr_unscaled = y_train

# Standardise item and user features; each scaler is kept so the same
# transform can be applied to new vectors at prediction time.
scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_train)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train)

# MinMaxScaler maps each feature into the given range; here the ratings
# (reshaped to a single column) are mapped to [-1, 1].
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train.reshape(-1, 1))

# Round-trip check: undoing the scaling must reproduce the raw arrays.
# np.allclose returns True when two arrays are element-wise equal within a tolerance.
print(np.allclose(item_tr_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_tr_unscaled, scalerUser.inverse_transform(user_train)))

True
True


In [125]:
# Split all three arrays in a single call so they share the same shuffled
# row indices (80% train / 20% test, fixed seed for reproducibility).
(
    item_train, item_test,
    user_train, user_test,
    y_train, y_test,
) = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (40707, 17)
movie/item test data shape: (10177, 17)


In [126]:
# Preview the user training matrix again; at this point it holds the scaled,
# shuffled training split rather than the raw values.
pprint_train(user_train , user_features, uvs, u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
1,0,-1.0,-0.8,-0.7,0.1,-0.0,-1.2,-0.4,0.6,-0.5,-0.5,-0.1,-0.6,-0.6,-0.7,-0.7
0,1,-0.7,-0.5,-0.7,-0.1,-0.2,-0.6,-0.2,0.7,-0.5,-0.8,0.1,-0.0,-0.6,-0.5,-0.4
-1,-1,-0.2,0.3,-0.4,0.4,0.5,1.0,0.6,-1.2,-0.3,-0.6,-2.3,-0.1,0.0,0.4,-0.0
0,-1,0.6,0.5,0.5,0.2,0.6,-0.1,0.5,-1.2,0.9,1.2,-2.3,-0.1,0.0,0.2,0.3
-1,0,0.7,0.6,0.5,0.3,0.5,0.4,0.6,1.0,0.6,0.3,0.8,0.8,0.4,0.7,0.7


In [127]:
# Two-tower model: a user network and an item network each map their feature
# vector to a `num_outputs`-dimensional embedding. Both embeddings are
# L2-normalised and combined with a dot product, so the model output is the
# cosine similarity of the two embeddings.
num_outputs = 32
tf.random.set_seed(1) # for consistent result
user_NN = tf.keras.models.Sequential([
        keras.layers.Dense(units=256, activation="relu"),
        keras.layers.Dense(units=128, activation="relu"),
        keras.layers.Dense(units=num_outputs, activation="linear"),
    ])

item_NN = tf.keras.models.Sequential([
    keras.layers.Dense(units=256, activation="relu"),
    keras.layers.Dense(units=128, activation="relu"),
    keras.layers.Dense(units=num_outputs, activation="linear"),
])

# `shape` must be a tuple: `(n)` is just the int `n`, which stricter Keras
# versions may reject, so the trailing comma matters.
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)
# tf.linalg.l2_normalize -> normalizes along dimension `axis` using an L2 norm.

# Dot product of the two unit-length embeddings along the feature axis.
output = tf.keras.layers.Dot(axes=1)([vu, vm])
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 14)]                 0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, 16)]                 0         []                            
                                                                                                  
 sequential_8 (Sequential)   (None, 32)                   40864     ['input_9[0][0]']             
                                                                                                  
 sequential_9 (Sequential)   (None, 32)                   41376     ['input_10[0][0]']            
                                                                                            

In [128]:
# Train with Adam (learning rate 0.01) against a mean-squared-error loss.
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss=cost_fn, optimizer=opt)

In [129]:
# Show (rows, columns) of the user training split.
user_train.shape

(40707, 17)

In [130]:
# Fit for 30 epochs. The leading u_s / i_s columns are sliced off because
# they are not used as training features.
tf.random.set_seed(1)
model.fit(
    x=[user_train[:, u_s:], item_train[:, i_s:]],
    y=y_train,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History object at 0x000001A630362770>

In [131]:
# Loss on the held-out split, using the same column slicing as in training.
model.evaluate(
    x=[user_test[:, u_s:], item_test[:, i_s:]],
    y=y_test,
)



0.08331505209207535

In [140]:
# prediction
# Hand-written profile of a new user: identifying / summary columns first,
# then one average rating per genre.
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 0.0

new_action = 0.0
new_adventure = 0.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 4.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 5.0
new_fantasy = 0.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0

# Order matters: [user id, rating count, rating ave] followed by the genre
# columns in the same order as the user training matrix.
user_vec = np.array([
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary, new_drama,
    new_fantasy, new_horror, new_mystery, new_romance,
    new_scifi, new_thriller,
])

In [141]:
# Build one user row per candidate movie (len(item_vecs) rows are requested).
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# Apply the scalers fitted on the training data before predicting.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Same column slicing as in training.
y_p = model.predict([suser_vecs[:,u_s:], sitem_vecs[:,i_s:]])

# Map predictions back from the scaled target range to the rating scale.
y_pu = scalerTarget.inverse_transform(y_p)

# Negate so argsort yields indices from highest to lowest predicted rating.
sorted_index = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]

# The 4th argument used to be the Python builtin `max` (a function object),
# not a row limit; pass an explicit number of rows instead.
print_pred_movies(sorted_ypu, sorted_items, movie_dict, 10)



y_p,movie id,rating ave,title,genres
4.2,53894,3.7,Sicko (2007),Documentary|Drama
3.9,5128,2.4,Queen of the Damned (2002),Fantasy|Horror
3.9,6888,2.3,Scary Movie 3 (2003),Comedy|Horror
3.9,6620,3.8,American Splendor (2003),Comedy|Drama
3.9,7323,3.8,"Good bye, Lenin! (2003)",Comedy|Drama
3.9,48394,3.8,"Pan's Labyrinth (Laberinto del fauno, El) (2006)",Drama|Fantasy|Thriller
3.9,109374,3.8,"Grand Budapest Hotel, The (2014)",Comedy|Drama
3.9,94959,3.8,Moonrise Kingdom (2012),Comedy|Drama|Romance
3.9,8366,3.9,Saved! (2004),Comedy|Drama
3.9,97921,3.7,Silver Linings Playbook (2012),Comedy|Drama


In [139]:
# Compare predicted and actual ratings for an existing user.
uid = 2

# User rows and ratings for `uid`, built from the unscaled user matrix.
user_vecs, y_vecs = get_user_vecs(uid, user_tr_unscaled, item_vecs, user_to_genre)
# Apply the scalers fitted on the training data before predicting.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Same column slicing as in training.
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# Map predictions back from the scaled target range to the rating scale.
y_pu = scalerTarget.inverse_transform(y_p)

# Negate so argsort yields indices from highest to lowest predicted rating,
# then reorder every per-movie array with the same index.
sorted_index = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
sorted_ypu = y_pu[sorted_index]
sorted_item = item_vecs[sorted_index]
sorted_user = user_vecs[sorted_index]
sorted_y = y_vecs[sorted_index]

# Pass `sorted_item` computed above. The call previously used `sorted_items`,
# a variable left over from another cell, so the printed movies did not follow
# this user's ranking.
print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user, sorted_item, ivs, uvs, movie_dict, maxcount = 50)




y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.4,5.0,2,[4.0],3.9,45728,Clerks II (2006),Comedy
4.2,3.5,2,[4.0],3.8,6296,"Mighty Wind, A (2003)",Comedy
4.2,4.0,2,[4.0],3.4,107348,Anchorman 2: The Legend Continues (2013),Comedy
4.1,4.5,2,[4.0],3.3,111113,Neighbors (2014),Comedy
4.1,4.0,2,"[4.0,4.2,4.0,0.0]",3.8,6539,Pirates of the Caribbean: The Curse of the Black Pearl (2003),Action|Adventure|Comedy|Fantasy
4.0,3.5,2,"[4.0,4.2,4.0,3.9,3.9]",3.9,48774,Children of Men (2006),Action|Adventure|Drama|Sci-Fi|Thriller
4.0,4.5,2,"[4.0,4.2,4.0,4.1]",4.0,119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime
4.0,4.5,2,[4.0],3.3,8376,Napoleon Dynamite (2004),Comedy
3.9,4.0,2,"[4.0,4.0]",3.8,109374,"Grand Budapest Hotel, The (2014)",Comedy|Drama
3.9,4.0,2,"[4.0,4.2,3.9]",3.9,34405,Serenity (2005),Action|Adventure|Sci-Fi
