In [23]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
import tabulate
import csv
from numpy import genfromtxt

In [2]:
x_train_user = pd.read_csv("csv/x_train_user.csv")
x_train_item = pd.read_csv("csv/x_train_item.csv")
y_train = pd.read_csv("csv/y_train.csv")

In [24]:
movie_dict = defaultdict(dict)
count = 0
with open('csv/movie_list.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in reader:
            if count == 0:
                count += 1  #skip header
                #print(line) print
            else:
                count += 1
                movie_id = int(line[0])
                movie_dict[movie_id]["title"] = line[1]
                movie_dict[movie_id]["genres"] = line[2]

In [3]:
print(f'train_user shape:{x_train_user.shape}')
print(f'train_item shape:{x_train_item.shape}')
print(f'y_train shape:{y_train.shape}')

train_user shape:(66657, 22)
train_item shape:(66657, 22)
y_train shape:(66657, 1)


In [4]:
# scale training data
unscaled_train_item = x_train_item
unscaled_train_user = x_train_user
y_train_unscaled    = y_train

scalerItem = StandardScaler()#scalerItem = StandardScaler()
scalerItem.fit(x_train_item)#scalerItem.fit(item_train)
x_train_item = scalerItem.transform(x_train_item)#item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()#scalerUser = StandardScaler()
scalerUser.fit(x_train_user)#scalerUser.fit(user_train)
x_train_user = scalerUser.transform(x_train_user)#user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))#scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.values.reshape(-1, 1))#scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.values.reshape(-1, 1))#y_train = scalerTarget.transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

print(np.allclose(unscaled_train_item, scalerItem.inverse_transform(x_train_item)))
print(np.allclose(unscaled_train_user, scalerUser.inverse_transform(x_train_user)))

True
True


In [5]:
x_train_item, item_test = train_test_split(x_train_item, train_size=0.80, shuffle=True, random_state=1)
x_train_user, user_test = train_test_split(x_train_user, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {x_train_item.shape}")
print(f"movie/item test data shape: {item_test.shape}")
print(f"movie/user training data shape: {x_train_user.shape}")
print(f"movie/user test data shape: {user_test.shape}")

movie/item training data shape: (53325, 22)
movie/item test data shape: (13332, 22)
movie/user training data shape: (53325, 22)
movie/user test data shape: (13332, 22)


In [6]:
num_user_features = x_train_user.shape[1] - 3
num_item_features = x_train_item.shape[1] - 1
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

In [7]:
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
  
    ### END CODE HERE ###  
])

item_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
    
  
  
    ### END CODE HERE ###  
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           42144       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42656       ['input_2[0][0]']                
                                                                                              

In [8]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [9]:
tf.random.set_seed(1)
model.fit([x_train_user[:, u_s:], x_train_item[:, i_s:]], y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1b370e2bdf0>

In [11]:
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)



0.16437526047229767

In [27]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 5.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 4.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 0.0
new_filmnoir = 0.0 ##
new_horror = 0.0
new_imax = 0.0 ##
new_musical = 0.0 ##
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_western = 5.0 ##
new_war = 0.0 ##
new_rating_count = 3
#user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,***film-noir**Horror,**imax**,***musical***Mystery,Romance,Sci-Fi,Thriller***war***,***western***

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_filmnoir, new_horror, new_imax, new_musical, new_mystery,
                      new_romance, new_scifi, new_thriller, new_western, new_war]])

In [17]:
def gen_user_vecs(user_vec, num_items):
    """ given a user vector return:
        user predict maxtrix to match the size of item_vecs """
    user_vecs = np.tile(user_vec, (num_items, 1))
    return user_vecs

In [18]:
item_vecs = genfromtxt('./csv/item_vecs.csv', delimiter=',')

In [20]:
def print_pred_movies(y_p, item, movie_dict, maxcount=10):
    """ print results of prediction of a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        disp.append([np.around(y_p[i, 0], 1), item[i, 0].astype(int), np.around(item[i, 2].astype(float), 1),
                     movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
    return table

In [28]:
# generate and replicate the user vector to match the number movies in the data set.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# make a prediction
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 10)







y_p,movie id,rating ave,title,genres
2.8,1262,4.1,"Great Escape, The (1963)",Action|Adventure|Drama|War
2.8,594,3.6,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
2.8,1250,4.1,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War
2.8,2410,2.7,Rocky III (1982),Action|Drama
2.8,1385,2.9,Under Siege (1992),Action|Drama|Thriller
2.8,2411,2.7,Rocky IV (1985),Action|Drama
2.8,1204,4.3,Lawrence of Arabia (1962),Adventure|Drama|War
2.8,2944,4.0,"Dirty Dozen, The (1967)",Action|Drama|War
2.8,2018,3.4,Bambi (1942),Animation|Children|Drama
2.8,2409,3.2,Rocky II (1979),Action|Drama


In [None]:
np.shape(scalerItem.inverse_transform(x_train_item))

(100835, 21)

In [None]:
np.shape(scalerUser.inverse_transform(x_train_user))

(100835, 19)

In [None]:
np.shape(uncaled_train_item)

(100835, 21)

In [None]:
np.shape(unclaed_train_user)

(100835, 19)