In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
import tabulate
import csv
import pickle
from numpy import genfromtxt
from recsysNN_utils import *

In [2]:
# Read the user features, item features and targets, in that order.
x_user, x_item, y = (
    pd.read_csv(csv_name) for csv_name in ("x_user.csv", "x_item.csv", "y.csv")
)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file that comes from a trusted source.
with open('umr_dict.pickle', 'rb') as pickle_file:
    umr_dict = pickle.load(pickle_file)


In [3]:
# Build a lookup keyed by integer movie id. Each entry is a dict holding the
# "title" (CSV column 1) and "genres" (CSV column 2) strings of that movie.
movie_dict = defaultdict(dict)
count = 0  # rows read so far, header included; stays 0 for an empty file
with open('csv/small/movie_list.csv', newline='', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for count, line in enumerate(reader, start=1):
        # Row 1 is the header. csv.reader yields [] for a blank line, which
        # would otherwise fail on line[0] below.
        if count == 1 or not line:
            continue
        movie_id = int(line[0])
        movie_dict[movie_id]["title"] = line[1]
        movie_dict[movie_id]["genres"] = line[2]

In [4]:
# Report the dimensions of each loaded table.
for label, frame in (("train_user", x_user), ("train_item", x_item), ("y_train", y)):
    print(f'{label} shape:{frame.shape}')

train_user shape:(5630130, 17)
train_item shape:(5630130, 17)
y_train shape:(5630130, 1)


In [5]:
# Keep references to the raw inputs; the names x_item / x_user are rebound to
# their scaled versions below.
unscaled_train_item = x_item
unscaled_train_user = x_user
y_train_unscaled    = y

# fit() returns the fitted estimator, so each scaler is created and fitted in
# a single expression.
# NOTE(review): the scalers are fitted on the full data set, i.e. before the
# train/test split performed later in this notebook — confirm this is intended.
scalerItem = StandardScaler().fit(x_item)
scalerUser = StandardScaler().fit(x_user)
scalerTarget = MinMaxScaler((-1, 1)).fit(y.values.reshape(-1, 1))

# Apply the fitted scalers. Targets are mapped into the range [-1, 1].
x_item = scalerItem.transform(x_item)
x_user = scalerUser.transform(x_user)
y_train = scalerTarget.transform(y.values.reshape(-1, 1))

# Round-trip check: undoing the scaling must give back the raw features.
print(np.allclose(unscaled_train_item, scalerItem.inverse_transform(x_item)))
print(np.allclose(unscaled_train_user, scalerUser.inverse_transform(x_user)))

True
True


In [6]:
# Split item features, user features and targets in ONE call so that all three
# share exactly the same shuffled row order. With the same random_state this
# yields the same partitions as three separate calls on equal-length inputs,
# but it no longer relies on that coincidence: train_test_split raises a
# ValueError if the inputs differ in length (for example when this cell is
# re-run after y_train has already been replaced by its 80% subset).
(x_train_item, item_test,
 x_train_user, user_test,
 y_train, y_test) = train_test_split(
    x_item, x_user, y_train, train_size=0.80, shuffle=True, random_state=1
)
print(f"movie/item training data shape: {x_train_item.shape}")
print(f"movie/item test data shape: {item_test.shape}")
print(f"movie/user training data shape: {x_train_user.shape}")
print(f"movie/user test data shape: {user_test.shape}")

movie/item training data shape: (4504104, 17)
movie/item test data shape: (1126026, 17)
movie/user training data shape: (4504104, 17)
movie/user test data shape: (1126026, 17)


In [7]:
# Column offsets into the user / item feature arrays.
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start

# Width of each model input: every column from the training start offset on.
num_user_features = x_train_user.shape[1] - u_s
num_item_features = x_train_item.shape[1] - i_s

In [8]:
# Restore the previously saved model object.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load model files that come from a trusted source.
with open('nn_model_23_05.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [9]:
#model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

In [87]:
# Profile of a hypothetical new user.
new_user_id = 5000
new_rating_count = 1000
new_rating_ave = 5.0

# Genres rated 5.0 by this user ...
new_adventure = new_childrens = new_fantasy = 5.0
# ... and every other genre rated 0.0.
new_action = new_animation = new_comedy = new_crime = 0.0
new_documentary = new_drama = new_horror = new_mystery = 0.0
new_romance = new_scifi = new_thriller = 0.0

# Single-row user vector. Column order:
#   user id, rating count, rating ave, Action, Adventure, Animation, Children,
#   Comedy, Crime, Documentary, Drama, Fantasy, Horror, Mystery, Romance,
#   Sci-Fi, Thriller
# The original note marks film-noir, imax, musical, war and western with
# asterisks; none of them has an entry in this vector.
user_vec = np.array([[
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror, new_mystery,
    new_romance, new_scifi, new_thriller,
]])

In [88]:
# Load the item (movie) feature vectors from a comma-separated file.
item_vecs = np.genfromtxt('item_vecs.csv', delimiter=',')

In [89]:
# Replicate the new user's vector so there is one copy per movie in item_vecs.
user_vecs = gen_user_vecs(user_vec, len(item_vecs))

# Scale both inputs with the scalers fitted on the training data.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict on the training columns only (from u_s / i_s onward), then map the
# predictions back to the original rating scale.
model_inputs = [suser_vecs[:, u_s:], sitem_vecs[:, i_s:]]
y_p = model.predict(model_inputs)
y_pu = scalerTarget.inverse_transform(y_p)

# Rank movies by predicted rating, best first (negating turns argsort's
# ascending order into descending).
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  # unscaled vectors, for display

print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 500)

 1/27 [>.............................] - ETA: 1s





y_p,movie id,rating ave,title,genres
4.2,1376,3.5,Star Trek IV: The Voyage Home (1986),Adventure|Comedy|Sci-Fi
4.2,1909,3.4,"X-Files: Fight the Future, The (1998)",Action|Crime|Mystery|Sci-Fi|Thriller
4.1,329,3.4,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi
4.0,41566,3.4,"Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)",Adventure|Children|Fantasy
4.0,1393,3.6,Jerry Maguire (1996),Drama|Romance
4.0,8636,3.8,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
4.0,4321,3.4,City Slickers (1991),Comedy|Western
4.0,31685,3.2,Hitch (2005),Comedy|Romance
4.0,1372,3.3,Star Trek VI: The Undiscovered Country (1991),Action|Mystery|Sci-Fi
3.9,733,3.6,"Rock, The (1996)",Action|Adventure|Thriller


In [16]:
uid = 5
# Build the set of user vectors for this user (the same vector, repeated)
# together with the matching y values.
user_vecs, y_vecs = get_user_vecs(uid, unscaled_train_user.to_numpy(), item_vecs, umr_dict)

# Scale both inputs with the scalers fitted on the training data.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict on the training columns only (from u_s / i_s onward), then map the
# predictions back to the original rating scale.
model_inputs = [suser_vecs[:, u_s:], sitem_vecs[:, i_s:]]
y_p = model.predict(model_inputs)
y_pu = scalerTarget.inverse_transform(y_p)

# Rank by predicted rating, best first, and reorder every array the same way.
# The unscaled item / user vectors are used for display.
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu, sorted_items, sorted_user, sorted_y = (
    rows[sorted_index] for rows in (y_pu, item_vecs, user_vecs, y_vecs)
)

# Show the sorted predictions for the movies rated by the user.
print_existing_user(sorted_ypu, sorted_y.reshape(-1, 1), sorted_user, sorted_items,
                    ivs, uvs, movie_dict, maxcount = 50)





y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.7,5.0,5,"[4.1,4.0,3.8,4.2]",3.6,594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
4.3,5.0,5,"[4.1,4.0,4.2]",3.5,596,Pinocchio (1940),Animation|Children|Fantasy|Musical
4.2,5.0,5,"[3.5,3.8,3.0]",4.0,58,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance
4.1,4.0,5,"[3.6,4.0,3.7]",4.2,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
4.1,4.0,5,"[4.0,3.8]",3.7,34,Babe (1995),Children|Drama
4.1,5.0,5,[3.8],4.2,527,Schindler's List (1993),Drama|War
4.1,4.0,5,"[3.5,3.6,3.7]",3.5,21,Get Shorty (1995),Comedy|Crime|Thriller
4.0,5.0,5,"[3.5,3.6,3.8,3.7]",4.2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4.0,4.0,5,"[3.6,3.8]",3.8,36,Dead Man Walking (1995),Crime|Drama
3.9,3.0,5,"[3.5,3.6,3.8,3.7]",4.1,608,Fargo (1996),Comedy|Crime|Drama|Thriller
