In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
from keras.layers import *

In [None]:
# Fetch the MovieLens "ml-latest-small" archive through the Keras download
# helper, unpack it next to the downloaded zip, and load ratings.csv into `df`.
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    # The handle is deliberately NOT named `zip`: a notebook-level `zip`
    # variable would shadow the builtin and break any later `zip(...)` call.
    with ZipFile(movielens_zipped_file, "r") as archive:
        # Extract files
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)

Downloading data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Extracting all the files now...
Done!


In [None]:
# Despite the variable names, this is the movies table (movies.csv); its
# `genres` column is what the following cells parse.
tags_file = movielens_dir.joinpath("movies.csv")
df_tags = pd.read_csv(tags_file)

In [None]:
# Distinct genre labels in order of first appearance. Dict keys preserve
# insertion order, so this yields the same list as an append-if-absent scan
# over every '|'-separated label.
gen = list(
    dict.fromkeys(
        label
        for genre_field in df_tags['genres']
        for label in genre_field.split('|')
    )
)


In [None]:
# Display the genre vocabulary collected above; a label's position in this
# list is the column it occupies in the multi-hot genre vectors.
gen

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'War',
 'Musical',
 'Documentary',
 'IMAX',
 'Western',
 'Film-Noir',
 '(no genres listed)']

In [None]:
def vectorize(genr, vocab=None):
  """Multi-hot encode a '|'-separated genre string.

  Args:
    genr: Genre labels joined by '|', e.g. 'Comedy|Drama'.
    vocab: Ordered sequence of known labels; a label's position is its
      column in the output. Defaults to the notebook-level `gen` list.

  Returns:
    1-D float numpy array of length len(vocab) holding 1 at the position of
    every label present in `genr` and 0 elsewhere.

  Raises:
    ValueError: if `genr` contains a label that is not in `vocab`.
  """
  if vocab is None:
    # Resolved at call time, so `gen` only has to exist once the function
    # is actually called without an explicit vocabulary.
    vocab = gen
  lab = np.zeros(len(vocab))
  for g in genr.split('|'):
    lab[vocab.index(g)] = 1
  return lab
    


In [None]:
# One multi-hot genre vector per row of the movies table.
df_tags['vec'] = list(map(vectorize, df_tags['genres']))

In [None]:
# Dense 0-based codes for users and movies, numbered by first appearance in
# the ratings table, together with the inverse (code -> raw id) lookups.
user_ids = df["userId"].unique().tolist()
userencoded2user = dict(enumerate(user_ids))
user2user_encoded = {raw_id: code for code, raw_id in userencoded2user.items()}

movie_ids = df["movieId"].unique().tolist()
movie_encoded2movie = dict(enumerate(movie_ids))
movie2movie_encoded = {raw_id: code for code, raw_id in movie_encoded2movie.items()}

In [None]:
# Ratings as float32, plus the observed rating range used later to rescale
# the targets.
df["rating"] = df["rating"].values.astype(np.float32)
min_rating, max_rating = min(df["rating"]), max(df["rating"])

# Attach the dense user / movie codes as new columns.
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)
num_users, num_movies = len(user2user_encoded), len(movie_encoded2movie)

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [None]:
# Shuffle the ratings with a fixed seed, then build the model inputs.
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(
    lambda rating: (rating - min_rating) / (max_rating - min_rating)
).values

# Positional view of the genre vectors, in movies.csv row order. Do NOT index
# it with the encoded movie codes in x[:, 1]: those are numbered by first
# appearance in ratings.csv, which is a different ordering.
movie_vec = df_tags['vec'].values

# Genre vector for the movie of every row of `x`, looked up by raw movieId so
# each rating row gets its own movie's genres. A movieId missing from
# movies.csv raises KeyError instead of silently picking another movie.
vec_by_movie_id = df_tags.set_index("movieId")["vec"]
movie_enc = np.stack(list(vec_by_movie_id.loc[df["movieId"].values].values))
# Keep the (n_rows, n_genres) 2-D layout.
movie_enc = np.reshape(movie_enc, (movie_enc.shape[0], -1))

In [None]:
# Width of each row in the user and movie embedding tables built in the
# following cells.
embedding_size = 50

In [None]:
# User tower: user code -> embedding -> Dense(25, relu) -> Dense(10, sigmoid).
user_embedding = layers.Embedding(
    num_users,
    embedding_size,
    embeddings_initializer="he_normal",
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)

# `shape` must be a tuple; `(1)` is just the integer 1.
inputs = keras.Input(shape=(1,))
emb = user_embedding(inputs)
# Layers come from the same `tensorflow.keras` namespace as Embedding/Model
# rather than from the star-imported standalone `keras.layers`.
dense1 = layers.Dense(25, activation='relu')(emb)
# The output layer must consume `dense1`; wiring it to `emb` would leave the
# hidden layer outside the model graph (consistent with the genre tower).
dense2 = layers.Dense(10, activation='sigmoid')(dense1)
dense2 = layers.Flatten()(dense2)
user_embedding_model = keras.Model(inputs, dense2)



In [None]:
# Movie tower: movie code -> embedding -> Dense(25, relu) -> Dense(10, sigmoid).
movie_embedding = layers.Embedding(
    num_movies,
    embedding_size,
    embeddings_initializer="he_normal",
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)

# `shape` must be a tuple; `(1)` is just the integer 1.
inputs = keras.Input(shape=(1,))
emb = movie_embedding(inputs)
# Layers come from the same `tensorflow.keras` namespace as Embedding/Model
# rather than from the star-imported standalone `keras.layers`.
dense1 = layers.Dense(25, activation='relu')(emb)
# The output layer must consume `dense1`; wiring it to `emb` would leave the
# hidden layer outside the model graph (consistent with the genre tower).
dense2 = layers.Dense(10, activation='sigmoid')(dense1)
dense2 = layers.Flatten()(dense2)
movie_embedding_model = keras.Model(inputs, dense2)

In [None]:

# Genre tower: multi-hot genre vector -> Dense(25, relu) -> Dense(10, sigmoid).
# `shape` must be a tuple, hence the trailing comma.
inputs = keras.Input(shape=(len(gen),))
dense1 = layers.Dense(25, activation='relu')(inputs)
dense2 = layers.Dense(10, activation='sigmoid')(dense1)
genre_embedding_model = keras.Model(inputs, dense2)

In [None]:


# Shared siamese head applied to both the movie-tower and genre-tower outputs
# (each 10-wide): Dense(7, relu) -> Dense(5, sigmoid).
# `shape` must be a tuple, hence the trailing comma.
inputs = keras.Input(shape=(10,))
dense1 = layers.Dense(7, activation='relu')(inputs)
dense2 = layers.Dense(5, activation='sigmoid')(dense1)
sia_model = keras.Model(inputs, dense2)


In [None]:
def euclidean_distance(vectors):
  """Row-wise Euclidean distance between two batches of feature vectors.

  Uses `tf` ops directly so the function does not depend on the backend
  alias `K`, which is only imported in a later cell.

  Args:
    vectors: Pair `(featsA, featsB)` of tensors whose difference can be
      taken element-wise; the reduction runs over axis 1.

  Returns:
    Tensor of per-row distances with axis 1 kept as size 1 (keepdims).
  """
  # unpack the vectors into separate tensors
  featsA, featsB = vectors
  # sum of squared differences along the feature axis
  sum_squared = tf.reduce_sum(tf.square(featsA - featsB), axis=1, keepdims=True)
  # clamp to epsilon so the square root is never evaluated at exactly zero
  return tf.sqrt(tf.maximum(sum_squared, keras.backend.epsilon()))

In [None]:
import tensorflow.keras.backend as K
import tensorflow as tf
def contrastive_loss(y, preds, margin=1):
  """Contrastive loss over predicted pair distances.

  Args:
    y: Pair labels; entries equal to 1 contribute the squared distance,
      entries equal to 0 contribute the squared margin shortfall.
    preds: Predicted distances.
    margin: Distance beyond which a 0-labelled pair adds no loss.

  Returns:
    Scalar tensor: the mean of the per-element loss terms.
  """
  # Align the label dtype with the predictions before mixing them.
  labels = tf.cast(y, preds.dtype)
  pull_term = tf.square(preds)
  push_term = tf.square(tf.maximum(margin - preds, 0))
  return tf.reduce_mean(labels * pull_term + (1 - labels) * push_term)

In [None]:


# Siamese model: distance between a movie's embedding and a genre vector's
# embedding after both pass through the shared `sia_model` head.
# `shape` must be a tuple; the genre width follows the vocabulary size
# instead of a hard-coded 20.
input_m = keras.Input(shape=(1,))
input_g = keras.Input(shape=(len(gen),))
me = movie_embedding_model(input_m)
ge = genre_embedding_model(input_g)

m_emb = sia_model(me)
g_emb = sia_model(ge)

distance = layers.Lambda(euclidean_distance)([m_emb, g_emb])
model = keras.Model(inputs=[input_m, input_g], outputs=distance)


In [None]:
# Compile the siamese model with the contrastive loss and a default Adam.
model.compile(
    loss=contrastive_loss,
    optimizer=tf.keras.optimizers.Adam(),
)

In [None]:

# Rating model: sigmoid of the dot product between the movie-tower and
# user-tower outputs, one score per (movie, user) example.
# `shape` must be a tuple; `(1)` is just the integer 1.
inputs_m = keras.Input(shape=(1,))
inputs_u = keras.Input(shape=(1,))

emb_m = movie_embedding_model(inputs_m)
emb_u = user_embedding_model(inputs_u)

# Per-example dot product over the feature axis -> shape (batch,).
# tf.tensordot(emb_m, emb_u, 2) would contract BOTH axes and collapse the
# whole batch into a single scalar.
d = layers.Lambda(
    lambda pair: tf.reduce_sum(pair[0] * pair[1], axis=1)
)([emb_m, emb_u])
# NOTE(review): both towers end in a sigmoid, so the dot product is positive
# and this output is always above 0.5 -- confirm that suits targets in [0, 1].
d = layers.Activation("sigmoid")(d)

rec_model = keras.Model((inputs_m, inputs_u), d)

In [None]:
# Compile the rating model with binary cross-entropy and a default Adam.
rec_model.compile(
    loss='bce',
    optimizer=tf.keras.optimizers.Adam(),
)

In [None]:
# Hold out the last 10% of the (already shuffled) rows for validation.
train_indices = int(0.9 * df.shape[0])
x_train, x_val = x[:train_indices], x[train_indices:]
y_train, y_val = y[:train_indices], y[train_indices:]

In [None]:
  # Build one contrastive mini-batch of (movie code, genre vector) pairs.
  # The row indices are drawn here so the cell runs on a fresh kernel instead
  # of relying on batch1/batch2 left over from a later cell.
  batch = np.random.default_rng(42).permutation(len(x_train))
  batch1 = batch[:8]
  batch2 = batch[10:18]
  x1 = x_train[batch1][:,1]
  y1 = movie_enc[batch1]     # each movie paired with its own genres
  x2 = x_train[batch1][:,1]
  y2 = movie_enc[batch2]     # same movies paired with other rows' genres
  x3 = np.concatenate((x1,x2))
  y3 = np.concatenate((y1,y2), axis=0)
  # 1 = matched pair, 0 = mismatched pair (contrastive_loss pulls 1-labelled
  # pairs together and pushes 0-labelled pairs apart). Stored under its own
  # name so the rating targets in `y` are not overwritten.
  pair_labels = np.concatenate((np.ones(len(batch1)),np.zeros(len(batch2))))

In [None]:
# Optimizer used by the custom training loop.
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [None]:
# Joint training of the rating model (rec_model) and the movie/genre siamese
# model (model) on the sum of their losses.
# Both models contain movie_embedding_model, so each trainable variable is
# collected once (by identity); otherwise the shared weights would be stepped
# twice per iteration.
joint_weights = []
seen_weight_ids = set()
for weight in list(rec_model.trainable_weights) + list(model.trainable_weights):
  if id(weight) not in seen_weight_ids:
    seen_weight_ids.add(id(weight))
    joint_weights.append(weight)

for i in range(10):
  # Two disjoint groups of 8 random training rows.
  batch = np.arange(len(x_train))
  np.random.shuffle(batch)
  batch1 = batch[:8]
  batch2 = batch[10:18]
  xt1 = x_train[batch1]
  yt1 = y_train[batch1]

  # Contrastive pairs: each batch1 movie with its own genre vector (matched)
  # and with the genre vector of a batch2 row (mismatched).
  x1 = xt1[:,1]
  y1 = movie_enc[batch1]
  x2 = xt1[:,1]
  y2 = movie_enc[batch2]
  x3 = np.concatenate((x1,x2))
  y3 = np.concatenate((y1,y2), axis=0)
  # 1 = matched, 0 = mismatched. Shaped (n, 1) to line up with the (n, 1)
  # distances returned by `model` instead of broadcasting to (n, n).
  pair_labels = np.concatenate(
      (np.ones(len(batch1)), np.zeros(len(batch2)))
  ).reshape(-1, 1)

  with tf.GradientTape() as tape:
    out_rec = rec_model((xt1[:,1], xt1[:,0]))
    # Flatten the predictions and match dtypes before the squared error.
    rating_pred = tf.reshape(out_rec, [-1])
    rating_true = tf.cast(yt1, rating_pred.dtype)
    loss_value_rec = tf.reduce_mean(tf.square(rating_true - rating_pred))
    # Feed ALL movie codes (x3), one per genre vector in y3.
    out_mod = model([x3, y3])
    loss_con = contrastive_loss(pair_labels, out_mod)
    total_loss = loss_value_rec + loss_con

  # A non-persistent tape can be queried only once, so take a single gradient
  # over the combined variable list, outside the recording context.
  grads = tape.gradient(total_loss, joint_weights)
  # Pair by index so this cell does not depend on the builtin name `zip`
  # being unshadowed in the notebook namespace.
  opt.apply_gradients(
      [(grads[k], joint_weights[k]) for k in range(len(joint_weights))]
  )

In [None]:
def _print_movies(frame):
    """Print one 'title : genres' line per row of a movies frame."""
    for movie in frame.itertuples():
        print(movie.title, ":", movie.genres)


movie_df = pd.read_csv(movielens_dir / "movies.csv")

# Pick a random user and score every movie that user has not rated.
user_id = df["userId"].sample(1).iloc[0]
movies_watched_by_user = df.loc[df["userId"] == user_id]

seen_mask = movie_df["movieId"].isin(movies_watched_by_user["movieId"].values)
unseen_ids = movie_df.loc[~seen_mask, "movieId"]
# Keep only movies that have an encoded index (i.e. appear in the ratings).
unseen_known_ids = list(
    set(unseen_ids).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [
    [movie2movie_encoded.get(movie_id)] for movie_id in unseen_known_ids
]

user_encoder = user2user_encoded.get(user_id)
user_column = [[user_encoder]] * len(movies_not_watched)
# Column 0 = user code, column 1 = movie code.
user_movie_array = np.hstack((user_column, movies_not_watched))

movie_codes, user_codes = user_movie_array[:, 1], user_movie_array[:, 0]
ratings = rec_model.predict((movie_codes, user_codes)).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[idx][0])
    for idx in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)["movieId"]
    .values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
_print_movies(movie_df_rows)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
_print_movies(recommended_movies)

Showing recommendations for user: 173
Movies with high ratings from user
--------------------------------
Sense and Sensibility (1995) : Drama|Romance
Jeffrey (1995) : Comedy|Drama
Forrest Gump (1994) : Comedy|Drama|Romance|War
Four Weddings and a Funeral (1994) : Comedy|Romance
Schindler's List (1993) : Drama|War
--------------------------------
Top 10 movie recommendations
--------------------------------
City Hall (1996) : Drama|Thriller
Bottle Rocket (1996) : Adventure|Comedy|Crime|Romance
Mr. Wrong (1996) : Comedy
Unforgettable (1996) : Mystery|Sci-Fi|Thriller
Happy Gilmore (1996) : Comedy
Bridges of Madison County, The (1995) : Drama|Romance
Nobody Loves Me (Keiner liebt mich) (1994) : Comedy|Drama
Muppet Treasure Island (1996) : Adventure|Children|Comedy|Musical
Catwalk (1996) : Documentary
Die Hard: With a Vengeance (1995) : Action|Crime|Thriller


In [None]:
# Genre-only query: score every movie the embedding knows against a single
# genre vector. The all-zero vector selects no genre; set entries to 1
# (positions follow `gen`) to query specific genres.
genres = np.zeros((1, len(gen)))

# Feed encoded movie codes 0..num_movies-1. The embedding table has
# num_movies rows, so raw movieId values are not valid row indices.
movie_codes = np.arange(num_movies)
movie_emb = np.asarray(movie_embedding_model(movie_codes.reshape(-1, 1)))
genre_emb = np.asarray(genre_embedding_model(genres))

# Dot product of every movie vector with the single genre vector.
# NOTE(review): the siamese model is built on the Euclidean distance between
# `sia_model` outputs, while this ranks raw tower outputs by dot product --
# confirm this is the intended score.
rank = movie_emb @ genre_emb[0]
top_ratings_indices = rank.argsort()[-10:][::-1]
# rank[i] belongs to encoded movie i, so decode it directly rather than
# through the unrelated `movies_not_watched` list.
recommended_movie_ids = [
    movie_encoded2movie[int(code)] for code in top_ratings_indices
]

# NOTE(review): the ranking above does not depend on `user_id`; the header
# below is kept unchanged but may be misleading.
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)

print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

Showing recommendations for user: 173
Top 10 movie recommendations
--------------------------------
Smoke (1995) : Comedy|Drama
Monty Python's Life of Brian (1979) : Comedy
Air Bud (1997) : Children|Comedy
Mouse Hunt (1997) : Children|Comedy
House II: The Second Story (1987) : Comedy|Fantasy|Horror
Married to the Mob (1988) : Comedy
Diner (1982) : Comedy|Drama
Sunshine (1999) : Drama
Jason X (2002) : Horror|Sci-Fi|Thriller
Start the Revolution Without Me (1970) : Comedy
