In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Multiply, Dense, Input
from tensorflow.keras.models import Model

In [2]:
# Detect GPUs and switch each one to on-demand memory growth so TensorFlow
# does not claim all GPU memory up front.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found!")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU is available and will be used!")
    except RuntimeError as e:
        # Reported instead of re-raised so the rest of the notebook still runs.
        print(e)



GPU is available and will be used!


In [3]:
# Load the user table. "::" is a multi-character separator, hence the Python
# parsing engine; the file has no header row, so column names are supplied.
df = pd.read_csv(
    "users.dat",
    sep="::",
    engine="python",
    header=None,
    names=["UserID", "Gender", "Age", "Occupation", "Zip_code"],
)

In [4]:
df.head(10)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [5]:
# Distinct user ids as a plain Python list (order of first appearance).
unique_user_ids = pd.unique(df["UserID"]).tolist()

In [6]:
# Sort the ids ascending, in place; the per-user loops below iterate in this order.
unique_user_ids.sort()

In [7]:
unique_user_ids[0]

1

In [8]:
len(unique_user_ids)

6040

In [9]:
# Load the movie catalogue. Same "::"-separated, header-less layout as the
# user file; read with ISO-8859-1 encoding.
df1 = pd.read_csv(
    "movies.dat",
    sep="::",
    engine="python",
    header=None,
    names=["MovieID", "Title", "Genres"],
    encoding="ISO-8859-1",
)

In [10]:
df1.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# Distinct movie ids from the catalogue as a plain Python list.
unique_movie_ids = pd.unique(df1["MovieID"]).tolist()

In [12]:
len(unique_movie_ids)

3883

In [13]:
# Load the explicit ratings: one (user, movie, rating, timestamp) row per rating.
ratings = pd.read_csv(
    "ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

In [14]:
ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [15]:
# Set of every observed (UserID, MovieID) pair.
# NOTE(review): in the cells visible here this set is only inspected with
# type(); the negative sampler recomputes rated movies itself — confirm
# whether it is still needed.
user_movie_set = set(zip(ratings["UserID"], ratings["MovieID"]))

In [16]:
type(user_movie_set)

set

In [17]:
# Negative sampling: for every user, draw movies the user never rated and
# label them 0, up to `num_neg_per_pos` negatives per observed rating
# (capped by the number of unrated movies).
neg_samples = []
num_neg_per_pos = 4

# Hoisted out of the loop: the catalogue set is identical for every user, and a
# single groupby pass replaces two full scans of `ratings` per user.
all_movie_ids = set(unique_movie_ids)
rated_by_user = {user: movies for user, movies in ratings.groupby("UserID")["MovieID"]}

for user in unique_user_ids:
    user_movies = rated_by_user.get(user)
    # Number of rating rows for this user (0 when the user never rated anything).
    num_pos = 0 if user_movies is None else len(user_movies)
    rated_movies = set() if user_movies is None else set(user_movies)

    unrated_movies = list(all_movie_ids - rated_movies)

    # NOTE: this uses NumPy's global RNG and no seed is set in the visible
    # cells, so the sampled negatives differ from run to run.
    sampled_movies = np.random.choice(
        unrated_movies,
        size=min(num_neg_per_pos * num_pos, len(unrated_movies)),
        replace=False
    )
    # Rating 0 = negative; Timestamp -1 marks a synthetic row with no real time.
    neg_samples.extend((user, movie, 0, -1) for movie in sampled_movies)

neg_df = pd.DataFrame(neg_samples, columns=["UserID", "MovieID", "Rating", "Timestamp"])

In [18]:
# Convert explicit feedback to implicit: every observed rating becomes a
# positive label (1). This overwrites the original star ratings in place.
ratings["Rating"] = 1


In [19]:
# Stack positives and sampled negatives, then shuffle all rows and renumber.
final_df = (
    pd.concat([ratings, neg_df])
    .sample(frac=1)
    .reset_index(drop=True)
)


In [20]:
final_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,5482,1339,1,959870443
1,3754,1450,0,-1
2,3211,1094,1,968547788
3,2010,111,0,-1
4,962,3681,0,-1
...,...,...,...,...
4868910,3989,1277,0,-1
4868911,1483,3629,0,-1
4868912,3706,41,0,-1
4868913,185,2995,0,-1


In [21]:
# Order rows by user, then by time. Synthetic negatives carry Timestamp -1 and
# therefore come before each user's real ratings.
final_df = (
    final_df
    .sort_values(by=["UserID", "Timestamp"])
    .reset_index(drop=True)
)


In [22]:
final_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,806,0,-1
1,1,228,0,-1
2,1,1298,0,-1
3,1,3185,0,-1
4,1,2655,0,-1
...,...,...,...,...
4868910,6040,2917,1,997454429
4868911,6040,1784,1,997454464
4868912,6040,1921,1,997454464
4868913,6040,161,1,997454486


In [23]:
final_df.to_csv("final_df.csv", index=False)

In [24]:
final_df.shape

(4868915, 4)

In [25]:
# Leave-one-out split per user: the last positive row goes to the test set, the
# remaining positives go to training, and the sampled negatives are divided
# 80/20 between training and test.
training_data = []
testing_data = []

known_users = set(unique_user_ids)

# One groupby pass (keys in ascending order) instead of a boolean scan of the
# whole frame for every user.
for user, user_data in final_df.groupby("UserID"):
    if user not in known_users:
        continue

    unrated = user_data[user_data["Rating"] == 0]
    rated = user_data[user_data["Rating"] == 1]

    split_index = int(0.8 * len(unrated))
    training_data.extend(unrated.iloc[:split_index].values.tolist())
    testing_data.extend(unrated.iloc[split_index:].values.tolist())

    # Guarded so a user without positives never contributes a stale held-out
    # sample left over from the previous iteration.
    if not rated.empty:
        # With final_df sorted by (UserID, Timestamp) the last row is the
        # user's most recent rating.
        testing_data.append(rated.iloc[-1].values.tolist())
        training_data.extend(rated.iloc[:-1].values.tolist())

train_df = pd.DataFrame(training_data, columns=["UserID", "MovieID", "Rating", "Timestamp"])
test_df = pd.DataFrame(testing_data, columns=["UserID", "MovieID", "Rating", "Timestamp"])

In [26]:
train_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,806,0,-1
1,1,228,0,-1
2,1,1298,0,-1
3,1,3185,0,-1
4,1,2655,0,-1
...,...,...,...,...
4086751,6040,232,1,997454398
4086752,6040,2917,1,997454429
4086753,6040,1784,1,997454464
4086754,6040,1921,1,997454464


In [27]:
test_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,524,0,-1
1,1,2376,0,-1
2,1,3560,0,-1
3,1,1412,0,-1
4,1,3241,0,-1
...,...,...,...,...
782154,6040,492,0,-1
782155,6040,2239,0,-1
782156,6040,3045,0,-1
782157,6040,430,0,-1


## Building GMF


In [28]:
# Vocabulary sizes used as `input_dim` of the embedding layers: the number of
# distinct users / movies present in final_df.
num_users = final_df["UserID"].nunique()
num_movies = final_df["MovieID"].nunique()


In [29]:
num_users, num_movies

(6040, 3883)

In [30]:
# Map raw ids to contiguous 0-based indices (order of first appearance in
# final_df) so they can be used as embedding row numbers.
user_id_map = {
    raw_user_id: position
    for position, raw_user_id in enumerate(final_df["UserID"].unique())
}
movie_id_map = {
    raw_movie_id: position
    for position, raw_movie_id in enumerate(final_df["MovieID"].unique())
}

In [31]:

# Replace raw ids with their embedding indices, in place.
# WARNING: this cell is not idempotent. Running it a second time maps the
# already-encoded indices through the raw-id dictionaries again, which shifts
# them or produces NaN for keys that are not present. Re-run the split cell first.
train_df["UserID"] = train_df["UserID"].map(user_id_map)
train_df["MovieID"] = train_df["MovieID"].map(movie_id_map)
test_df["UserID"] = test_df["UserID"].map(user_id_map)
test_df["MovieID"] = test_df["MovieID"].map(movie_id_map)


In [32]:
# Pull the encoded id columns and the labels out as NumPy arrays for Keras.
train_users, train_movies, train_ratings = (
    np.array(train_df[column]) for column in ("UserID", "MovieID", "Rating")
)

test_users, test_movies, test_ratings = (
    np.array(test_df[column]) for column in ("UserID", "MovieID", "Rating")
)

In [33]:
train_users

array([   0,    0,    0, ..., 6039, 6039, 6039])

In [34]:
# GMF inputs and embedding tables. The two Embedding layer objects are kept in
# module-level variables because build_nmf() reuses them later.
embedding_dim = 32

user_input = Input(shape=(1,), dtype=tf.int32, name="user_input")
movie_input = Input(shape=(1,), dtype=tf.int32, name="movie_input")

# Weight matrix of shape (num_users, embedding_dim).
user_embedding_gmf = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    name="user_embedding",
)
# Weight matrix of shape (num_movies, embedding_dim).
movie_embedding_gmf = Embedding(
    input_dim=num_movies,
    output_dim=embedding_dim,
    name="movie_embedding",
)

user_emb = user_embedding_gmf(user_input)
movie_emb = movie_embedding_gmf(movie_input)

In [35]:

# Element-wise product of the user and movie embeddings (the GMF interaction).
gmf_vector = Multiply()([user_emb, movie_emb])


# Drop the length-1 sequence axis introduced by the (1,)-shaped inputs.
gmf_vector = tf.keras.layers.Flatten()(gmf_vector)



In [36]:
# Single sigmoid unit producing the interaction probability. The layer is left
# unnamed; a later cell retrieves it through its auto-generated name "dense".
output = Dense(1, activation="sigmoid")(gmf_vector)

In [37]:
# Assemble the GMF model from the two id inputs and the sigmoid output.
gmf_model = Model(inputs=[user_input, movie_input], outputs=output)

# Binary labels (1 = rated, 0 = sampled negative), hence binary cross-entropy.
gmf_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

gmf_model.summary()

In [38]:
# Train GMF. Note that the held-out test arrays are also used as the
# validation set here.
gmf_model.fit([train_users, train_movies], train_ratings,
              epochs=10, batch_size=1024, validation_data=([test_users, test_movies], test_ratings))

Epoch 1/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.7941 - loss: 0.4942 - val_accuracy: 0.9264 - val_loss: 0.1929
Epoch 2/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.8615 - loss: 0.3076 - val_accuracy: 0.9269 - val_loss: 0.1811
Epoch 3/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.8756 - loss: 0.2806 - val_accuracy: 0.9289 - val_loss: 0.1740
Epoch 4/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.8845 - loss: 0.2636 - val_accuracy: 0.9292 - val_loss: 0.1719
Epoch 5/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8899 - loss: 0.2525 - val_accuracy: 0.9272 - val_loss: 0.1746
Epoch 6/10
[1m3991/3991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.8941 - loss: 0.2441 - val_accuracy: 0.9275 - val_loss: 0.1734
Epoch 7/10

<keras.src.callbacks.history.History at 0x7870ed60e910>

In [39]:
# Score every test (user, movie) pair and show the first ten probabilities.
predictions = gmf_model.predict([test_users, test_movies])
print("Sample Predictions:", predictions[:10])

[1m24443/24443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 1ms/step
Sample Predictions: [[1.7799313e-01]
 [3.2653817e-04]
 [7.9143730e-10]
 [5.3063435e-05]
 [2.3436828e-02]
 [9.1748917e-03]
 [4.3824023e-01]
 [4.8170050e-09]
 [1.4164901e-01]
 [6.1426260e-02]]


In [40]:
# Fetch the GMF output layer by position rather than by its auto-generated
# name ("dense"): Keras numbers unnamed layers per session, so the name changes
# as soon as the Dense layer is re-created, while the output layer is always
# the last layer of gmf_model.
dense_layer = gmf_model.layers[-1]

# Kernel and bias of the GMF sigmoid head, reused to initialise NeuMF.
weights_gmf, biases_gmf = dense_layer.get_weights()


In [41]:
user_input.shape

(None, 1)

## MLP Implementation


In [42]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

In [43]:
import itertools
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten



def build_model(num_layers, num_neurons, learning_rate):
    """Build and compile the MLP recommender.

    User and movie ids are embedded (32 dimensions each), concatenated, passed
    through `num_layers` ReLU Dense layers of `num_neurons` units, flattened
    and mapped to one sigmoid output. Embedding sizes come from the
    module-level `num_users` and `num_movies`.

    Args:
        num_layers: number of hidden Dense layers.
        num_neurons: units in every hidden Dense layer.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras Model taking [user ids, movie ids] and trained with
        binary cross-entropy, tracking accuracy and MSE.
    """
    embedding_dim = 32

    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    movie_input = Input(shape=(1,), dtype='int32', name='movie_input')

    user_vector = Embedding(
        input_dim=num_users, output_dim=embedding_dim, name="user_embedding"
    )(user_input)
    movie_vector = Embedding(
        input_dim=num_movies, output_dim=embedding_dim, name="movie_embedding"
    )(movie_input)

    hidden = Concatenate()([user_vector, movie_vector])

    # Stack of identical hidden layers.
    remaining = num_layers
    while remaining > 0:
        hidden = Dense(num_neurons, activation='relu')(hidden)
        remaining -= 1

    # Remove the length-1 sequence axis before the output head.
    hidden = Flatten()(hidden)
    output = Dense(1, activation='sigmoid', name="output")(hidden)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model = Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy", "mse"],
    )
    return model

# Grid search over the MLP hyperparameters; the candidate with the lowest
# final-epoch validation loss is kept in `best_model` / `best_params`.
num_layers_candidates = [2,3]
num_neurons_candidates = [32]
learning_rate_candidates = [0.001]


best_val_loss = float('inf')
best_params = None
# Reset explicitly so a model left over from an earlier run of this cell can
# never be reported as the winner.
best_model = None
results = []


candidate_grid = itertools.product(
    num_layers_candidates, num_neurons_candidates, learning_rate_candidates
)
for num_layers, num_neurons, learning_rate in candidate_grid:
    print(f"Training model with {num_layers} layers, {num_neurons} neurons per layer, lr={learning_rate}")

    model = build_model(num_layers, num_neurons, learning_rate)

    # The test arrays serve as the validation set used for model selection.
    history = model.fit([train_users, train_movies], train_ratings,
                        epochs=3,
                        batch_size=1024,
                        validation_data=([test_users, test_movies], test_ratings),
                        verbose=0)

    val_loss = history.history['val_loss'][-1]
    print(f"Validation loss: {val_loss:.4f}")

    results.append((num_layers, num_neurons, learning_rate, val_loss))

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_params = (num_layers, num_neurons, learning_rate)
        best_model = model


if best_model is None:
    # Happens only when no candidate produced a loss below infinity (e.g. NaN).
    print("\nNo candidate produced a usable validation loss; nothing to report.")
else:
    print("\nBest Hyperparameters:")
    print(f"Number of layers: {best_params[0]}")
    print(f"Neurons per layer: {best_params[1]}")
    print(f"Learning rate: {best_params[2]}")
    print(f"Best validation loss: {best_val_loss:.4f}")

    best_model.summary()


Training model with 2 layers, 32 neurons per layer, lr=0.001
Validation loss: 0.1893
Training model with 3 layers, 32 neurons per layer, lr=0.001
Validation loss: 0.1800

Best Hyperparameters:
Number of layers: 3
Neurons per layer: 32
Learning rate: 0.001
Best validation loss: 0.1800


In [44]:
# Extract the trained embedding tables from the best MLP model.
user_embedding_layer = best_model.get_layer("user_embedding")
user_embedding_matrix = user_embedding_layer.get_weights()[0]
print("User Embedding Shape:", user_embedding_matrix.shape)  # (num_users, embedding_dim)

movie_embedding_layer = best_model.get_layer("movie_embedding")
movie_embedding_matrix = movie_embedding_layer.get_weights()[0]
print("Movie Embedding Shape:", movie_embedding_matrix.shape)  # (num_movies, embedding_dim)


User Embedding Shape: (6040, 32)
Movie Embedding Shape: (3883, 32)


In [45]:
# Embedding layers for the MLP branch of NeuMF, initialised from the matrices
# extracted from the best MLP model and left trainable for fine-tuning.
# NOTE(review): support for the `weights=` constructor argument differs
# between Keras versions — confirm it is honoured by the installed one.
user_embedding_mlp_layer = Embedding(
    input_dim=num_users,
    output_dim=32,
    weights=[user_embedding_matrix],
    trainable=True
)

movie_embedding_mlp_layer = Embedding(
    input_dim=num_movies,
    output_dim=32,
    weights=[movie_embedding_matrix],
    trainable=True
)


In [46]:
# Rebinds `dense_layer` (previously the GMF output layer) to the MLP model's
# named "output" layer.
dense_layer = best_model.get_layer("output")

# Kernel and bias of the MLP sigmoid head, reused to initialise NeuMF.
weights_mlp, biases_mlp = dense_layer.get_weights()

In [47]:
best_model.layers

[<InputLayer name=user_input, built=True>,
 <InputLayer name=movie_input, built=True>,
 <Embedding name=user_embedding, built=True>,
 <Embedding name=movie_embedding, built=True>,
 <Concatenate name=concatenate_1, built=True>,
 <Dense name=dense_3, built=True>,
 <Dense name=dense_4, built=True>,
 <Dense name=dense_5, built=True>,
 <Flatten name=flatten_2, built=True>,
 <Dense name=output, built=True>]

In [48]:
type(best_model.layers)

list

## NeuMF

In [49]:
from tensorflow.keras.optimizers import SGD

In [50]:
def build_nmf(alpha, l_rate):
    """Build and compile the NeuMF model that fuses the GMF and MLP branches.

    The GMF branch reuses the module-level `user_embedding_gmf` /
    `movie_embedding_gmf` layers; the MLP branch reuses
    `user_embedding_mlp_layer` / `movie_embedding_mlp_layer`, and its hidden
    Dense layers are initialised from the hidden Dense layers of `best_model`.
    The final sigmoid layer starts from the two pretrained output heads,
    blended as [alpha * weights_gmf ; (1 - alpha) * weights_mlp].

    NOTE(review): the embedding layer objects are shared by every model this
    function returns, so training one NeuMF model also changes the starting
    embeddings of the next one — confirm this is intended for the grid search.

    Args:
        alpha: weight given to the pretrained GMF head; the MLP head gets
            (1 - alpha).
        l_rate: learning rate for the SGD optimizer.

    Returns:
        A compiled Keras Model taking [user ids, movie ids].
    """
    user_input = Input(shape=(1,), dtype='int32', name='user_input_final')
    movie_input = Input(shape=(1,), dtype='int32', name='movie_input_final')

    # --- GMF branch: element-wise product of the two embeddings.
    user_embedding_neumf_gmf = user_embedding_gmf(user_input)
    movie_embedding_neumf_gmf = movie_embedding_gmf(movie_input)
    gmf_vector = Multiply()([user_embedding_neumf_gmf, movie_embedding_neumf_gmf])
    gmf_vector = tf.keras.layers.Flatten()(gmf_vector)

    # --- MLP branch. The movie embedding must be looked up with the movie ids
    # (it was previously fed the user ids).
    user_embedding_neumf_mlp = user_embedding_mlp_layer(user_input)
    movie_embedding_neumf_mlp = movie_embedding_mlp_layer(movie_input)
    mlp_vector = Concatenate()([user_embedding_neumf_mlp, movie_embedding_neumf_mlp])

    # Start the hidden layers from the pretrained MLP so they match the
    # pretrained output weights blended below.
    pretrained_hidden_layers = [
        layer for layer in best_model.layers
        if isinstance(layer, Dense) and layer.name != "output"
    ]
    for pretrained_layer in pretrained_hidden_layers:
        hidden_layer = Dense(pretrained_layer.units, activation='relu')
        mlp_vector = hidden_layer(mlp_vector)  # calling the layer creates its weights
        hidden_layer.set_weights(pretrained_layer.get_weights())
    mlp_vector = Flatten()(mlp_vector)

    neumf_vector = Concatenate()([gmf_vector, mlp_vector])

    # Blend the two pretrained heads; the row order matches the concatenation
    # order above (GMF features first, MLP features second).
    combined_weights = np.concatenate([alpha * weights_gmf, (1 - alpha) * weights_mlp], axis=0)
    combined_bias = alpha * biases_gmf + (1 - alpha) * biases_mlp

    output_layer = Dense(1, activation='sigmoid')
    output_layer.build((None, neumf_vector.shape[1]))  # create weights so they can be set
    output_layer.set_weights([combined_weights, combined_bias])

    output = output_layer(neumf_vector)
    model = Model(inputs=[user_input, movie_input], outputs=output)
    sgd_optimizer = SGD(learning_rate=l_rate)
    model.compile(optimizer=sgd_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model





In [51]:
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import load_model


In [52]:
import numpy as np

def hit_ratio_at_k(model, test_users, test_movies, test_ratings, k=10):
    """Compute the hit ratio at k (HR@k) over the users in the test set.

    For each user, all of that user's test rows are ranked by the model's
    predicted score. The user counts as a hit when their held-out positive
    (first row with a rating above 0) is among the k highest-scored rows.

    Args:
        model: object exposing `predict([users, movies], batch_size=...,
            verbose=...)` and returning one score per input row.
        test_users: 1-D sequence of user indices, one per test row.
        test_movies: 1-D sequence of movie indices aligned with `test_users`.
        test_ratings: 1-D sequence of labels aligned with `test_users`
            (positive > 0, negative 0).
        k: size of the recommendation list.

    Returns:
        Fraction of users (those with at least one positive row) whose
        held-out item is ranked in the top k; 0.0 when there are none.
    """
    test_users = np.asarray(test_users)
    test_movies = np.asarray(test_movies)
    test_ratings = np.asarray(test_ratings)
    if test_users.size == 0:
        return 0.0

    # Score every row in one call instead of one predict() call per user.
    scores = np.asarray(
        model.predict([test_users, test_movies], batch_size=4096, verbose=0)
    ).reshape(-1)

    # Row positions belonging to each user, in original order.
    rows_by_user = {}
    for row, user in enumerate(test_users.tolist()):
        rows_by_user.setdefault(user, []).append(row)

    hits = 0
    evaluated_users = 0
    for user_rows in rows_by_user.values():
        user_rows = np.asarray(user_rows)
        positive_rows = user_rows[test_ratings[user_rows] > 0]
        if positive_rows.size == 0:
            # No held-out positive to look for; the user cannot be evaluated.
            continue
        evaluated_users += 1

        held_out_movie = test_movies[positive_rows[0]]
        top_k_rows = user_rows[np.argsort(-scores[user_rows], kind="stable")[:k]]
        if held_out_movie in test_movies[top_k_rows]:
            hits += 1

    if evaluated_users == 0:
        return 0.0
    # Normalise by users, not by test rows: each user contributes one hit at most.
    return hits / evaluated_users

    # hits = 0
    # mp = {}

    # for i in range(len(test_users)):
    #   mp[test_users[i]] = []

    # for i in range(len(test_users)):
    #   mp[test_users[i]].append((test_movies[i],test_ratings[i]))

    # for user in mp.keys():
    #   mp[user] = sorted(mp[user], key=lambda x: x[1], reverse=True) # sort in according to descending order of rating

    # for user in mp.keys():
    #   user_input = np.array([user] * len(mp[user]))
    #   print("User Input")
    #   print(user_input)

    #   movie_input = np.array([x[0] for x in mp[user]])
    #   print("Movie Input")
    #   print(movie_input)

    #   predictions = model.predict([user_input, movie_input], batch_size=4096).flatten()
    #   print("Predictions")
    #   print(predictions)

    #   top_k_indices = np.argsort(-predictions)[:k]
    #   print("Top K Indices")
    #   print(top_k_indices)

    #   top_k_movies = [mp[user][j][0] for j in top_k_indices]
    #   print("Top K Movies")
    #   print(top_k_movies)

    #   print(mp[user][0][0])

    #   if mp[user][0][0] in top_k_movies:
    #     hits += 1

    #   break;

    # hr_k = hits
    # return hr_k


In [None]:
class HitRatioCallback(Callback):
    """Keras callback that evaluates HR@K after each epoch and checkpoints the best model.

    Args:
        validation_data: tuple (users, movies, ratings) passed to hit_ratio_at_k.
        k: cut-off of the recommendation list.
    """

    def __init__(self, validation_data, k=10):
        super().__init__()
        val_users, val_movies, val_ratings = validation_data
        self.val_users = val_users
        self.val_movies = val_movies
        self.val_ratings = val_ratings
        self.k = k
        self.best_hr = 0  # highest HR@K seen by this callback instance
        self.best_model_path = "best_neumf_model_hr.h5"

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model and save it when HR@K improves."""
        hr_k = hit_ratio_at_k(self.model, self.val_users, self.val_movies, self.val_ratings, self.k)
        print(f"Epoch {epoch+1}: HR@{self.k} = {hr_k:.4f}")

        # Nothing to save unless this epoch strictly beats the best so far.
        if not hr_k > self.best_hr:
            return

        self.best_hr = hr_k
        self.model.save(self.best_model_path)
        print(f"✅ New best model saved with HR@{self.k} = {hr_k:.4f}")


In [None]:
# Grid search for NeuMF, selecting the checkpoint by HR@10.
learning_rates = [0.01, 0.001]
alpha = [0.3,0.5]

# One callback instance is shared by all runs so that `best_hr` — and the
# checkpoint on disk — track the best epoch across the whole grid. Creating a
# new callback per configuration would reset `best_hr` to 0 and let a later,
# worse run overwrite the saved model.
hit_ratio_callback = HitRatioCallback(validation_data=(test_users, test_movies, test_ratings), k=10)

for learning_rate in learning_rates:
    for alpha_value in alpha:
        model = build_nmf(alpha_value, learning_rate)
        model.fit(
            x=[train_users, train_movies],
            y=train_ratings,
            epochs=10,
            batch_size=1024,
            callbacks=[hit_ratio_callback]
        )

# Reload the best checkpoint written by the callback.
best_model_HR_K = load_model("best_neumf_model_hr.h5")


# learning_rates = [0.01, 0.001]
# alpha = [0.3, 0.5]

# # Select a single training sample (first one as an example)
# single_user = np.array([train_users[0]])  # Pick the first user
# single_movie = np.array([train_movies[0]])  # Pick the first movie
# single_rating = np.array([train_ratings[0]])  # Pick the corresponding rating

# for learning_rate in learning_rates:
#     for alpha_value in alpha:
#         print(f"Training model with learning_rate={learning_rate}, alpha={alpha_value}")

#         model = build_nmf(alpha_value, learning_rate)

#         hit_ratio_callback = HitRatioCallback(validation_data=(test_users, test_movies, test_ratings), k=10)

#         history = model.fit(
#             x=[single_user, single_movie],  # Only one training instance
#             y=single_rating,  # Only one rating
#             epochs=1,  # Train for 1 epoch
#             batch_size=1,  # Use batch size of 1 since we have only one sample
#             callbacks=[hit_ratio_callback],
#             verbose=1  # Print training progress
#         )

#         # Break after first training run
#         break
#     break


In [None]:
import numpy as np

def ndcg_at_k(model, test_users, test_movies, test_ratings, k=10):
    """Compute the mean NDCG@k over the users in the test set.

    For each user, all of that user's test rows are ranked by the model's
    predicted score. With a single held-out positive per user (first row with
    a rating above 0) the ideal DCG is 1, so the user's NDCG is
    1 / log2(rank + 1) when the positive appears at 1-based `rank` within the
    top k, and 0 otherwise.

    Args:
        model: object exposing `predict([users, movies], batch_size=...,
            verbose=...)` and returning one score per input row.
        test_users: 1-D sequence of user indices, one per test row.
        test_movies: 1-D sequence of movie indices aligned with `test_users`.
        test_ratings: 1-D sequence of labels aligned with `test_users`
            (positive > 0, negative 0).
        k: size of the recommendation list.

    Returns:
        Mean NDCG@k over users with at least one positive row; 0.0 when there
        are none.
    """
    test_users = np.asarray(test_users)
    test_movies = np.asarray(test_movies)
    test_ratings = np.asarray(test_ratings)
    if test_users.size == 0:
        return 0.0

    # Score every row in one call instead of one predict() call per user.
    scores = np.asarray(
        model.predict([test_users, test_movies], batch_size=4096, verbose=0)
    ).reshape(-1)

    # Row positions belonging to each user, in original order.
    rows_by_user = {}
    for row, user in enumerate(test_users.tolist()):
        rows_by_user.setdefault(user, []).append(row)

    ndcg_scores = []
    for user_rows in rows_by_user.values():
        user_rows = np.asarray(user_rows)
        positive_rows = user_rows[test_ratings[user_rows] > 0]
        if positive_rows.size == 0:
            # No held-out positive to look for; the user cannot be evaluated.
            continue

        held_out_movie = test_movies[positive_rows[0]]
        top_k_rows = user_rows[np.argsort(-scores[user_rows], kind="stable")[:k]]
        hit_positions = np.flatnonzero(test_movies[top_k_rows] == held_out_movie)

        # Position is 0-based, so rank + 1 == position + 2. IDCG is 1.
        if hit_positions.size:
            ndcg_scores.append(1.0 / np.log2(hit_positions[0] + 2))
        else:
            ndcg_scores.append(0.0)

    if not ndcg_scores:
        return 0.0
    return float(np.mean(ndcg_scores))

In [None]:
class NDCGCallBack(Callback):
    """Keras callback that evaluates NDCG@K after each epoch and checkpoints the best model.

    Args:
        validation_data: tuple (users, movies, ratings) passed to ndcg_at_k.
        k: cut-off of the recommendation list.
    """

    def __init__(self, validation_data, k=10):

        super().__init__()
        self.val_users, self.val_movies, self.val_ratings = validation_data
        self.k = k
        self.best_ndcg = 0  # highest NDCG@K seen by this callback instance
        self.best_model_path = "best_neumf_model_ndcg.h5"

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model and save it when NDCG@K improves."""
        ndcg_k = ndcg_at_k(self.model, self.val_users, self.val_movies, self.val_ratings, self.k)
        # The metric reported here is NDCG, so the messages are labelled accordingly.
        print(f"Epoch {epoch+1}: NDCG@{self.k} = {ndcg_k:.4f}")

        # Save best model based on NDCG@K
        if ndcg_k > self.best_ndcg:
            self.best_ndcg = ndcg_k
            self.model.save(self.best_model_path)
            print(f"✅ New best model saved with NDCG@{self.k} = {ndcg_k:.4f}")


In [None]:
# Grid search for NeuMF, selecting the checkpoint by NDCG@10.
learning_rates = [0.01, 0.001]
alpha = [0.3,0.5]

# One callback instance is shared by all runs so that `best_ndcg` — and the
# checkpoint on disk — track the best epoch across the whole grid. Creating a
# new callback per configuration would reset `best_ndcg` to 0 and let a later,
# worse run overwrite the saved model.
ndcg_callback = NDCGCallBack(validation_data=(test_users, test_movies, test_ratings), k=10)

for learning_rate in learning_rates:
    for alpha_value in alpha:
        model = build_nmf(alpha_value, learning_rate)
        model.fit(
            x=[train_users, train_movies],
            y=train_ratings,
            epochs=10,  # Adjust epochs as needed
            batch_size=1024,
            callbacks=[ndcg_callback]
        )

# Reload the best checkpoint written by the callback.
best_model_ndcg_k = load_model("best_neumf_model_ndcg.h5")
