# Movielens Dataset Preprocessing

In [2]:
!unzip "ml-1m.zip"

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# Load user data
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, engine='python',encoding='ISO-8859-1')
users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']

# Load movie data
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python',encoding='ISO-8859-1')
movies.columns = ['movie_id', 'title', 'genres']

# Create mapping between original movie IDs and contiguous indices
movie_to_index = {}
for i, movie_id in enumerate(movies['movie_id'].unique()):
    movie_to_index[movie_id] = i

# Load ratings data
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, engine='python',encoding='ISO-8859-1')
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Convert movie and user IDs to contiguous indices
ratings['user_id'] = ratings['user_id'] - 1
ratings['movie_id'] = ratings['movie_id'].apply(lambda x: movie_to_index[x])


In [4]:

# Create sparse adjacency matrix for input into KGCN (assuming movies are connected based on genre)
movie_genre = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python',encoding='ISO-8859-1')
movie_genre.columns = ['movie_id', 'title', 'genres']
genres = set()
for g in movie_genre['genres']:
    genres.update(g.split('|'))
genre_dict = {g: i for i, g in enumerate(genres)}
movie_genre_vec = []
for g in movie_genre['genres']:
    vec = np.zeros(len(genres))
    for gg in g.split('|'):
        vec[genre_dict[gg]] = 1
    movie_genre_vec.append(vec)
movie_genre_mat = np.array(movie_genre_vec)


# Create empty adjacency matrix
num_users = ratings['user_id'].nunique()
num_movies = len(movie_to_index)
adj_matrix = np.zeros((num_users, num_movies))

# Fill in adjacency matrix with ratings
for _, row in ratings.iterrows():
    user_id = row['user_id']
    movie_id = row['movie_id']
    rating = row['rating']
    adj_matrix[user_id, movie_id] = rating

# Convert adjacency matrix to sparse format
adj_matrix = coo_matrix(adj_matrix)


In [5]:
ratings_subset = ratings[['user_id', 'movie_id', 'rating']]
genre_cols = [genre for genre, index in sorted(genre_dict.items(), key=lambda x: x[0])]

# Merge with movie_genre_mat matrix
merged_data = pd.merge(ratings_subset, pd.DataFrame(movie_genre_mat, columns=genre_cols), left_on='movie_id', right_index=True)



In [6]:

from sklearn.model_selection import train_test_split

# Split data into training and test sets
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

# Split training data into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

num_users = ratings['user_id'].max()+1
num_movies = ratings['movie_id'].max() +1
num_genres = len(genre_dict)
batch_size = num_movies

# Extract inputs and targets for training set
train_users = np.array(train_data['user_id'])
train_movies = np.array(train_data['movie_id'])
train_movie_genre = np.array(train_data[genre_cols])
train_ratings = np.array(train_data['rating'])
consistent_length = (train_users.shape[0] // batch_size) * batch_size
train_users = train_users[:consistent_length]
train_movies = train_movies[:consistent_length]
train_movie_genre = train_movie_genre[:consistent_length]
train_ratings = train_ratings[:consistent_length]


# Extract inputs and targets for validation set
val_users = np.array(val_data['user_id'])
val_movies = np.array(val_data['movie_id'])
val_movie_genre = np.array(val_data[genre_cols])
val_ratings = np.array(val_data['rating'])
consistent_length = (val_users.shape[0] // batch_size) * batch_size
val_users = val_users[:consistent_length]
val_movies = val_movies[:consistent_length]
val_movie_genre = val_movie_genre[:consistent_length]
val_ratings = val_ratings[:consistent_length]

# Extract inputs and targets for test set
test_users = np.array(test_data['user_id'])
test_movies = np.array(test_data['movie_id'])
test_movie_genre = np.array(test_data[genre_cols])
test_ratings = np.array(test_data['rating'])
consistent_length = (test_users.shape[0] // batch_size) * batch_size
test_users = test_users[:consistent_length]
test_movies = test_movies[:consistent_length]
test_movie_genre = test_movie_genre[:consistent_length]
test_ratings = test_ratings[:consistent_length]

# KGCN implementation

In [7]:
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Embedding, Concatenate, Dropout, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import RootMeanSquaredError

# Define hyperparameters
num_users = ratings['user_id'].max()+1
num_movies = ratings['movie_id'].max() +1

num_genres = len(genre_dict)

embedding_size = 32
dropout_rate = 0.2
learning_rate = 0.001
num_epochs = 20
batch_size = num_movies
reg_lambda = 0.01

# Define KGCN model architecture
user_input = Input(shape=(1,), name='user_input_kgcn')
movie_input = Input(shape=(1,), name='movie_input_kgcn')

user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size,  name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)
    
user_embedding = Dropout(dropout_rate)(user_embedding)
movie_embedding = Dropout(dropout_rate)(movie_embedding)


In [8]:
from keras.layers import Lambda
import sys


# Concatenate user and movie embeddings

genre_input = Input(shape=(num_genres,), name='genre_input')
# Define genre embedding and GCN layers
genre_embedding = Dense(embedding_size, activation='relu', kernel_regularizer=l2(reg_lambda), name='genre_embedding')(genre_input)
# genre_embedding_reshaped = tf.reshape(genre_embedding, shape=(-1, embedding_size)) 
# genre_embedding_reshaped = Reshape(target_shape=(1, embedding_size))(genre_embedding)
# genre_embedding_tiled = tf.tile(genre_embedding_reshaped, [1, num_movies, 1])

# print_layer = Lambda(lambda x: tf.print(x, [x], message='genre_embedding_reshaped = '))

gcn_1 = tf.linalg.matmul(adj_matrix.toarray(), tf.cast(genre_embedding, tf.float64) , name='gcn_1')
# gcn_1=tf.squeeze(gcn_1, axis=0)
gcn_2 = tf.linalg.matmul(adj_matrix.toarray().T, gcn_1, name='gcn_2')
gcn_2_reshaped = tf.expand_dims(gcn_2, axis=1)

user_movie_concat = Concatenate()([user_embedding, movie_embedding])


# Concatenate genre embeddings with user-movie embeddings
user_movie_genre_concat = Concatenate(axis=2)([user_movie_concat, gcn_2_reshaped])

# Define final dense layers and output
dense_1 = Dense(64, activation='relu', kernel_regularizer=l2(reg_lambda), name='dense_1')(user_movie_genre_concat)
dense_2 = Dense(32, activation='relu', kernel_regularizer=l2(reg_lambda), name='dense_2')(dense_1)
output = Dense(1, activation='linear', name='output')(dense_2)

kgcn_model = Model(inputs=[user_input, movie_input, genre_input], outputs=output)


In [9]:
# Compile model
optimizer = Adam(learning_rate=learning_rate)
rmse = RootMeanSquaredError()
kgcn_model.compile(loss='mse', optimizer=optimizer, metrics=[rmse])

In [10]:
# # Split data into train and test sets
# train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
# train_ratings, val_ratings = train_test_split(train_ratings, test_size=0.2, random_state=42)

In [11]:
merged_data

Unnamed: 0,user_id,movie_id,rating,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1176,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
120,1,1176,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1339,11,1176,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1518,14,1176,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1747,16,1176,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984335,5948,2129,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940262,5674,2634,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
957826,5779,2776,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
970914,5850,3538,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [12]:
# Train model
history = kgcn_model.fit([train_users, train_movies, train_movie_genre], train_ratings, validation_data=([val_users, val_movies, val_movie_genre], val_ratings),batch_size=batch_size, epochs=2, verbose=0)


In [13]:

# Evaluate model on test set
test_loss, test_rmse = kgcn_model.evaluate([test_users, test_movies, test_movie_genre], test_ratings, batch_size=batch_size,verbose =1)
print(f'Test Loss: {test_loss:.4f}, Test RMSE: {test_rmse:.4f}')

Test Loss: 62069240.0000, Test RMSE: 7878.4023


In [14]:
# Make predictions on test set
test_preds = kgcn_model.predict([test_users, test_movies, test_movie_genre], batch_size=batch_size)




In [15]:
from sklearn.metrics import mean_squared_error
import numpy as np

test_rmse = np.sqrt(mean_squared_error(test_ratings, np.squeeze(test_preds)))
print(f'Test RMSE: {test_rmse:.4f}')

Test RMSE: 7878.4019


In [16]:
model.summary()

NameError: name 'model' is not defined

# NeuMF implementation

In [None]:
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import RootMeanSquaredError

latent_dim_nn = 8
latent_dim_mf = 8
dropout_rate = 0.2
learning_rate = 0.001
num_epochs = 20
batch_size = 256
reg_lambda = 0.01

# Define NeuMF model architecture
user_input = Input(shape=(1,), name='nn_user_input')
movie_input = Input(shape=(1,), name='movie_input')

# Define MLP embedding layers
nn_user_embedding = Dense(latent_dim_nn, activation='relu', kernel_regularizer=l2(reg_lambda), name='nn_user_embedding')(user_input)
nn_movie_embedding = Dense(latent_dim_nn, activation='relu', kernel_regularizer=l2(reg_lambda), name='nn_movie_embedding')(movie_input)

# Define MLP layers
nn_layer_1 = Dense(64, activation='relu', kernel_regularizer=l2(reg_lambda), name='nn_layer_1')(Concatenate()([nn_user_embedding, nn_movie_embedding]))
nn_layer_1 = Dropout(dropout_rate)(nn_layer_1)
nn_layer_2 = Dense(32, activation='relu', kernel_regularizer=l2(reg_lambda), name='nn_layer_2')(nn_layer_1)
nn_layer_2 = Dropout(dropout_rate)(nn_layer_2)
nn_layer_3 = Dense(16, activation='relu', kernel_regularizer=l2(reg_lambda), name='nn_layer_3')(nn_layer_2)

# Define MF embedding layers
mf_user_embedding = Dense(latent_dim_mf, activation='relu', kernel_regularizer=l2(reg_lambda), name='mf_user_embedding')(user_input)
mf_movie_embedding = Dense(latent_dim_mf, activation='relu', kernel_regularizer=l2(reg_lambda), name='mf_movie_embedding')(movie_input)

# Concatenate nn and MF layers
nnmf_layer = Concatenate()([mf_user_embedding, mf_movie_embedding, nn_layer_3])

# Define output
output = Dense(1, activation='linear', name='outputnnmf')(nnmf_layer)

NN_model = Model(inputs=[user_input, movie_input], outputs=output)

# Compile model
optimizer = Adam(learning_rate=learning_rate)
rmse = RootMeanSquaredError()
NN_model.compile(loss='mse', optimizer=optimizer, metrics=[rmse])

# Train model
history = NN_model.fit([train_users, train_movies], train_ratings,
                    validation_data=([val_users, val_movies], val_ratings),
                    batch_size=batch_size, epochs=num_epochs, verbose=1)

# Evaluate model on test set
test_loss, test_rmse = NN_model.evaluate([test_users, test_movies], test_ratings)
print(f'Test Loss: {test_loss:.4f}, Test RMSE: {test_rmse:.4f}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 1.3057, Test RMSE: 1.1343


# CCCFNet

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, Flatten, Softmax, Dot
from tensorflow.keras.models import Model

# Define input shapes
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))
genre_input = Input(shape=(num_genres,))


# Define user and movie embeddings
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding_cccfnet')(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding_cccfnet')(movie_input)

# Define MLP model for user and movie embeddings
mlp_layer1 = Concatenate()([Flatten()(user_embedding), Flatten()(movie_embedding)])
mlp_layer2 = Dense(64, activation='relu')(mlp_layer1)
mlp_layer3 = Dropout(0.2)(mlp_layer2)
mlp_layer4 = Dense(32, activation='relu')(mlp_layer3)


In [None]:

# Define attention-based model for genre embedding
genre_layer1 = Dense(embedding_size, activation='relu')(genre_input)
genre_layer2 = BatchNormalization()(genre_layer1)
genre_layer3 = Dense(embedding_size, activation='relu')(genre_layer2)
genre_layer4 = BatchNormalization()(genre_layer3)
genre_layer5 = Dense(embedding_size, activation='relu')(genre_layer4)
genre_layer5_reshaped = Reshape((1, embedding_size))(genre_layer5)
attention_scores = Dot(axes=(2,1))([movie_embedding, genre_layer5])
attention_scores = Reshape((1,))(attention_scores)
attention_weights = Softmax()(attention_scores)
attention_output = Dot(axes=(1,1))([movie_embedding, attention_weights])


In [None]:

# Combine the outputs from the MLP and attention-based models
final_layer1 = Concatenate()([mlp_layer4, attention_output])
final_layer2 = Dense(16, activation='relu')(final_layer1)
final_output = Dense(1)(final_layer2)


In [None]:

# Define the model
cccfnet_model = Model(inputs=[user_input, movie_input, genre_input], outputs=final_output)

# Compile the model
cccfnet_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])


In [None]:

# Train the model
history = cccfnet_model.fit([train_users, train_movies, train_movie_genre], train_ratings, validation_data=([val_users, val_movies, val_movie_genre], val_ratings), batch_size=batch_size, epochs=10, verbose=0)



In [None]:

# Evaluate the model on the validation set
val_loss, val_rmse = cccfnet_model.evaluate([test_users, test_movies, test_movie_genre], test_ratings, batch_size=batch_size, verbose=0)
print(f'Validation Loss: {val_loss:.4f}, Validation RMSE: {val_rmse:.4f}')


Validation Loss: 0.7830, Validation RMSE: 0.7830


# Combined Model

In [None]:
# Combine the outputs from the KGCN, NeuMF, and CCCFNet models
kgcn_output = kgcn_model.output
nn_output = NN_model.output
cccfnet_output = cccfnet_model.output

kgcn_output = Flatten()(kgcn_output)

combined_output = Concatenate(axis = 1)([kgcn_output, nn_output, cccfnet_output])
combined_layer1 = Dense(16, activation='relu')(combined_output)
final_output = Dense(1)(combined_layer1)

# Define the combined model
combined_model = Model(inputs=[kgcn_model.input, NN_model.input, cccfnet_model.input], outputs=final_output)

# Compile the combined model
combined_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse', 'mae'])

In [None]:
# Train the combined model on the training set
history = combined_model.fit([ train_users, train_movies, train_movie_genre, train_users, train_movies, train_movie_genre,train_users, train_movies, train_movie_genre],train_ratings,validation_data=([[val_users, val_movies, val_movie_genre],[val_users, val_movies, val_movie_genre],[val_users, val_movies, val_movie_genre]],[val_ratings, val_ratings, val_ratings]),batch_size=batch_size,epochs=num_epochs,verbose=1)

# Evaluate the performance of the combined model on the validation set
results = combined_model.evaluate([val_users, val_movies, val_movie_genre,val_users, val_movies, val_movie_genre,val_users, val_movies, val_movie_genre],val_ratings,verbose=0)
print(f"Validation loss: {results[0]}, Validation MSE: {results[1]}, Validation MAE: {results[2]}")


Epoch 1/20


AssertionError: in user code:

    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\ABHIJEET\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\functional.py", line 680, in _run_internal_graph
        assert x_id in tensor_dict, "Could not compute output " + str(x)

    AssertionError: Exception encountered when calling layer 'model_18' (type Functional).
    
    Could not compute output KerasTensor(type_spec=TensorSpec(shape=(3883, 1), dtype=tf.float32, name=None), name='dense_56/BiasAdd:0', description="created by layer 'dense_56'")
    
    Call arguments received by layer 'model_18' (type Functional):
      • inputs=('tf.Tensor(shape=(None,), dtype=int64)', 'tf.Tensor(shape=(None,), dtype=int64)', 'tf.Tensor(shape=(None, 18), dtype=float32)')
      • training=True
      • mask=None


In [None]:
combined_model.summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 18)]         0           []                               
                                                                                                  
 dense_50 (Dense)               (None, 32)           608         ['input_15[0][0]']               
                                                                                                  
 genre_input (InputLayer)       [(None, 18)]         0           []                               
                                                                                                  
 batch_normalization_20 (BatchN  (None, 32)          128         ['dense_50[0][0]']               
 ormalization)                                                                             