# Data preprocessing

In [21]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix


# Load the three MovieLens-1M tables.
# The '::' separator is longer than one character, which only the Python
# parsing engine supports. Requesting that engine explicitly on every read
# (not just the ratings one) avoids the ParserWarning pandas emits when it has
# to fall back from the C engine on its own.
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, engine='python', encoding='ISO-8859-1')
users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']

movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python', encoding='ISO-8859-1')
movies.columns = ['movie_id', 'title', 'genres']

# Map every raw movie id to a dense zero-based index, numbered in order of
# first appearance in movies.dat.
movie_to_index = {movie_id: i for i, movie_id in enumerate(movies['movie_id'].unique())}

ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, engine='python', encoding='ISO-8859-1')
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Shift user ids down by one so they can serve as zero-based indices
# (assumes the raw ids start at 1 -- confirm against the dataset README).
ratings['user_id'] = ratings['user_id'] - 1
# Replace raw movie ids by their dense index; an id that is missing from
# movies.dat raises KeyError here rather than silently producing NaN.
ratings['movie_id'] = ratings['movie_id'].apply(lambda x: movie_to_index[x])


  users = pd.read_csv('ml-1m/users.dat', sep='::', header=None,encoding='ISO-8859-1')
  movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None,encoding='ISO-8859-1')


In [None]:
# Build a multi-hot genre matrix with one row per line of movies.dat and one
# column per distinct genre label.
movie_genre = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python',encoding='ISO-8859-1')
movie_genre.columns = ['movie_id', 'title', 'genres']

# Collect every genre label; a movie's genres are '|'-separated in one field.
genres = set()
for g in movie_genre['genres']:
    genres.update(g.split('|'))

# Number the genres in alphabetical order. Enumerating the raw set would make
# the column order depend on string hashing, which can differ between kernel
# sessions and would not match an alphabetically sorted list of column names.
genre_dict = {g: i for i, g in enumerate(sorted(genres))}

movie_genre_vec = []
for g in movie_genre['genres']:
    vec = np.zeros(len(genres))
    for gg in g.split('|'):
        vec[genre_dict[gg]] = 1  # mark each genre the movie belongs to
    movie_genre_vec.append(vec)

# Shape: (number of movies, number of genres); column j is the genre whose
# genre_dict value is j.
movie_genre_mat = np.array(movie_genre_vec)

In [None]:
# Build the sparse user x movie rating matrix.
# Rows are zero-based user ids, columns are dense movie indices, values are
# the ratings.

# Size the user axis by the largest id rather than the number of distinct ids,
# so a gap in the id range cannot push an index out of bounds.
num_users = int(ratings['user_id'].max()) + 1
num_movies = len(movie_to_index)

# The COO constructor sums duplicate (row, col) entries. Keeping only the last
# rating per (user, movie) pair reproduces "last assignment wins" semantics.
unique_ratings = ratings.drop_duplicates(subset=['user_id', 'movie_id'], keep='last')

# Construct the matrix directly from (value, (row, col)) triplets instead of
# filling a dense array one rating at a time: no Python-level loop over every
# rating and no dense num_users x num_movies intermediate.
adj_matrix = coo_matrix(
    (
        unique_ratings['rating'].to_numpy(dtype=np.float64),
        (unique_ratings['user_id'].to_numpy(), unique_ratings['movie_id'].to_numpy()),
    ),
    shape=(num_users, num_movies),
)

In [None]:
from scipy.sparse import save_npz

# Persist the sparse user x movie rating matrix to disk in .npz format.
save_npz(file='adj_matrix.npz', matrix=adj_matrix)


##### Build Model Inputs for the Train-Val-Test Split

In [None]:
# Attach each movie's multi-hot genre vector to every rating of that movie.
ratings_subset = ratings[['user_id', 'movie_id', 'rating']]

# Column j of movie_genre_mat belongs to the genre whose genre_dict value is
# j, so the labels must be ordered by that index. Ordering them by genre name
# would attach the wrong label to a column whenever the two orders differ.
genre_cols = [genre for genre, _ in sorted(genre_dict.items(), key=lambda item: item[1])]

# Row i of the genre frame describes the movie with dense index i, which is
# what ratings['movie_id'] holds, so join ratings on the frame's index.
genre_features = pd.DataFrame(movie_genre_mat, columns=genre_cols)
merged_data = pd.merge(ratings_subset, genre_features, left_on='movie_id', right_index=True)



In [None]:
# Write the merged ratings + genre table as a tab-separated file, without the
# DataFrame index column.
merged_data.to_csv(
    path_or_buf="preprocess.txt",
    sep='\t',
    index=False,
)

# KGCN implementation

In [None]:
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Embedding, Concatenate, Dropout, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import RootMeanSquaredError

# Embedding-table sizes. Ids are used as zero-based indices, so each table
# needs (largest id + 1) rows.
num_users = ratings['user_id'].max()+1
num_movies = ratings['movie_id'].max() +1

# Width of the multi-hot genre vector fed to the genre input.
num_genres = len(genre_dict)

# Hyperparameters.
embedding_size = 32
dropout_rate = 0.2
learning_rate = 0.001
# NOTE(review): the visible fit call passes epochs=2 and does not read this
# value -- confirm which one is intended.
num_epochs = 20
# The batch size is tied to the number of movies rather than tuned freely.
# NOTE(review): presumably required because the graph-convolution cell
# multiplies adj_matrix (users x movies) with the per-batch genre embedding,
# which only lines up when a batch has exactly num_movies rows -- confirm.
batch_size = num_movies
reg_lambda = 0.01

# One integer id per sample for each of the two id inputs.
user_input = Input(shape=(1,), name='user_input_kgcn')
movie_input = Input(shape=(1,), name='movie_input_kgcn')

# Learned lookup tables that turn each id into an embedding_size-wide vector.
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size,  name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)
    
# Dropout on both embeddings, using the shared dropout_rate.
user_embedding = Dropout(dropout_rate)(user_embedding)
movie_embedding = Dropout(dropout_rate)(movie_embedding)


In [None]:
# Genre branch: project the multi-hot genre vector to embedding_size.
genre_input = Input(shape=(num_genres,), name='genre_input')
genre_embedding = Dense(embedding_size, activation='relu', kernel_regularizer=l2(reg_lambda), name='genre_embedding')(genre_input)

# Densify the rating matrix once and reuse it for both propagation steps,
# instead of materialising the full dense array twice.
adj_dense = adj_matrix.toarray()

# Two propagation steps through the rating matrix: movies -> users -> movies.
# The dense matrix is float64, so the genre embedding is cast to match.
# NOTE(review): the first product only lines up when the batch has exactly as
# many rows as adj_matrix has columns, i.e. batch row i is treated as movie i
# even though batches hold arbitrary rating samples -- confirm this is intended.
gcn_1 = tf.linalg.matmul(adj_dense, tf.cast(genre_embedding, tf.float64), name='gcn_1')
gcn_2 = tf.linalg.matmul(adj_dense.T, gcn_1, name='gcn_2')
# Insert a length-1 axis so the result can be concatenated with the
# embedding-layer outputs along the last axis.
gcn_2_reshaped = tf.expand_dims(gcn_2, axis=1)

# Concatenate the user, movie and propagated-genre features per sample.
user_movie_concat = Concatenate()([user_embedding, movie_embedding])
user_movie_genre_concat = Concatenate(axis=2)([user_movie_concat, gcn_2_reshaped])

# Regression head: two L2-regularised ReLU layers and a linear rating output.
dense_1 = Dense(64, activation='relu', kernel_regularizer=l2(reg_lambda), name='dense_1')(user_movie_genre_concat)
dense_2 = Dense(32, activation='relu', kernel_regularizer=l2(reg_lambda), name='dense_2')(dense_1)
output = Dense(1, activation='linear', name='output')(dense_2)

kgcn_model = Model(inputs=[user_input, movie_input, genre_input], outputs=output)


In [None]:
# Configure training: Adam optimizer, mean-squared-error loss, and RMSE
# reported as an additional metric.
rmse = RootMeanSquaredError()
optimizer = Adam(learning_rate=learning_rate)
kgcn_model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=[rmse],
)

In [None]:

from sklearn.model_selection import train_test_split

def _batch_aligned_arrays(frame, genre_cols, batch_size):
    """Extract model input arrays from a merged ratings frame, trimmed to whole batches.

    Args:
        frame: DataFrame with 'user_id', 'movie_id', 'rating' and the genre columns.
        genre_cols: names of the genre indicator columns, in model input order.
        batch_size: number of rows per batch; trailing rows that do not fill a
            complete batch are dropped.

    Returns:
        Tuple (users, movies, movie_genre, targets) of NumPy arrays that all
        share the same leading length, a multiple of batch_size.
    """
    usable_rows = (len(frame) // batch_size) * batch_size
    users = np.array(frame['user_id'])[:usable_rows]
    movies = np.array(frame['movie_id'])[:usable_rows]
    movie_genre = np.array(frame[genre_cols])[:usable_rows]
    targets = np.array(frame['rating'])[:usable_rows]
    return users, movies, movie_genre, targets


# Hold out 20% of the ratings for testing, then 20% of the remainder for
# validation; a fixed seed keeps both splits reproducible.
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

num_users = ratings['user_id'].max()+1
num_movies = ratings['movie_id'].max() +1
num_genres = len(genre_dict)
# Every batch has exactly num_movies rows, which is why each split is trimmed
# to a whole number of batches below.
batch_size = num_movies

train_users, train_movies, train_movie_genre, train_ratings = _batch_aligned_arrays(
    train_data, genre_cols, batch_size
)
val_users, val_movies, val_movie_genre, val_ratings = _batch_aligned_arrays(
    val_data, genre_cols, batch_size
)
test_users, test_movies, test_movie_genre, test_ratings = _batch_aligned_arrays(
    test_data, genre_cols, batch_size
)

# Kept so any later cell that reads this name still finds the trimmed length
# of the test split, as before.
consistent_length = test_users.shape[0]

In [None]:
# Train on the batch-aligned training arrays and monitor the validation split.
# NOTE(review): epochs is hard-coded to 2 here while num_epochs (20) is
# defined in the hyperparameter cell and left unused -- confirm which is meant.
train_inputs = [train_users, train_movies, train_movie_genre]
val_inputs = [val_users, val_movies, val_movie_genre]

history = kgcn_model.fit(
    train_inputs,
    train_ratings,
    validation_data=(val_inputs, val_ratings),
    batch_size=batch_size,
    epochs=2,
    verbose=0,
)

# Save the trained model to an HDF5 file.
kgcn_model.save('kgcn_model.h5')

In [None]:
# Score the held-out test split; evaluate returns the loss followed by the
# compiled metric (RMSE).
test_loss, test_rmse = kgcn_model.evaluate(
    [test_users, test_movies, test_movie_genre],
    test_ratings,
    batch_size=batch_size,
    verbose=0,
)

In [None]:
from sklearn.metrics import mean_squared_error

# Recompute the test RMSE from raw predictions; this overwrites the test_rmse
# value reported by evaluate().
test_preds = kgcn_model.predict(
    [test_users, test_movies, test_movie_genre],
    batch_size=batch_size,
    verbose=0,
)
# Drop the length-1 axes so predictions line up with the 1-D rating array.
test_mse = mean_squared_error(test_ratings, np.squeeze(test_preds))
test_rmse = np.sqrt(test_mse)