In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import numpy as np

# Load data
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')

# Preprocess movies dataset (convert genres into binary vectors)
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)
movies = pd.concat([movies, genre_df], axis=1)

# Merge movie data and ratings
data = pd.merge(ratings, movies, on='movieId')

# Map userId and movieId to indices
user_ids = data['userId'].unique().tolist()
movie_ids = data['movieId'].unique().tolist()

user_to_index = {x: i for i, x in enumerate(user_ids)}
movie_to_index = {x: i for i, x in enumerate(movie_ids)}

data['userId'] = data['userId'].apply(lambda x: user_to_index[x])
data['movieId'] = data['movieId'].apply(lambda x: movie_to_index[x])

# Train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Extract features
X_train_users = train_data['userId'].values
X_train_movies = train_data['movieId'].values
X_train_genres = train_data[mlb.classes_].values
y_train = train_data['rating'].values

X_test_users = test_data['userId'].values
X_test_movies = test_data['movieId'].values
X_test_genres = test_data[mlb.classes_].values
y_test = test_data['rating'].values


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate

# Parameters
num_users = len(user_ids)
num_movies = len(movie_ids)
embedding_size = 50

# User and movie embeddings
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_input)
user_vector = Flatten()(user_embedding)

movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)
movie_vector = Flatten()(movie_embedding)

# Collaborative filtering: Dot product of user and movie embeddings
collab_score = Dot(axes=1)([user_vector, movie_vector])


In [3]:
# Genre input for the content-based part
genre_input = Input(shape=(len(mlb.classes_),), name='genre_input')

# Combine genre data with the collaborative score
x = Concatenate()([user_vector, movie_vector, genre_input])

# Deep neural network for combining features
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)

# Final output layer
output = Dense(1)(x)

# Define the hybrid model
model = Model(inputs=[user_input, movie_input, genre_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


In [4]:
# Training the model
history = model.fit([X_train_users, X_train_movies, X_train_genres], y_train,
                    validation_data=([X_test_users, X_test_movies, X_test_genres], y_test),
                    epochs=5, batch_size=256, verbose=1)


Epoch 1/5
[1m62501/62501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2777s[0m 44ms/step - loss: 0.7936 - mae: 0.6791 - val_loss: 0.6568 - val_mae: 0.6173
Epoch 2/5
[1m62501/62501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2536s[0m 41ms/step - loss: 0.6152 - mae: 0.5957 - val_loss: 0.6270 - val_mae: 0.6035
Epoch 3/5
[1m62501/62501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2469s[0m 39ms/step - loss: 0.5605 - mae: 0.5646 - val_loss: 0.6173 - val_mae: 0.5969
Epoch 4/5
[1m62501/62501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2436s[0m 39ms/step - loss: 0.5234 - mae: 0.5419 - val_loss: 0.6164 - val_mae: 0.5966
Epoch 5/5
[1m62501/62501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2513s[0m 40ms/step - loss: 0.4940 - mae: 0.5234 - val_loss: 0.6203 - val_mae: 0.5932


In [5]:
pred_ratings = model.predict([X_test_users[:10], X_test_movies[:10], X_test_genres[:10]])
print("Predicted ratings: ", pred_ratings)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Predicted ratings:  [[3.2636611]
 [4.242443 ]
 [3.7497861]
 [3.7291288]
 [2.6100636]
 [2.3304958]
 [3.4665904]
 [2.4458408]
 [3.1607926]
 [4.237241 ]]


In [6]:
test_loss, test_mae = model.evaluate([X_test_users, X_test_movies, X_test_genres], y_test, verbose=1)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

[1m125002/125002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 1ms/step - loss: 0.6192 - mae: 0.5927
Test Loss: 0.6202795505523682, Test MAE: 0.5931825041770935


In [7]:
model.save('movie_recommender_model.h5')
print('Model Saved Succesfully ....')

Model Saved Succesfully ....
