In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import datetime


ratings = pd.read_csv('/content/drive/MyDrive/datasets/ratings.csv', sep='::', engine='python',names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

movies = pd.read_csv('/content/drive/MyDrive/datasets/movies1.csv', sep='::', engine='python', encoding = 'latin1',
                        names=['MovieID', 'Title', 'Genres'])

users = pd.read_csv('/content/drive/MyDrive/datasets/users.csv', sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

# cleaning the data
non_numeric = movies[~movies['MovieID'].str.isdigit()]
movies = movies[movies['MovieID'].str.isdigit()]
movies['MovieID'] = movies['MovieID'].astype(int)

#merging the user, movie, rating datasets
merged_data = pd.merge(ratings, movies, on='MovieID')
merged_data = pd.merge(merged_data, users, on='UserID')

#print(merged_data['Rating'].value_counts())

#converting gender column to bin format
merged_data['Gender'] = merged_data['Gender'].map({'M': 0, 'F': 1})

#converting genres into in 0,1 and adding it to axis 1
genres_split = merged_data['Genres'].str.get_dummies('|')
merged_data = pd.concat([merged_data, genres_split], axis=1)

#timestamp to numeric data and formatting
merged_data['Timestamp'] = pd.to_datetime(merged_data['Timestamp'], unit='s')
merged_data['Year'] = merged_data['Timestamp'].dt.year


#encoding the non numeric data
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

merged_data['UserID'] = user_encoder.fit_transform(merged_data['UserID'])
merged_data['MovieID'] = movie_encoder.fit_transform(merged_data['MovieID'])


#splitting the dataset to train and test
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

#changing the range of the "Rating" column to range of 0-1.
train_data['Rating'] = train_data['Rating'] / 5.0
test_data['Rating'] = test_data['Rating'] / 5.0



#Mapping the id's in dataset and map them to the train and test datasets.
user_ids = train_data['UserID'].unique()
movie_ids = train_data['MovieID'].unique()

user_to_index = {user: idx for idx, user in enumerate(user_ids)}
movie_to_index = {movie: idx for idx, movie in enumerate(movie_ids)}

train_data['UserIndex'] = train_data['UserID'].map(user_to_index)
train_data['MovieIndex'] = train_data['MovieID'].map(movie_to_index)
test_data['UserIndex'] = test_data['UserID'].map(user_to_index)
test_data['MovieIndex'] = test_data['MovieID'].map(movie_to_index)


test_data.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['MovieID'] = movies['MovieID'].astype(int)


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Year,UserIndex,MovieIndex
455520,3652,2559,1.0,2000-08-16 19:22:28,For a Few Dollars More (1965),Western,0,18,15,96661,...,0,0,0,0,0,0,1,2000,4377,288.0
338156,2751,1492,0.6,2000-11-02 20:13:36,Beetlejuice (1988),Comedy|Fantasy,0,25,15,77006,...,0,0,0,0,0,0,0,2000,937,626.0
589516,4646,2387,0.8,2000-07-19 04:35:45,Funny Bones (1995),Comedy,0,35,20,80222,...,0,0,0,0,0,0,0,2000,1245,1966.0
77838,677,1352,0.6,2001-04-30 13:37:43,Out of Africa (1985),Drama|Romance,0,25,0,34952,...,0,0,1,0,0,0,0,2001,106,1098.0
324231,2600,464,1.0,2000-11-09 21:38:55,Fargo (1996),Crime|Drama|Thriller,0,35,16,27613,...,0,0,0,0,1,0,0,2000,5759,181.0


In [None]:
# Separate features (X) and target (y)
X_train = train_data[['UserIndex', 'MovieIndex']]  # Features: User and Movie indices
y_train = train_data['Rating']  # Target: Normalized Ratings

X_test = test_data[['UserIndex', 'MovieIndex']]  # Features: User and Movie indices
y_test = test_data['Rating']  # Target: Normalized Ratings

# Display the prepared data for confirmation
print("X_train sample:")
print(X_train.head())

print("\ny_train sample:")
print(y_train.head())

print("\nX_test sample:")
print(X_test.head())

print("\ny_test sample:")
print(y_test.head())


X_train sample:
        UserIndex  MovieIndex
285449          0           0
470921          1           1
443863          2           2
496843          3           3
543145          4           4

y_train sample:
285449    0.8
470921    0.2
443863    0.8
496843    0.6
543145    0.8
Name: Rating, dtype: float64

X_test sample:
        UserIndex  MovieIndex
455520       4377       288.0
338156        937       626.0
589516       1245      1966.0
77838         106      1098.0
324231       5759       181.0

y_test sample:
455520    1.0
338156    0.6
589516    0.8
77838     0.6
324231    1.0
Name: Rating, dtype: float64


In [None]:
# Define the neural network model
n_users = merged_data['UserID'].nunique()
n_movies = merged_data['MovieID'].nunique()
embedding_dim = 128


user_input = tf.keras.layers.Input(shape=(1,), name='user_input')
user_embedding = tf.keras.layers.Embedding(
    input_dim=n_users + 1,
    output_dim=embedding_dim,
    name='user_embedding',
    embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
)(user_input)
user_vector = tf.keras.layers.Flatten(name='user_vector')(user_embedding)

movie_input = tf.keras.layers.Input(shape=(1,), name='movie_input')
movie_embedding = tf.keras.layers.Embedding(
    input_dim=n_movies + 1,
    output_dim=embedding_dim,
    name='movie_embedding',
    embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
)(movie_input)
movie_vector = tf.keras.layers.Flatten(name='movie_vector')(movie_embedding)

interaction = tf.keras.layers.Dot(axes=1)([user_vector, movie_vector])

# Hidden layers with batch normalization and increased dropout
hidden = tf.keras.layers.Dense(256, activation='relu')(interaction)
hidden = tf.keras.layers.BatchNormalization()(hidden)
hidden = tf.keras.layers.Dropout(0.5)(hidden)

for _ in range(5):  # 5 additional hidden layers
    hidden = tf.keras.layers.Dense(128, activation='relu')(hidden)
    hidden = tf.keras.layers.BatchNormalization()(hidden)
    hidden = tf.keras.layers.Dropout(0.4)(hidden)

hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
hidden = tf.keras.layers.BatchNormalization()(hidden)
hidden = tf.keras.layers.Dropout(0.3)(hidden)

output = tf.keras.layers.Dense(1, activation='linear')(hidden)

# Model
model = tf.keras.models.Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mae', metrics=['mae'])

# Mapping UserIndex and MovieIndex
X_train['UserIndex'] = X_train['UserIndex'].map(user_to_index).fillna(0).astype(int)
X_train['MovieIndex'] = X_train['MovieIndex'].map(movie_to_index).fillna(0).astype(int)

X_test['UserIndex'] = X_test['UserIndex'].map(user_to_index).fillna(0).astype(int)
X_test['MovieIndex'] = X_test['MovieIndex'].map(movie_to_index).fillna(0).astype(int)


assert X_train['UserIndex'].min() >= 0 and X_train['MovieIndex'].min() >= 0, "Negative indices detected in train data!"
assert X_test['UserIndex'].min() >= 0 and X_test['MovieIndex'].min() >= 0, "Negative indices detected in test data!"

# Learning rate
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Training
history = model.fit(
    [X_train['UserIndex'], X_train['MovieIndex']], y_train,
    validation_data=([X_test['UserIndex'], X_test['MovieIndex']], y_test),
    epochs=50,
    batch_size=32,
    callbacks=[lr_scheduler]
)

# Save the model
model.save('improved_customer_recommendation_model.h5')


evaluation = model.evaluate([X_test['UserIndex'], X_test['MovieIndex']], y_test)

print(f"Test Loss: {evaluation[0]}, Test MAE: {evaluation[1]}")


Epoch 1/50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['UserIndex'] = X_train['UserIndex'].map(user_to_index).fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['MovieIndex'] = X_train['MovieIndex'].map(movie_to_index).fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['UserIndex'] = X_test['Us

[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 5ms/step - loss: 0.2331 - mae: 0.2317 - val_loss: 0.1761 - val_mae: 0.1717 - learning_rate: 0.0010
Epoch 2/50
[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 5ms/step - loss: 0.1746 - mae: 0.1693 - val_loss: 0.1755 - val_mae: 0.1681 - learning_rate: 0.0010
Epoch 3/50
[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 5ms/step - loss: 0.1710 - mae: 0.1632 - val_loss: 0.1742 - val_mae: 0.1657 - learning_rate: 0.0010
Epoch 4/50
[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 5ms/step - loss: 0.1688 - mae: 0.1602 - val_loss: 0.1738 - val_mae: 0.1647 - learning_rate: 0.0010
Epoch 5/50
[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 5ms/step - loss: 0.1666 - mae: 0.1573 - val_loss: 0.1733 - val_mae: 0.1636 - learning_rate: 0.0010
Epoch 6/50
[1m18925/18925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 5ms/step - loss: 0.16



[1m4732/4732[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.1646 - mae: 0.1601
Test Loss: 0.1648799180984497, Test MAE: 0.1603059321641922


In [None]:
import numpy as np

# Predictions
y_pred = model.predict([X_test['UserIndex'], X_test['MovieIndex']])

# Define accuracy based on threshold
def calculate_accuracy(y_true, y_pred, threshold=0.5):
    y_pred = y_pred.flatten()  # Flatten predictions
    within_threshold = np.abs(y_true - y_pred) <= threshold
    accuracy = np.mean(within_threshold) * 100  # Convert to percentage
    return accuracy

# Calculate accuracy
accuracy_05 = calculate_accuracy(y_test, y_pred, threshold=0.5)
accuracy_1 = calculate_accuracy(y_test, y_pred, threshold=1.0)

print(f"Accuracy within ±0.5 rating: {accuracy_05:.2f}%")
print(f"Accuracy within ±1.0 rating: {accuracy_1:.2f}%")


[1m4732/4732[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step
Accuracy within ±0.5 rating: 97.06%
Accuracy within ±1.0 rating: 100.00%
