In [84]:
# --- 1. Imports ---
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC, Accuracy
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# --- 2. Load Data ---
ratings_df = pd.read_csv("bbalanced_simulated_ratings_50_users_renamed.csv")
exhibits_df = pd.read_excel("data set - New.xlsx")
with open("all_image_features1.pkl", "rb") as f:
    image_features_dict = pkl.load(f)


In [85]:
exhibits_df

Unnamed: 0,exhibits,Text in Arabic,Text in English,Category,Time Period,Exhibit_id
0,the_female_peasent,تمثال الفلاحة :يعتبر هذا التمثال من الأعمال ال...,The Femal Peasent: This statue is considered o...,Statue,Modern Egypt,1
1,Statue_ofthe_sphinx,تمثال لأبي الهول: تمثال مصغر لأبي الهول لا يوج...,"Statue of the Sphinx: \nStatue of Sphinx, whic...",Statue,Old Kingdom,2
2,Hassan_Fathi,حسن فتحی ( ۱۹۰۰ م- ۱۹۸۹ م ): ولد في الأسكندرية...,Hassan Fathi (1900-1989): Hassan Fathi was bo...,Architecture,Modern Egypt,3
3,Royal_Statues,التماثيل المالكية : عند قامت دولة البطالمة في ...,Royal Statues: When the Ptolemaic state was es...,Statue,Ptolemaic,4
4,Greek_Statues,تماثيل يونانية:كان للفن في اليونان القديمة أثر...,Greek Statues: The art of ancient Greece had a...,Statue,Hellenistic,5
5,Khonsu,خونسو: كان خونسو إلها للقمر وأحد أفراد ثالوث م...,Khonsu: Khonsu was the god of the moon and a m...,Statue,New Kingdom,6
6,Ra_Horakhty,رع حور أختي: كان رع حور أختي يمثل أتحادا عقائد...,Ra-Horakhty: Ra-Horakhty represented a doctrin...,Statue,New Kingdom,7
7,Senenmut,سننموت : عمل سننموت كمستشار مقرب ومعماري للملك...,Senenmut : Senenmut served as a close advisor ...,Statue,New Kingdom,8
8,Box_ofthe_Holy Qur’an,صندوق المصحف الشريف : صندوق لحفظ المصحف الشريف...,Box of the Holy Qur'an: This huge box was made...,Artifact,Mamluk,9
9,The_HolyQuran,المصحف الشريف : ...,The Holy Quran: Finely calligraphed and bound ...,Artifact,Modern Egypt,10


In [86]:
ratings_df

Unnamed: 0,user_id,exhibit_id,rating
0,user_1,45,1
1,user_1,32,1
2,user_1,5,1
3,user_1,50,1
4,user_1,1,1
...,...,...,...
995,user_50,33,0
996,user_50,22,0
997,user_50,4,0
998,user_50,5,0


In [131]:


ratings_df = pd.read_csv("bbalanced_simulated_ratings_50_users_renamed.csv")
exhibits_df = pd.read_excel("data set - New.xlsx")
with open("all_image_features1.pkl", "rb") as f:
    image_features_dict = pkl.load(f)

exhibits_df = exhibits_df[['Exhibit_id', 'exhibits', 'Text in English', 'Category', 'Time Period']].dropna()

# ---  Image Features ---
image_features = []
for name in exhibits_df['exhibits']:
    if name in image_features_dict and len(image_features_dict[name]) > 0:
        vectors = [entry['feature'] for entry in image_features_dict[name]]
        avg_vector = np.mean(np.stack(vectors), axis=0)
    else:
        avg_vector = np.zeros(2048)
    image_features.append(avg_vector)
image_features = np.array(image_features)
image_features /= np.linalg.norm(image_features, axis=1, keepdims=True) + 1e-10

# ---  Text + Metadata Features ---
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
texts = exhibits_df['Text in English'].astype(str).tolist()
tokens = tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors="tf")
bert_output = bert_model(input_ids=tokens['input_ids'], attention_mask=tokens['attention_mask'])
text_embeddings = bert_output.last_hidden_state[:, 0, :].numpy()

encoder = OneHotEncoder(sparse_output=False)
category_encoded = encoder.fit_transform(exhibits_df[['Category']])
time_period_encoded = encoder.fit_transform(exhibits_df[['Time Period']])
combined_features = np.concatenate([text_embeddings, category_encoded, time_period_encoded, image_features], axis=1)

# ---  Match Features to Exhibit IDs ---
exhibit_to_vector = dict(zip(exhibits_df['Exhibit_id'], combined_features))
ratings_df['features'] = ratings_df['exhibit_id'].map(exhibit_to_vector)
ratings_df.dropna(subset=['features'], inplace=True)

# ---  Encode Users & Items ---
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['user_id'])
ratings_df['item_id_encoded'] = item_encoder.fit_transform(ratings_df['exhibit_id'])

ratings_df['stratify_col'] = ratings_df['user_id_encoded'].astype(str) + "_" + ratings_df['rating'].astype(str)

train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
    stratify=ratings_df['stratify_col']
)

train_df.drop(columns=['stratify_col'], inplace=True)
test_df.drop(columns=['stratify_col'], inplace=True)

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)  # 16% validation



def prepare_inputs(data):
    return [
        np.array(data['user_id_encoded']),
        np.array(data['item_id_encoded']),
        np.stack(data['features'].values)
    ], np.array(data['rating']).astype('float32')

train_inputs, train_labels = prepare_inputs(train_df)
val_inputs, val_labels = prepare_inputs(val_df)
test_inputs, test_labels = prepare_inputs(test_df)




# --- 9. Build the Model (No Dropout) ---
num_users = ratings_df['user_id_encoded'].nunique()
num_items = ratings_df['item_id_encoded'].nunique()
feature_dim = combined_features.shape[1]

user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')
features_input = Input(shape=(feature_dim,), name='features_input')

user_embedding = Embedding(input_dim=num_users, output_dim=64)(user_input)
item_embedding = Embedding(input_dim=num_items, output_dim=64)(item_input)
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

from tensorflow.keras.layers import Dropout  # Make sure this is imported

# Collaborative Filtering Branch
cf_concat = Concatenate()([user_vec, item_vec])
cf_dense = Dense(128, activation='relu')(cf_concat)
cf_dropout = Dropout(0.3)(cf_dense)
cf_out = Dense(64, activation='relu')(cf_dropout)

# Content-Based Branch
cb_dense1 = Dense(256, activation='relu')(features_input)
cb_dropout1 = Dropout(0.3)(cb_dense1)
cb_dense2 = Dense(128, activation='relu')(cb_dropout1)
cb_dropout2 = Dropout(0.3)(cb_dense2)
cb_out = Dense(64, activation='relu')(cb_dropout2)

# Fusion + Deep Feedforward Layers
fusion = Concatenate()([cf_out, cb_out])

fusion_dense1 = Dense(512, activation='relu')(fusion)
fusion_dropout1 = Dropout(0.3)(fusion_dense1)

fusion_dense2 = Dense(256, activation='relu')(fusion_dropout1)
fusion_dropout2 = Dropout(0.3)(fusion_dense2)

fusion_dense3 = Dense(64, activation='relu')(fusion_dropout2)
fusion_dropout3 = Dropout(0.3)(fusion_dense3)

# Final prediction layer
output = Dense(1, activation='sigmoid')(fusion_dropout3)


# Model setup
model = Model(inputs=[user_input, item_input, features_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=[Accuracy(name="accuracy"), AUC(name="auc")])

model.summary()


# --- 10. Train ---
model.fit(train_inputs, train_labels, validation_data=(val_inputs, val_labels), epochs=50, batch_size=32)



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.0000e+00 - auc: 0.4671 - loss: 0.7001 - val_accuracy: 0.0000e+00 - val_auc: 0.6337 - val_loss: 0.6855
Epoch 2/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0000e+00 - auc: 0.5245 - loss: 0.6951 - val_accuracy: 0.0000e+00 - val_auc: 0.6839 - val_loss: 0.6794
Epoch 3/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.0000e+00 - auc: 0.6457 - loss: 0.6723 - val_accuracy: 0.0000e+00 - val_auc: 0.6886 - val_loss: 0.6481
Epoch 4/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0000e+00 - auc: 0.6806 - loss: 0.6497 - val_accuracy: 0.0000e+00 - val_auc: 0.6862 - val_loss: 0.6480
Epoch 5/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0000e+00 - auc: 0.6969 - loss: 0.6338 - val_accuracy: 0.0000e+00 - val_auc: 0.6933 - val_loss: 0.

<keras.src.callbacks.history.History at 0x7afd274ddc50>

In [140]:
# Predict probabilities (between 0 and 1)
y_probs = model.predict(test_inputs)

# Convert probabilities to binary predictions using threshold (e.g., 0.5)
y_preds = (y_probs >= 0.5).astype(int).flatten()


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score



# Actual labels
y_true = test_labels

# Calculate each metric
precision = precision_score(y_true, y_preds)
recall = recall_score(y_true, y_preds)
accuracy = accuracy_score(y_true, y_preds)
auc = roc_auc_score(y_true, y_probs)
f1 = 2 * (precision * recall) / (precision + recall + 1e-10)


# Print results
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"Accuracy:  {accuracy:.4f}")
print(f"AUC:       {auc:.4f}")
print(f"F1 Score:  {f1:.4f}")


In [142]:
model.save("hybrid_recommender_model.h5")


