In [1]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl
import numpy as np


In [2]:
# Load the exhibit catalogue, keep only the four columns used downstream,
# drop rows with any missing value, and renumber the rows from 0 so that
# row labels and row positions coincide.
df = (
    pd.read_excel("data set - New.xlsx")
    .loc[:, ['exhibits', 'Text in English', 'Category', 'Time Period']]
    .dropna()
    .reset_index(drop=True)
    # Lower-cased, whitespace-trimmed exhibit name used as the lookup key.
    .assign(
        cleaned_exhibit=lambda frame: frame['exhibits'].astype(str).str.strip().str.lower()
    )
)
df

Unnamed: 0,exhibits,Text in English,Category,Time Period,cleaned_exhibit
0,the_female_peasent,The Femal Peasent: This statue is considered o...,Statue,Modern Egypt,the_female_peasent
1,Statue_ofthe_sphinx,"Statue of the Sphinx: \nStatue of Sphinx, whic...",Statue,Old Kingdom,statue_ofthe_sphinx
2,Hassan_Fathi,Hassan Fathi (1900-1989): Hassan Fathi was bo...,Architecture,Modern Egypt,hassan_fathi
3,Royal_Statues,Royal Statues: When the Ptolemaic state was es...,Statue,Ptolemaic,royal_statues
4,Greek_Statues,Greek Statues: The art of ancient Greece had a...,Statue,Hellenistic,greek_statues
5,Khonsu,Khonsu: Khonsu was the god of the moon and a m...,Statue,New Kingdom,khonsu
6,Ra_Horakhty,Ra-Horakhty: Ra-Horakhty represented a doctrin...,Statue,New Kingdom,ra_horakhty
7,Senenmut,Senenmut : Senenmut served as a close advisor ...,Statue,New Kingdom,senenmut
8,Box_ofthe_Holy Qur’an,Box of the Holy Qur'an: This huge box was made...,Artifact,Mamluk,box_ofthe_holy qur’an
9,The_HolyQuran,The Holy Quran: Finely calligraphed and bound ...,Artifact,Modern Egypt,the_holyquran


In [3]:
# Turn every exhibit description into a fixed-size BERT text embedding.
texts = df['Text in English'].astype(str).tolist()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Every description is padded or truncated to exactly 128 tokens so the whole
# list can be fed to the model as one batch of TensorFlow tensors.
tokens = tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors="tf")

# Only the token ids and the attention mask are passed to the model.
outputs = model(
    input_ids=tokens['input_ids'],
    attention_mask=tokens['attention_mask'],
)

# Keep the hidden state at sequence position 0 (the [CLS] slot) for each text
# and convert it to a NumPy array for the later feature concatenation.
cls_embeddings = outputs.last_hidden_state[:, 0, :]
text_embeddings = cls_embeddings.numpy()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:
# One-hot encode the two categorical metadata columns.
#
# Each column gets its own encoder: re-fitting a single encoder would discard
# the Category fit, making it impossible to transform new rows or to inspect
# `categories_` for that column afterwards.
category_encoder = OneHotEncoder(sparse_output=False)
time_period_encoder = OneHotEncoder(sparse_output=False)

# Labels are stripped before encoding, mirroring the `.strip()` that
# recommend() applies, so "Statue" and "Statue " share a single column.
category_encoded = category_encoder.fit_transform(
    df['Category'].astype(str).str.strip().to_frame()
)
time_period_encoded = time_period_encoder.fit_transform(
    df['Time Period'].astype(str).str.strip().to_frame()
)

# Backward-compatible alias: previously `encoder` ended up fitted on
# 'Time Period' (the last fit_transform call).
encoder = time_period_encoder

In [5]:
# Load the precomputed image features.
# NOTE(review): pickle executes arbitrary code on load; only open this file
# if it comes from a trusted source.
with open("all_image_features1.pkl", "rb") as feature_file:
    train_data_dict = pkl.load(feature_file)


In [6]:
# Build one image-feature row per exhibit, in the same order as `df`.
exhibit_names_from_df = df['exhibits'].tolist()

averaged_vectors = []      # mean feature vector per exhibit, or None if unavailable
missing_image_names = []   # exhibits with no entry (or an empty entry) in train_data_dict

for name in exhibit_names_from_df:
    if name in train_data_dict and len(train_data_dict[name]) > 0:
        # An exhibit can have several images; average their feature vectors.
        vectors = [entry['feature'] for entry in train_data_dict[name]]
        averaged_vectors.append(np.mean(np.stack(vectors), axis=0))
    else:
        averaged_vectors.append(None)
        missing_image_names.append(name)

# Size the zero placeholder from a real feature vector instead of assuming
# 2048 dimensions, so a different feature extractor does not produce a ragged
# array. 2048 is only the fallback when no exhibit has any image features.
template_vector = next((vec for vec in averaged_vectors if vec is not None), None)
if template_vector is None:
    zero_vector = np.zeros(2048)
else:
    zero_vector = np.zeros_like(template_vector, dtype=float)

image_features = np.array(
    [zero_vector if vec is None else vec for vec in averaged_vectors]
)

# Surface silent zero-filling: these exhibits contribute no image signal.
if missing_image_names:
    print(
        f"Warning: no image features for {len(missing_image_names)} exhibit(s); "
        f"using zero vectors: {missing_image_names}"
    )

# Safe normalization: scale each row to unit L2 norm, leaving all-zero rows
# untouched (their norm is replaced by 1 to avoid dividing by zero).
norms = np.linalg.norm(image_features, axis=1, keepdims=True)
norms[norms == 0] = 1
image_features = image_features / norms


In [7]:
# Combine all features.
# Column-wise concatenation: each row describes one exhibit as
# [text embedding | category one-hot | time-period one-hot | image features].
combined_features = np.concatenate(
    (text_embeddings, category_encoded, time_period_encoded, image_features),
    axis=1,
)


In [8]:
# Pairwise cosine similarity between the rows of combined_features:
# entry [i, j] compares exhibit row i with exhibit row j.
similarity_matrix = cosine_similarity(combined_features)

In [9]:
def recommend(
    exhibit_name, top_k=5,
    filter_category=None,
    filter_time_period=None
):
    """Recommend the exhibits most similar to ``exhibit_name``.

    Similarities are read from the precomputed ``similarity_matrix``; exhibit
    metadata comes from ``df``. Row labels of ``df`` are used as positions in
    the matrix, which holds because ``df`` was re-indexed from 0.

    Parameters
    ----------
    exhibit_name : str
        Exhibit to look up; matched case-insensitively after trimming
        surrounding whitespace.
    top_k : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.
    filter_category : str, optional
        If given, only exhibits whose stripped 'Category' equals this value
        are returned.
    filter_time_period : str, optional
        If given, only exhibits whose stripped 'Time Period' equals this
        value are returned.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation with columns Index, Exhibit, Category,
        Time Period and Similarity Score, ordered by decreasing similarity.
        Empty when the exhibit is unknown or nothing passes the filters.
    """
    cleaned_exhibit = exhibit_name.strip().lower()

    if cleaned_exhibit not in df['cleaned_exhibit'].values:
        print(f"❌ Exhibit '{exhibit_name}' not found.")
        prefix = cleaned_exhibit[:3]
        # An empty prefix would match every exhibit, so only suggest when
        # there is something to match on.
        if prefix:
            # regex=False: treat the user's text literally so characters such
            # as "(" or "[" cannot raise a regex compilation error.
            suggestions = df['exhibits'][
                df['cleaned_exhibit'].str.contains(prefix, regex=False)
            ]
            if not suggestions.empty:
                print("🔍 Did you mean:", ", ".join(suggestions.astype(str).values))
        return pd.DataFrame()

    exhibit_index = df[df['cleaned_exhibit'] == cleaned_exhibit].index[0]

    # Rank every exhibit by similarity to the query (stable sort, so ties keep
    # their original row order).
    ranked = sorted(
        enumerate(similarity_matrix[exhibit_index]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommendations = []
    for i, score in ranked:
        if len(recommendations) >= top_k:
            break
        # Exclude the query itself by index, not by rank position: a duplicate
        # or tied exhibit may legitimately sort ahead of it.
        if i == exhibit_index:
            continue

        exhibit = df.loc[i]
        category = exhibit['Category'].strip()
        time_period = exhibit['Time Period'].strip()

        if filter_category and category != filter_category:
            continue
        if filter_time_period and time_period != filter_time_period:
            continue

        recommendations.append({
            "Index": i,
            "Exhibit": exhibit['exhibits'],
            "Category": category,
            "Time Period": time_period,
            "Similarity Score": round(score, 4)
        })

    recommendations_df = pd.DataFrame(recommendations)
    print(f"\n📍 Top {top_k} Recommendations for: {df.loc[exhibit_index, 'exhibits']} (Index: {exhibit_index})")
    return recommendations_df


In [10]:
recommend("Hassan_Fathi", top_k=5)


📍 Top 5 Recommendations for: Hassan_Fathi (Index: 2)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,27,Mohamed_Talaat_Pasha_Harb_00,Statue,Modern Egypt,0.8407
1,0,the_female_peasent,Statue,Modern Egypt,0.8216
2,48,Zaynab_Khatun_house,Architecture,Mamluk,0.8184
3,30,Puplit _of_the_Mosque_of_Abu_Bakr_bin_Mazhar,Architecture,Mamluk,0.8064
4,8,Box_ofthe_Holy Qur’an,Artifact,Mamluk,0.7959


In [11]:
recommend("the_female_peasent", top_k=5)


📍 Top 5 Recommendations for: the_female_peasent (Index: 0)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,37,AQueen_in_the_form_of_the_Sphinx,Statue,Old Kingdom,0.8618
1,16,Pen_Menkh_TheGovernerOf_Dendara,Statue,Ptolemaic,0.8591
2,25,Mamluk_Lamps,Artifact,Mamluk,0.8575
3,17,TheCoffinOf_Lady_Isis,Statue,New Kingdom,0.8517
4,18,CoffinOf_Nedjemankh,Statue,Ptolemaic,0.8472


In [12]:
recommend("King_Fouad_I", top_k=5)


📍 Top 5 Recommendations for: King_Fouad_I (Index: 11)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,29,Muhammad_Ali_Pasha,Statue,Modern Egypt,0.8582
1,27,Mohamed_Talaat_Pasha_Harb_00,Statue,Modern Egypt,0.8024
2,26,Khedive_Ismail,Statue,Modern Egypt,0.7819
3,12,theVizier_Paser,Statue,New Kingdom,0.7807
4,2,Hassan_Fathi,Architecture,Modern Egypt,0.7671


In [13]:
recommend("King_Thutmose_III", top_k=5)


📍 Top 5 Recommendations for: King_Thutmose_III (Index: 10)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,7,Senenmut,Statue,New Kingdom,0.9109
1,41,Baker,Statue,Old Kingdom,0.8965
2,47,Stela_of_King_Qa'a,Statue,Old Kingdom,0.8944
3,44,Hapi_the_Scribe\n,Statue,New Kingdom,0.8923
4,17,TheCoffinOf_Lady_Isis,Statue,New Kingdom,0.8922


In [14]:

recommend("Khedive_Ismail", top_k=5)


📍 Top 5 Recommendations for: Khedive_Ismail (Index: 26)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,27,Mohamed_Talaat_Pasha_Harb_00,Statue,Modern Egypt,0.8543
1,29,Muhammad_Ali_Pasha,Statue,Modern Egypt,0.8478
2,35,king_Akhenaten,Statue,New Kingdom,0.8333
3,47,Stela_of_King_Qa'a,Statue,Old Kingdom,0.7842
4,14,Amun_Ra_Kingof_TheGods,Statue,New Kingdom,0.7824


In [15]:
recommend("Khonsu", top_k=5)


📍 Top 5 Recommendations for: Khonsu (Index: 5)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,42,The_Protective_Goddesses,Statue,New Kingdom,0.8856
1,47,Stela_of_King_Qa'a,Statue,Old Kingdom,0.8785
2,17,TheCoffinOf_Lady_Isis,Statue,New Kingdom,0.8571
3,19,TheCoffinOf_Sennedjem,Statue,New Kingdom,0.8569
4,10,King_Thutmose_III,Statue,New Kingdom,0.856


In [None]:
from sklearn.neighbors import NearestNeighbors


In [None]:
# Cosine-distance nearest-neighbour index over the combined feature matrix.
# n_neighbors=6 is the default query size: top_k (5) plus one, because the
# closest neighbour of an exhibit is the exhibit itself.
nn_model = NearestNeighbors(n_neighbors=6, metric='cosine').fit(combined_features)

# 7. Define Recommend Function
def recommend(exhibit_name, top_k=5, filter_category=None, filter_time_period=None):
    """Recommend the exhibits nearest to ``exhibit_name`` using ``nn_model``.

    The query vector is the exhibit's row in ``combined_features``; neighbours
    are retrieved from the fitted ``nn_model`` and reported with
    ``1 - distance`` as the similarity score. Row labels of ``df`` are used as
    positions in ``combined_features``, which holds because ``df`` was
    re-indexed from 0.

    Parameters
    ----------
    exhibit_name : str
        Exhibit to look up; matched case-insensitively after trimming
        surrounding whitespace.
    top_k : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.
    filter_category : str, optional
        If given, only exhibits whose stripped 'Category' equals this value
        are returned.
    filter_time_period : str, optional
        If given, only exhibits whose stripped 'Time Period' equals this
        value are returned.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation with columns Index, Exhibit, Category,
        Time Period and Similarity Score, nearest first. Empty when the
        exhibit is unknown or nothing passes the filters.
    """
    cleaned_exhibit = exhibit_name.strip().lower()

    if cleaned_exhibit not in df['cleaned_exhibit'].values:
        print(f"❌ Exhibit '{exhibit_name}' not found.")
        prefix = cleaned_exhibit[:3]
        # An empty prefix would match every exhibit, so only suggest when
        # there is something to match on.
        if prefix:
            # regex=False: treat the user's text literally so characters such
            # as "(" or "[" cannot raise a regex compilation error.
            suggestions = df['exhibits'][
                df['cleaned_exhibit'].str.contains(prefix, regex=False)
            ]
            if not suggestions.empty:
                print("🔍 Did you mean:", ", ".join(suggestions.astype(str).values))
        return pd.DataFrame()

    exhibit_index = df[df['cleaned_exhibit'] == cleaned_exhibit].index[0]
    exhibit_vector = combined_features[exhibit_index].reshape(1, -1)

    # The model's built-in n_neighbors (6) is only enough for an unfiltered
    # top-5. Size the query explicitly: with filters, any fitted sample may be
    # needed; without them, top_k + 1 (the extra one covers the query itself).
    n_fitted = nn_model.n_samples_fit_
    if filter_category or filter_time_period:
        n_query = n_fitted
    else:
        n_query = min(max(int(top_k), 0) + 1, n_fitted)

    distances, indices = nn_model.kneighbors(exhibit_vector, n_neighbors=n_query)

    recommendations = []
    for idx, dist in zip(indices[0], distances[0]):
        if len(recommendations) >= top_k:
            break
        # Exclude the query itself by index, not by rank position: a duplicate
        # or tied exhibit may legitimately be returned ahead of it.
        if idx == exhibit_index:
            continue

        exhibit = df.loc[idx]
        category = exhibit['Category'].strip()
        time_period = exhibit['Time Period'].strip()

        if filter_category and category != filter_category:
            continue
        if filter_time_period and time_period != filter_time_period:
            continue

        recommendations.append({
            "Index": idx,
            "Exhibit": exhibit['exhibits'],
            "Category": category,
            "Time Period": time_period,
            "Similarity Score": round(1 - dist, 4)  # similarity = 1 - distance for cosine
        })

    recommendations_df = pd.DataFrame(recommendations)
    print(f"\n📍 Top {top_k} Recommendations for: {df.loc[exhibit_index, 'exhibits']} (Index: {exhibit_index})")
    return recommendations_df

# 8. Test it
recommend("Hassan_Fathi", top_k=5)


📍 Top 5 Recommendations for: Hassan_Fathi (Index: 2)


Unnamed: 0,Index,Exhibit,Category,Time Period,Similarity Score
0,27,Mohamed_Talaat_Pasha_Harb_00,Statue,Modern Egypt,0.8407
1,0,the_female_peasent,Statue,Modern Egypt,0.8216
2,48,Zaynab_Khatun_house,Architecture,Mamluk,0.8184
3,30,Puplit _of_the_Mosque_of_Abu_Bakr_bin_Mazhar,Architecture,Mamluk,0.8064
4,8,Box_ofthe_Holy Qur’an,Artifact,Mamluk,0.7959
