# Packages

In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install tqdm scikit-learn

In [None]:
import os
import pickle
import random
import time
import zipfile
from io import BytesIO

import clip
import joblib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from torchvision import transforms
from tqdm import tqdm


# Data Loading

In [None]:
# Read the tab-separated photo metadata table.
df = pd.read_csv("/content/photos.tsv000", sep="\t")

# Keep only rows that have an image URL, then append the "?w=512" query string
# to every remaining URL.
urls = df["photo_image_url"].dropna().to_list()
image_urls = [photo_url + "?w=512" for photo_url in urls]

# Training Pipeline

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

In [None]:
# Optional cache: a zip archive of previously computed embedding chunks.
zip_path = "/content/clip_embeddings.zip"
extract_path = "/content"

# Only extract when the archive is actually present, so a fresh runtime without
# the uploaded cache does not stop the notebook with FileNotFoundError.
if os.path.isfile(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extracted cached embeddings from {zip_path} into {extract_path}")
else:
    print(f"No archive found at {zip_path}; nothing extracted.")

In [None]:
# Load the CLIP checkpoint onto the selected device; clip.load returns the model
# together with the preprocessing transform that belongs to it.
clip_model_name = "ViT-B/32"
model, preprocess = clip.load(clip_model_name, device=device)
# Put the model in evaluation mode (as the last expression of the cell, the
# returned module is also displayed).
model.eval()

In [None]:
BATCH_SIZE = 64      # images encoded per forward pass
CHUNK_SIZE = 1000    # URLs covered by one saved .pt file
SAVE_DIR = "/content/clip_embeddings"
os.makedirs(SAVE_DIR, exist_ok=True)


def _encode_batch(batch_images):
    """Encode a list of preprocessed image tensors and return L2-normalised features on the CPU."""
    input_batch = torch.stack(batch_images).to(device)
    with torch.no_grad():
        features = model.encode_image(input_batch)
        features = features / features.norm(dim=-1, keepdim=True)
    return features.cpu()


def _flush_batch(batch_images, batch_indices, image_embeddings, valid_indices):
    """Encode the pending batch into the accumulators, then clear the batch.

    Does nothing for an empty batch. The batch lists are cleared even when
    encoding fails, so one bad batch cannot grow past BATCH_SIZE and block
    every later flush.
    """
    if not batch_images:
        return
    try:
        image_embeddings.extend(_encode_batch(batch_images))
        valid_indices.extend(batch_indices)
    except Exception as e:
        print(f"Failed to encode batch ending at {batch_indices[-1]}: {e}")
    finally:
        batch_images.clear()
        batch_indices.clear()


# Chunk files already on disk are skipped, so this cell can be re-run to resume.
existing_files = set(os.listdir(SAVE_DIR))

for chunk_start in range(0, len(image_urls), CHUNK_SIZE):
    chunk_end = min(chunk_start + CHUNK_SIZE, len(image_urls))
    chunk_name = f"embeddings_{chunk_start}_{chunk_end}.pt"

    if chunk_name in existing_files:
        print(f"Skipping already saved: {chunk_name}")
        continue

    print(f"Processing chunk: {chunk_start} to {chunk_end}")

    image_embeddings = []   # one feature tensor per successfully processed image
    valid_indices = []      # position in image_urls of each stored embedding
    batch_images = []
    batch_indices = []

    start = time.time()

    for rel_idx, url in enumerate(tqdm(image_urls[chunk_start:chunk_end], desc="Processing Chunk")):
        idx = chunk_start + rel_idx
        try:
            response = requests.get(url, timeout=5)
            # Surface HTTP errors directly instead of as an image-decoding failure.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            processed = preprocess(img)
        except Exception as e:
            # Best effort: a single unreachable or corrupt image is skipped.
            print(f"Failed at {idx}: {e}")
            continue

        batch_images.append(processed)
        batch_indices.append(idx)

        if len(batch_images) >= BATCH_SIZE:
            _flush_batch(batch_images, batch_indices, image_embeddings, valid_indices)

    # Encode whatever is left. Doing this after the loop (not on "last index")
    # keeps the final partial batch even when the last URL of the chunk failed.
    _flush_batch(batch_images, batch_indices, image_embeddings, valid_indices)

    if image_embeddings:
        save_path = os.path.join(SAVE_DIR, chunk_name)
        torch.save({
            "embeddings": torch.stack(image_embeddings),
            "indices": valid_indices
        }, save_path)
        print(f"Saved {len(image_embeddings)} embeddings to {save_path}")

    torch.cuda.empty_cache()

    end = time.time()
    print(f"Chunk {chunk_start}-{chunk_end} done in {(end - start)/60:.2f} minutes.")

    # Exit after one chunk (remove this break if you want to process all at once)
    # break


In [None]:
# Gather every saved chunk (in sorted filename order) into one embedding matrix
# plus the parallel list of source indices.
embedding_dir = "/content/clip_embeddings/"
all_embeddings = []
all_indices = []

chunk_files = [name for name in sorted(os.listdir(embedding_dir)) if name.endswith(".pt")]
for chunk_file in chunk_files:
    chunk = torch.load(os.path.join(embedding_dir, chunk_file))
    all_embeddings.append(chunk["embeddings"])
    all_indices += chunk["indices"]

# Concatenate along the first axis and convert to a NumPy array.
stacked_embeddings = torch.cat(all_embeddings, dim=0)
embedding_matrix = stacked_embeddings.numpy()
print(f"Loaded {embedding_matrix.shape[0]} embeddings.")

In [None]:
# Fit a cosine-distance nearest-neighbour index over the embedding matrix
# (fit returns the estimator itself, so the call can be chained).
knn = NearestNeighbors(metric="cosine", n_neighbors=6).fit(embedding_matrix)
print("KNN model fitted.")

In [None]:
image_urls = df["photo_image_url"].dropna().tolist()
image_urls = [url + "?w=512" for url in image_urls]

# image_thumbnails gets exactly one entry per element of all_indices:
# the URL when it answers with HTTP 200, otherwise None.
image_thumbnails = []
unavailable_count = 0

for idx in tqdm(all_indices, desc="🔗 Storing thumbnail URLs only"):
    thumbnail_url = None
    try:
        # stream=True defers downloading the body: only the status code is
        # needed here, so the image bytes are never fetched. The context
        # manager releases the connection afterwards.
        with requests.get(image_urls[idx], timeout=5, stream=True) as response:
            if response.status_code == 200:
                thumbnail_url = image_urls[idx]
    except Exception:
        # Best effort: any failure marks this entry as unavailable (None).
        thumbnail_url = None

    if thumbnail_url is None:
        unavailable_count += 1
    image_thumbnails.append(thumbnail_url)

print(f"{unavailable_count} of {len(image_thumbnails)} thumbnails are unavailable.")

# Output

In [None]:
def load_image_and_show_matches(url,
                                model,
                                knn,
                                image_thumbnails,
                                device,
                                n_neighbors=30,
                                top_k=6,
                                preprocess=None,
                                timeout=10):
    """Embed the image at ``url`` with CLIP, look up its nearest neighbours and plot them.

    The input image is drawn in the top row of a grid and up to ``top_k``
    retrievable matches are drawn in the rows below it.

    Args:
        url (str): URL of the input image.
        model: Loaded CLIP model; must provide ``encode_image``.
        knn: Fitted NearestNeighbors model over the image embeddings.
        image_thumbnails (list): Thumbnail URL (or None) per embedding row,
            in the same order as the rows the KNN model was fitted on.
        device: Device the input tensor is moved to ('cuda' or 'cpu').
        n_neighbors (int, optional): Number of candidates to retrieve. Defaults to 30.
        top_k (int, optional): Number of matches to display. Defaults to 6.
        preprocess (callable, optional): Transform turning a PIL image into a
            model input tensor. Pass the transform returned by ``clip.load`` so
            the query is processed exactly like the indexed images. When None,
            a CLIP-style transform is built (see below).
        timeout (float, optional): Timeout in seconds for every HTTP request.
            Defaults to 10.

    Raises:
        requests.RequestException: If the input image cannot be downloaded.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    input_image = Image.open(BytesIO(response.content)).convert("RGB")

    if preprocess is None:
        # The index embeddings are produced with CLIP's own transform, so the
        # query must not use ImageNet statistics. This mirrors the transform in
        # the openai/CLIP package: bicubic resize, centre crop, CLIP mean/std.
        # NOTE(review): falls back to 224 px when the model does not expose
        # visual.input_resolution — confirm for non-standard models.
        resolution = getattr(getattr(model, "visual", None), "input_resolution", 224)
        preprocess = transforms.Compose([
            transforms.Resize(resolution,
                              interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(resolution),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                 std=[0.26862954, 0.26130258, 0.27577711]),
        ])
    input_tensor = preprocess(input_image).unsqueeze(0).to(device)

    with torch.no_grad():
        input_embedding = model.encode_image(input_tensor)
        input_embedding = input_embedding / input_embedding.norm(dim=-1, keepdim=True)
    input_embedding_np = input_embedding.cpu().numpy()

    # kneighbors rejects requests for more neighbours than fitted samples.
    n_fitted = getattr(knn, "n_samples_fit_", len(image_thumbnails))
    n_neighbors = min(n_neighbors, n_fitted)
    _, neighbor_indices = knn.kneighbors(input_embedding_np, n_neighbors=n_neighbors)

    def resize_image(img, size=(224, 224)):
        return img.resize(size)

    # One row for the input image plus enough rows of three for top_k matches
    # (top_k=6 gives the original 3x3 grid and 10x9 figure).
    columns = 3
    match_rows = max(1, (top_k + columns - 1) // columns)
    rows = 1 + match_rows

    plt.figure(figsize=(10, 3 * rows))
    plt.subplot(rows, columns, 2)
    plt.imshow(resize_image(input_image))
    plt.title("Your Input")
    plt.axis("off")

    match_count = 0

    for idx in neighbor_indices[0]:
        if match_count >= top_k:
            break
        thumbnail_url = image_thumbnails[idx]
        if thumbnail_url is None:
            continue
        try:
            match_response = requests.get(thumbnail_url, timeout=timeout)
            match_response.raise_for_status()
            match_img = Image.open(BytesIO(match_response.content)).convert("RGB")
        except Exception as exc:
            # Best effort: skip a match that cannot be fetched or decoded and
            # try the next candidate. (Exception, not a bare except, so that
            # KeyboardInterrupt still stops the cell.)
            print(f"Skipping match {idx}: {exc}")
            continue

        plt.subplot(rows, columns, columns + 1 + match_count)
        plt.imshow(resize_image(match_img))
        plt.title(f"Match {match_count + 1}")
        plt.axis("off")
        match_count += 1

    plt.tight_layout(pad=0.5)
    plt.subplots_adjust(wspace=0.05, hspace=0.3)
    plt.show()


In [None]:
# Query the index with an example photo. The objects created by the earlier
# cells (model, knn, image_thumbnails) are used: the *_loaded variants are only
# assigned in a later cell, and `model_loaded` is not defined anywhere in the
# visible notebook.
load_image_and_show_matches(
    url="https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg",
    model=model,
    knn=knn,
    image_thumbnails=image_thumbnails,
    device=device,
    n_neighbors=30,
    top_k=6
)


In [None]:
# Bundle the fitted KNN model with its aligned arrays/lists and pickle the
# whole dictionary into a single file.
file_name = 'similar_image_model.sav'
model_data = dict(
    knn_model=knn,
    embedding_matrix=embedding_matrix,
    image_thumbnails=image_thumbnails,
    all_indices=all_indices,
    image_urls=image_urls,
)

with open(file_name, 'wb') as out_file:
    pickle.dump(model_data, out_file)

# Evaluation

The KNN model, embedding matrix, thumbnail list, indices, and image URLs have been saved to a single `.sav` pickle file. We now load them back and run the evaluation on the loaded objects.

In [None]:
# Read the pickled bundle back. Unpickling can execute arbitrary code, so only
# load a file that this notebook wrote itself.
with open(file_name, 'rb') as in_file:
    loaded_model = pickle.load(in_file)

# Unpack each stored entry into its own *_loaded variable.
(knn_loaded,
 embedding_matrix_loaded,
 image_thumbnails_loaded,
 all_indices_loaded,
 image_urls_loaded) = (
    loaded_model[key]
    for key in ('knn_model', 'embedding_matrix', 'image_thumbnails', 'all_indices', 'image_urls')
)

### CLIP Similarity Score

In [None]:
def compute_clip_similarity_scores(embedding_matrix, knn_model, num_queries=1000, k=6,
                                   exclude_self=False):
    """Average cosine similarity between random query embeddings and their retrieved neighbours.

    Queries are rows sampled at random from ``embedding_matrix``. For each one
    the ``k`` nearest neighbours are retrieved from ``knn_model`` and the mean
    cosine similarity between the query and those neighbours is recorded.

    Note: when ``knn_model`` was fitted on the same matrix, every query is
    normally retrieved as its own neighbour (similarity 1.0), which raises the
    average. Pass ``exclude_self=True`` to drop the query row from its results.

    Args:
        embedding_matrix: Indexable 2-D array of embeddings, one row per image.
        knn_model: Fitted neighbour model providing ``kneighbors``.
        num_queries (int, optional): Number of rows to sample. Capped at the
            number of rows available. Defaults to 1000.
        k (int, optional): Neighbours averaged per query. Defaults to 6.
        exclude_self (bool, optional): Remove the query's own row from its
            neighbour list before averaging. Defaults to False, which keeps the
            original behaviour.

    Returns:
        tuple: ``(clip_scores, query_indices, score_lookup)`` — the list of
        per-query averages, the sampled row indices in the same order, and a
        dict mapping each sampled index to its average.
    """
    total_images = len(embedding_matrix)
    # random.sample raises ValueError when asked for more items than exist,
    # which the default of 1000 does on a small index.
    num_queries = min(num_queries, total_images)
    query_indices = random.sample(range(total_images), num_queries)

    # One extra neighbour is fetched when the self-match will be removed; the
    # request is capped because kneighbors cannot return more rows than exist.
    n_requested = min(k + 1 if exclude_self else k, total_images)

    clip_scores = []
    score_lookup = {}

    for query_idx in query_indices:
        query_embed = embedding_matrix[query_idx].reshape(1, -1)
        retrieved_indices = knn_model.kneighbors(query_embed, n_neighbors=n_requested)[1][0]
        if exclude_self:
            retrieved_indices = retrieved_indices[retrieved_indices != query_idx][:k]

        if len(retrieved_indices) == 0:
            # Nothing left to compare against (e.g. a one-row index).
            avg_sim = float("nan")
        else:
            retrieved_embeds = embedding_matrix[retrieved_indices]
            sim_scores = cosine_similarity(query_embed, retrieved_embeds)[0]
            avg_sim = np.mean(sim_scores)

        clip_scores.append(avg_sim)
        score_lookup[query_idx] = avg_sim

    return clip_scores, query_indices, score_lookup

In [None]:
def plot_clip_similarity(clip_scores):
    """Print mean/max/min of the scores and draw a histogram beside a horizontal boxplot.

    Args:
        clip_scores: Sequence of per-query average similarity values.
    """
    mean_score = np.mean(clip_scores)
    lowest_score = np.min(clip_scores)
    highest_score = np.max(clip_scores)

    print(f"✅ CLIP Similarity Summary (Top 6 avg over {len(clip_scores)} queries):")
    print(f"Average Score : {mean_score:.4f}")
    print(f"Max Score     : {highest_score:.4f}")
    print(f"Min Score     : {lowest_score:.4f}")

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: histogram with a dashed red line at the mean.
    hist_ax.hist(clip_scores, bins=25, color='skyblue', edgecolor='black')
    hist_ax.axvline(mean_score, color='red', linestyle='dashed', linewidth=1)
    hist_ax.set_title("CLIP Similarity Histogram")
    hist_ax.set_xlabel("Avg CLIP Cosine Similarity (Top 6)")
    hist_ax.set_ylabel("Number of Queries")

    # Right panel: horizontal boxplot of the same values.
    box_ax.boxplot(clip_scores, vert=False)
    box_ax.set_title("Boxplot of CLIP Similarities")
    box_ax.set_xlabel("Avg CLIP Cosine Similarity")

    fig.tight_layout()
    plt.show()


### Intra-query diversity score

In [None]:
def compute_intra_query_diversity(embedding_matrix, knn_model, query_indices, k=6):
    """Return, per query, one minus the mean pairwise cosine similarity of its k retrieved rows.

    Args:
        embedding_matrix: Indexable 2-D array of embeddings, one row per image.
        knn_model: Fitted neighbour model providing ``kneighbors``.
        query_indices: Row indices of ``embedding_matrix`` to use as queries.
        k (int, optional): Neighbours retrieved per query. Defaults to 6.

    Returns:
        list: One diversity value per entry of ``query_indices``, in order.
    """

    def _diversity_for(query_idx):
        query_vec = embedding_matrix[query_idx].reshape(1, -1)
        _, neighbour_ids = knn_model.kneighbors(query_vec, n_neighbors=k)
        pairwise = cosine_similarity(embedding_matrix[neighbour_ids[0]])
        # Strict upper triangle: every unordered pair once, diagonal excluded.
        row_ids, col_ids = np.triu_indices(pairwise.shape[0], k=1)
        return 1 - np.mean(pairwise[row_ids, col_ids])

    return [_diversity_for(query_idx) for query_idx in query_indices]


In [None]:
def plot_diversity_scores(diversity_scores):
    """Print the mean diversity score and show a histogram of all scores.

    Args:
        diversity_scores: Sequence of per-query diversity values.
    """
    mean_diversity = np.mean(diversity_scores)
    print(f"✅ Average Diversity Score: {mean_diversity:.4f}")

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(diversity_scores, bins=25, color='lightgreen', edgecolor='black')
    ax.set_title("Intra-query Diversity Score Distribution (1 - Avg Pairwise Similarity)")
    ax.set_xlabel("Diversity Score (0 = very similar, 1 = very different)")
    ax.set_ylabel("Number of Queries")
    ax.grid(True)
    plt.show()


### CLIP Similarity vs. Intra-query Diversity Scores

In [None]:
def plot_similarity_vs_diversity(clip_scores, diversity_scores):
    """Scatter-plot each query's average similarity (x) against its diversity score (y).

    Args:
        clip_scores: Sequence of per-query average similarity values.
        diversity_scores: Sequence of per-query diversity values, same order.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(clip_scores, diversity_scores, alpha=0.6, color='purple')
    ax.set_title("CLIP Similarity vs. Intra-query Diversity")
    ax.set_xlabel("Avg CLIP Similarity (Top 6)")
    ax.set_ylabel("Diversity Score")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Queries are drawn with the `random` module; seed it so the sampled query set
# (and every score derived from it) is the same on each run.
EVAL_SEED = 42
random.seed(EVAL_SEED)

clip_scores, query_indices, score_lookup = compute_clip_similarity_scores(
    embedding_matrix=embedding_matrix_loaded,
    knn_model=knn_loaded,
    num_queries=1000,
    k=6
)

# Reuse the same query indices so both metrics describe the same queries.
diversity_scores = compute_intra_query_diversity(
    embedding_matrix=embedding_matrix_loaded,
    knn_model=knn_loaded,
    query_indices=query_indices,
    k=6
)




In [None]:
# Histogram of the per-query diversity scores.
plot_diversity_scores(diversity_scores=diversity_scores)


In [None]:
# Summary statistics, histogram and boxplot of the per-query similarity scores.
plot_clip_similarity(clip_scores=clip_scores)


In [None]:
# Scatter plot relating the two per-query metrics.
plot_similarity_vs_diversity(
    clip_scores=clip_scores,
    diversity_scores=diversity_scores,
)


### t-SNE Visualization

In [None]:
def plot_tsne_for_multiple_queries(embedding_matrix, knn_model, num_queries=50, background_size=2000, k=6):
    """Project sampled queries, their neighbours and random background images to 2-D with t-SNE.

    Random query rows and random background rows are drawn from
    ``embedding_matrix``; the union of queries, their ``k`` retrieved
    neighbours and the background is embedded with t-SNE and plotted.
    Background points are light gray, each query is an 'X' marker, and its
    neighbours share the query's colour.

    Args:
        embedding_matrix: Indexable 2-D array of embeddings, one row per image.
        knn_model: Fitted neighbour model providing ``kneighbors``.
        num_queries (int, optional): Queries to sample. Defaults to 50.
        background_size (int, optional): Background rows to sample. Defaults to 2000.
        k (int, optional): Neighbours retrieved per query. Defaults to 6.

    All three sizes are capped at the number of available rows.

    Raises:
        ValueError: If fewer than two distinct points are selected.
    """
    total_images = len(embedding_matrix)

    # random.sample and kneighbors both fail when asked for more items than
    # exist, so cap every requested size at the dataset size.
    num_queries = min(num_queries, total_images)
    background_size = min(background_size, total_images)
    k = min(k, total_images)

    query_sample_indices = random.sample(range(total_images), num_queries)

    # Retrieve each query's neighbours once and reuse them when plotting.
    neighbors_by_query = {}
    for q_idx in query_sample_indices:
        retrieved = knn_model.kneighbors(
            embedding_matrix[q_idx].reshape(1, -1), n_neighbors=k)[1][0]
        neighbors_by_query[q_idx] = [int(i) for i in retrieved]

    background_indices = random.sample(range(total_images), background_size)

    selected = set(background_indices)
    for q_idx, matches in neighbors_by_query.items():
        selected.add(q_idx)
        selected.update(matches)
    # Sorted so the row order handed to t-SNE is deterministic.
    point_indices = sorted(selected)

    if len(point_indices) < 2:
        raise ValueError("t-SNE needs at least two distinct embeddings to project.")

    # scikit-learn requires perplexity to be smaller than the number of samples.
    perplexity = min(30, len(point_indices) - 1)

    embeds = embedding_matrix[point_indices]
    tsne_result = TSNE(n_components=2, perplexity=perplexity, random_state=42).fit_transform(embeds)
    index_to_2d = {idx: tsne_result[i] for i, idx in enumerate(point_indices)}

    colors = cm.rainbow(np.linspace(0, 1, num_queries))
    plt.figure(figsize=(12, 10))

    if background_indices:
        bg_coords = np.array([index_to_2d[idx] for idx in background_indices])
        plt.scatter(bg_coords[:, 0], bg_coords[:, 1], c='lightgray', s=10, label='Other Images')

    for i, q_idx in enumerate(query_sample_indices):
        color = colors[i]
        q_coord = index_to_2d[q_idx]
        plt.scatter(q_coord[0], q_coord[1], marker='X', color=color, s=100, edgecolor='black', label=f"Query {i+1}")

        for match_idx in neighbors_by_query[q_idx]:
            match_coord = index_to_2d[match_idx]
            plt.scatter(match_coord[0], match_coord[1], color=color, s=40)

    # The scatter calls carry labels; render them only while the legend stays
    # readable.
    if num_queries <= 10:
        plt.legend()

    plt.title(f"t-SNE: {num_queries} Queries and Their Top-{k} Matches")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# t-SNE view of a single sampled query against 2000 background images.
tsne_settings = {
    "embedding_matrix": embedding_matrix_loaded,
    "knn_model": knn_loaded,
    "num_queries": 1,
    "background_size": 2000,
    "k": 6,
}
plot_tsne_for_multiple_queries(**tsne_settings)
