Initial tests to check that the code is working

In [None]:
# python
import os
import numpy as np
import pandas as pd
from transformers import CLIPModel, CLIPProcessor
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# local
from k9kmeans.utils import load_and_preprocess_image, get_embeddings

In [None]:
# Run configuration shared by the cells below.
# Number of images handed to get_embeddings per call.
batch_size = 32
# Device the CLIP model is moved to and that get_embeddings is told to use.
device = 'cpu'
# Upper bound on how many image files are picked up from the image directory.
N_LIMIT = 100
# Number of clusters requested from KMeans.
n_clusters = 5
# CSV file the filename/cluster table is written to at the end of the notebook.
outfile = '/home/sdysch/Documents/repos/k9_means/data/results_initial.csv'

# Load model

In [None]:
# The same checkpoint name is used for both the processor and the model weights.
model_name = 'openai/clip-vit-base-patch32'
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)
# Move the loaded weights onto the configured device.
model = model.to(device)

# Load subset of images

In [None]:
base_path = '/home/sdysch/Documents/repos/k9_means/images/'
# Suffixes (compared case-insensitively) that mark a directory entry as an image.
image_suffixes = ('.jpg', '.jpeg', '.png', '.webp')
# os.listdir returns entries in arbitrary order, so sort before truncating;
# otherwise the N_LIMIT subset (and everything computed from it) can differ
# between runs and machines.
filenames = [
    os.path.join(base_path, f)
    for f in sorted(os.listdir(base_path))
    if f.lower().endswith(image_suffixes)
][:N_LIMIT]

In [None]:
# Load every selected file; the resulting list is in the same order as `filenames`.
images = list(map(load_and_preprocess_image, filenames))

# Run process

## Get embeddings

In [None]:
# Embed the images in slices of `batch_size` and stack the per-batch results
# row-wise into a single array.
batch_starts = range(0, len(images), batch_size)
all_embeddings = np.vstack([
    get_embeddings(images[start:start + batch_size], processor, model, device=device)
    for start in tqdm(batch_starts, desc='Extracting embeddings')
])
print(f'Embeddings shape: {all_embeddings.shape}')

## Reduce dimensions

In [None]:
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the target of 50 to keep this cell working on small image subsets.
n_components = min(50, *all_embeddings.shape)
# Seeded because scikit-learn's automatic solver selection may use the
# randomized SVD solver, whose output otherwise varies between runs.
pca = PCA(n_components=n_components, random_state=42)
reduced_embeddings = pca.fit_transform(all_embeddings)

## Clustering

In [None]:
# Fit k-means on the PCA-reduced embeddings, then read the per-sample cluster
# ids off the fitted estimator.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(reduced_embeddings)
labels = kmeans.labels_

## Visualisation

In [None]:
# Separate 2-component projection of the full embeddings, used only for plotting.
# Seeded (matching the KMeans seed) so the scatter layout is reproducible even
# if scikit-learn selects its randomized SVD solver.
pca_2d = PCA(n_components=2, random_state=42)
emb_2d = pca_2d.fit_transform(all_embeddings)

# One row per image: the two plot coordinates plus its k-means label.
df_clusters = pd.DataFrame({
    'x': emb_2d[:, 0],
    'y': emb_2d[:, 1],
    'cluster': labels,
})

In [None]:
# Scatter of the 2-D projection, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
marker_style = dict(palette='tab10', s=50, alpha=0.8)
# Draw onto the axes created above (the current axes at this point).
sns.scatterplot(data=df_clusters, x='x', y='y', hue='cluster', ax=ax, **marker_style)
ax.legend(title='Cluster', loc='best')

## Save results for Streamlit visualisation

In [None]:
# Table pairing each image path with its cluster label, written to `outfile`.
# to_csv is called with its defaults, so the row index is written as well.
df_results = pd.DataFrame({'filename': filenames, 'cluster': labels})
df_results.to_csv(outfile)