## Install dependencies

In [None]:
# install dependencies for huggingfface
!pip install torch
!pip install transformers
!pip install sentence-transformers
!pip install tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.1.0


## Test sentence transformers

In [None]:
# Cosine similarity between adjacent chunks
import numpy as np
from sentence_transformers import SentenceTransformer, util
import json

# Lightweight Embedding and Chunking Evaluation Script
import numpy as np
import matplotlib.pyplot as plt
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import random


In [None]:
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import json

def calculate_embeddings(all_data, batch_size=32):
    """Attach a sentence embedding to every chunk in ``all_data``.

    Args:
        all_data: Indexable sequence of dicts, each holding the chunk text
            under the ``"text"`` key (assumed to be a single string per
            chunk). The dicts are modified in place.
        batch_size: Number of texts encoded per forward pass. Encoding in
            batches avoids one model call per chunk.

    Returns:
        The same ``all_data`` object; every dict has gained an
        ``"embedding"`` entry containing the vector as a plain list of
        floats, so the result can be serialized to JSON.
    """
    if len(all_data) == 0:
        # Nothing to encode; skip loading the model altogether.
        return all_data

    # Load model on GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

    # Encode all texts in one batched call instead of one call per chunk.
    texts = [chunk["text"] for chunk in all_data]
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
    )

    # Move embeddings back to CPU before converting to (JSON-friendly) lists.
    for chunk, vector in zip(all_data, embeddings.cpu().tolist()):
        chunk["embedding"] = vector

    return all_data

def save_as_json(all_data, file_path):
    """Serialize ``all_data`` to ``file_path`` as JSON indented by two spaces.

    Any existing file at ``file_path`` is overwritten.
    """
    with open(file_path, mode='w') as out_file:
        json.dump(all_data, out_file, indent=2)

In [None]:
import json

# Read the article chunks from disk.
with open("/content/all_article_chunks.json", 'r') as chunk_file:
    groups = json.load(chunk_file)

# calculate_embeddings returns the chunk dicts with an "embedding" entry added;
# the result is then persisted next to the notebook.
embeddings = calculate_embeddings(groups)
save_as_json(embeddings, "embeddings.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Calculating embeddings: 100%|██████████| 19515/19515 [02:41<00:00, 120.60it/s]


In [None]:
import torch

# Stack every chunk's embedding into a single PyTorch tensor (one row per chunk).
vectors = torch.Tensor([chunk["embedding"] for chunk in embeddings])


In [None]:
def cosine_similarity_comparison(embeddings, num_samples=100, chunks_per_article=10):
    """Compare cosine similarity of chunk pairs within vs. across articles.

    A random subset of embeddings is drawn and every pair in it is compared.
    Two chunks are treated as belonging to the same article when their
    positions in ``embeddings`` fall into the same consecutive block of
    ``chunks_per_article`` indices. This is an approximation that only holds
    if articles really are stored as equally sized, contiguous runs.

    Args:
        embeddings: Indexable collection of embedding vectors.
        num_samples: Maximum number of embeddings to sample.
        chunks_per_article: Assumed number of consecutive chunks per article.

    Returns:
        ``(inter_chunk_similarities, intra_chunk_similarities)``: two lists
        with the similarities of pairs from different articles and from the
        same article, respectively. Either list may be empty.

    Raises:
        ValueError: If ``chunks_per_article`` is smaller than 1.
    """
    if chunks_per_article < 1:
        raise ValueError("chunks_per_article must be a positive integer")

    # Sample a subset of embeddings
    sample_size = min(num_samples, len(embeddings))
    if sample_size < 2:
        # No pair can be formed, and sklearn's cosine_similarity raises on empty input.
        return [], []
    sampled_indices = random.sample(range(len(embeddings)), sample_size)
    sampled_embeddings = [embeddings[i] for i in sampled_indices]

    # Calculate cosine similarities (full pairwise matrix of the sample)
    cos_similarities = cosine_similarity(sampled_embeddings)

    inter_chunk_similarities = []
    intra_chunk_similarities = []

    # Visit each unordered pair exactly once (upper triangle of the matrix).
    for i in range(sample_size):
        article_i = sampled_indices[i] // chunks_per_article
        for j in range(i + 1, sample_size):
            article_j = sampled_indices[j] // chunks_per_article
            if article_i != article_j:
                inter_chunk_similarities.append(cos_similarities[i, j])
            else:
                intra_chunk_similarities.append(cos_similarities[i, j])

    return inter_chunk_similarities, intra_chunk_similarities

# Example usage: compare pairwise similarities on a random sample of the chunk vectors.
# "inter" pairs come from different index blocks, "intra" pairs from the same block.
inter_sim, intra_sim = cosine_similarity_comparison(vectors)


# NOTE(review): np.mean of an empty list is nan (with a RuntimeWarning); this happens
# whenever the random sample contains no pair for one of the two categories.
print(f"Average inter-chunk cosine similarity: {np.mean(inter_sim)}")
print(f"Average intra-chunk cosine similarity: {np.mean(intra_sim)}")


In [None]:
# `model` is only a local variable inside calculate_embeddings, so it has to be
# created here for this cell to work on a fresh kernel. It must be the same
# checkpoint that produced `vectors`, otherwise the scores are meaningless.
model = SentenceTransformer("all-MiniLM-L6-v2")

query = "what's the YCBA"
query_embed = model.encode(query, convert_to_tensor=True)
top_k = 5
# semantic_search returns one hit list per query; a single query was passed.
hits = util.semantic_search(query_embed, vectors, top_k=top_k)[0]
for hit in hits:
    print(f"Score: {hit['score']:.2f}\nChunk: {embeddings[hit['corpus_id']]['text']}\n--- Publication date: {embeddings[hit['corpus_id']]['publication_date']}")

Score: 0.43
Chunk: While Schmolka and Friedlaender agree that the col-
lection serves as a nexus for engaging with imperial leg-
acy, they also emphasize the complementary relevance of 
recent programming. Schmolka and Sonia Gadre, another 
Student Guide, noted the “Things of Beauty Growing” 
exhibition that closed last December, which explored how 
vessel traditions from the British Isles, China, and Korea 
came together in novel ways to birth contemporary British 
pottery. 


Students have been crucial in transforming the muse-
um’s approach to race and representation. Since 2014, the 
Guides have had a hand in shaping the YCBA’s collection 
by selecting a work on paper for acquisition through the 
John O’Brien Fund each year. Both initiatives are shifting 
attention from the dazzling metropolitan centers of the for-
mer British Empire to the margins, with an emphasis on 
the traces of colonial rule. To Gadre, the Student Guides 
are challenging the notion that the YCBA is antiquated

In [None]:
### Config ###
NUM_SAMPLES = 30  # Number of random chunks to manually check
TOP_K = 5         # Number of neighbors to retrieve
NUM_CLUSTERS = 10 # For clustering evaluation

# Get chunks
def sample_chunks(chunks, embeddings, n=NUM_SAMPLES):
    """Draw up to ``n`` random chunks together with their embeddings.

    Args:
        chunks: Indexable collection of chunks.
        embeddings: Indexable collection aligned with ``chunks`` (same order).
        n: Desired sample size. It is capped at ``len(chunks)`` so that small
            corpora do not make ``random.sample`` raise ``ValueError``.

    Returns:
        List of ``(index, chunk, embedding)`` tuples in random order, without
        repeated indices.
    """
    sample_size = min(n, len(chunks))
    indices = random.sample(range(len(chunks)), sample_size)
    return [(i, chunks[i], embeddings[i]) for i in indices]

# KNN
def nearest_neighbors(embeddings, idx, top_k=TOP_K):
    """Return the indices of the ``top_k`` embeddings most similar to ``embeddings[idx]``.

    Args:
        embeddings: Collection of embedding vectors accepted by
            ``cosine_similarity`` and indexable by position.
        idx: Position of the query embedding.
        top_k: Number of neighbors to return.

    Returns:
        NumPy array of indices ordered from most to least similar. The query
        itself is never included; fewer than ``top_k`` indices are returned
        when there are not enough other embeddings.
    """
    sims = cosine_similarity([embeddings[idx]], embeddings)[0]
    ranked = sims.argsort()[::-1]  # most similar first

    # Exclude the query by index instead of dropping the top-ranked entry:
    # with duplicate or tied embeddings the query is not guaranteed to rank first.
    query_pos = range(len(sims))[idx]  # also normalizes a negative idx
    return ranked[ranked != query_pos][:top_k]

# Cluster eval
def clustering_score(embeddings, num_clusters=NUM_CLUSTERS):
    """Cluster the embeddings with KMeans and return the silhouette score.

    KMeans is fitted with ``num_clusters`` clusters and a fixed
    ``random_state`` of 42; the silhouette score is computed on the same
    embeddings using the resulting cluster labels.
    """
    cluster_labels = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(embeddings)
    return silhouette_score(embeddings, cluster_labels)

# UMAP
def plot_umap(embeddings, chunks=None):
    """Scatter-plot a 2-D UMAP projection of the embeddings.

    ``chunks`` is accepted but not used by this function.
    """
    projected = umap.UMAP(random_state=42).fit_transform(embeddings)
    x_coords, y_coords = projected[:, 0], projected[:, 1]

    _, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(x_coords, y_coords, s=5)
    ax.set_title("UMAP projection of embeddings")
    plt.show()

# chunk length distribution
def plot_chunk_lengths(chunks):
    """Plot a 30-bin histogram of the whitespace-separated word count of each chunk."""
    word_counts = []
    for text in chunks:
        word_counts.append(len(text.split()))

    plt.hist(word_counts, bins=30)
    plt.xlabel("Words per chunk")
    plt.ylabel("Frequency")
    plt.title("Chunk Word Count Distribution")
    plt.show()

# main Evaluation
def evaluate_embeddings_and_chunks(chunks, embeddings):
    """Run all qualitative and quantitative checks and print/plot the results.

    Steps, in order: print a random sample of chunks with their nearest
    neighbors, print the KMeans silhouette score, show the UMAP projection,
    and show the chunk word-count histogram.

    Args:
        chunks: Indexable collection of chunk texts. They are sliced and,
            in plot_chunk_lengths, split on whitespace, so plain strings are
            assumed.
        embeddings: Embedding vectors aligned with ``chunks`` (same order).

    Returns:
        None. All results are printed or plotted.
    """
    print("\n--- Sampling Random Chunks for Manual Review ---")
    # Sample size is controlled by the NUM_SAMPLES default of sample_chunks.
    samples = sample_chunks(chunks, embeddings)
    for idx, text, emb in samples:
        # Only the first 200 characters of the sampled chunk are shown.
        print(f"\nChunk {idx}: {text[:200]}...")
        neighbor_idxs = nearest_neighbors(embeddings, idx)
        print("Top neighbors:")
        for nidx in neighbor_idxs:
            # Neighbors are truncated to 100 characters to keep the output compact.
            print(f" - Neighbor {nidx}: {chunks[nidx][:100]}...")

    print("\n--- Running Clustering Tightness Check ---")
    sil_score = clustering_score(embeddings)
    print(f"Silhouette Score (higher = tighter clusters): {sil_score:.4f}")

    print("\n--- Visualizing with UMAP ---")
    plot_umap(embeddings)

    print("\n--- Plotting Chunk Lengths ---")
    plot_chunk_lengths(chunks)

    print("\nDone! Now you can manually assess coherence, neighbor relevance, and chunk size balance.")

# Example usage:
# embeddings = np.load("embeddings.npy")
# with open("chunks.txt", "r") as f:
#     chunks = f.read().splitlines()
# evaluate_embeddings_and_chunks(chunks, embeddings)
