# Vec2GC Step by Step Analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from embeddings import create_sentence_embeddings
from vec2gc_ver2 import HierarchicalClustering
import numpy as np
from datasets import load_dataset
import datasets
import pandas as pd

In [3]:
def clean_dataset(dataset, size=0):
    dataset_df = pd.DataFrame(dataset)
    print(dataset_df.shape)
    dataset_df.dropna(inplace=True)
    dataset_df['text'] = (dataset_df['text'] 
                        .str.replace(r'\n+', ' ', regex=True)
                        .str.replace(r'\s+', ' ', regex=True)
                     )
    dataset_df.drop(dataset_df[dataset_df['text']==' '].index, inplace=True)
    dataset_df.drop(dataset_df[dataset_df['text']==''].index, inplace=True)
    dataset_df.drop_duplicates(inplace=True)
    print(dataset_df.shape)
    if size != 0:
        cleaned_dataset = datasets.Dataset.from_pandas(dataset_df.sample(size))
    else:
        cleaned_dataset = datasets.Dataset.from_pandas(dataset_df)
    return cleaned_dataset


In [4]:
# dataset_name = 'setfit/bbc-news'
dataset_name = 'fancyzhx/ag_news'
dataset = load_dataset(dataset_name, split='train',)
cleaned_dataset = clean_dataset(dataset, 10_000)

(120000, 2)
(119973, 2)


In [5]:
print("Creating embeddings...")
embeddings = create_sentence_embeddings(
    dataset=cleaned_dataset,
    # dataset_name='SetFit/20_newsgroups',
    model_name='sentence-transformers/all-mpnet-base-v2',
    device='mps',
    batch_size=128
)

Creating embeddings...
Loading sentence-transformer model: sentence-transformers/all-mpnet-base-v2
Using device: mps
Features: dict_keys(['text', 'label', '__index_level_0__'])
Auto-detected text column: text
Processing 10000 sentences...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Created embeddings matrix: torch.Size([10000, 768])


### Calculating each step individually

In [6]:
def compute_cosine_similarity(embeddings: np.ndarray) -> np.ndarray:
    """
    Optimized cosine similarity using pre-normalized embeddings.
    If embeddings are already normalized, this is just a dot product.
    """
    # Normalize embeddings once (if not already normalized)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    print(norms.shape)
    embeddings_norm = embeddings / (norms + 1e-10)  # Avoid division by zero
    
    # Cosine similarity = dot product for normalized vectors
    return np.dot(embeddings_norm, embeddings_norm.T)


In [42]:
import networkit as nk

n_items = embeddings.shape[0]
similarity_matrix = compute_cosine_similarity(embeddings.cpu())

# Create NetworKit graph
nk_graph = nk.Graph(n_items, weighted=True)

# Vectorized edge creation without nested loops
# Get upper triangular indices (avoid duplicate edges and self-loops)
i_indices, j_indices = np.triu_indices(n_items, k=1)

# Get similarities for all potential edges
edge_similarities = similarity_matrix[i_indices, j_indices]

# Find edges that meet the threshold
valid_edges = edge_similarities > 0.4


(10000, 1)


In [None]:
print(i_indices[np.where(edge_similarities > 0.9)])
print(j_indices[np.where(edge_similarities > 0.9)])

In [44]:
valid_edges.sum()

np.int64(178460)

In [45]:
# Filter to get only valid edge indices and similarities
valid_i = i_indices[valid_edges]
valid_j = j_indices[valid_edges]
valid_similarities = edge_similarities[valid_edges]
# valid_similarities[valid_similarities > 1.0] = 1.0

# Calculate weights using the new formula: 1 / (1 - cosine_similarity)
# Add small epsilon to avoid division by zero
epsilon = 1e-10
weights = 1.0 / (1.0 - valid_similarities + epsilon)


In [46]:
# Add edges to NetworKit graph
if len(valid_i) > 0:
    for i, j, weight in zip(valid_i, valid_j, weights):
        nk_graph.addEdge(int(i), int(j), float(weight))


In [47]:
nk_graph.numberOfEdges()

178460

In [48]:
communities = nk.community.detectCommunities(nk_graph)

Communities detected in 0.02600 [s]
solution properties:
-------------------  ------------
# communities         279
min community size      1
max community size   4443
avg. community size    35.8423
imbalance             123.417
edge cut             4636.37
edge cut (portion)      0.0259799
modularity              0.183883
-------------------  ------------
