# Common Imports

In [None]:
import pickle
from pathlib import Path
from typing import List, Dict, Any, Optional

import faiss
import numpy as np

from start_utils import db_session

from dtos.services.etl.profile import ProfileData
from dtos.services.search.hard_criteria import HardCriteria

# ETL Load

In [256]:
from services.etl.load import ETLLoadService

In [193]:
# JSON file handed to the load service as its input.
file_path = "data/linkedin_data_subset.json"
etl_load_service = ETLLoadService(file_path=file_path)

In [194]:
# Run the load step. In the recorded run below it read
# data/linkedin_data_subset.json and reported 193796 profiles (about 13 minutes
# between the first and last log line).
dataset = etl_load_service.run()

[32m2025-07-30 16:14:38.497[0m | [1mINFO    [0m | [36mservices.etl.load[0m:[36mrun[0m:[36m22[0m - [1mLoading data from data/linkedin_data_subset.json...[0m
[32mJuly-30-2025[0m | [30m16:14:38[0m | [1mINFO[0m | [36mLoading data from data/linkedin_data_subset.json...[0m | [35mservices.etl.load:run:22[0m | [33m{}[0m
[32m2025-07-30 16:27:33.968[0m | [1mINFO    [0m | [36mservices.etl.load[0m:[36mrun[0m:[36m25[0m - [1mLoaded 193796 profiles[0m
[32mJuly-30-2025[0m | [30m16:27:33[0m | [1mINFO[0m | [36mLoaded 193796 profiles[0m | [35mservices.etl.load:run:25[0m | [33m{}[0m


# ETL Extraction

In [254]:
from services.etl.extract import ETLExtractionService
# The extraction service is given the shared `db_session` imported from start_utils.
etl_extract_service = ETLExtractionService(session=db_session)

In [255]:
# Pass the loaded dataset through extraction. The recorded log below reports
# "Profiles created: 193796" for this step.
profiles = etl_extract_service.run(profiles=dataset)

[32m2025-07-30 17:31:50.899[0m | [1mINFO    [0m | [36mservices.etl.extract[0m:[36mrun[0m:[36m252[0m - [1mExtracting profiles: 193796[0m
[32mJuly-30-2025[0m | [30m17:31:50[0m | [1mINFO[0m | [36mExtracting profiles: 193796[0m | [35mservices.etl.extract:run:252[0m | [33m{}[0m
100%|██████████| 193796/193796 [07:18<00:00, 441.98it/s] 
[32m2025-07-30 17:39:09.381[0m | [1mINFO    [0m | [36mservices.etl.extract[0m:[36mrun[0m:[36m379[0m - [1mProfiles created: 193796[0m
[32mJuly-30-2025[0m | [30m17:39:09[0m | [1mINFO[0m | [36mProfiles created: 193796[0m | [35mservices.etl.extract:run:379[0m | [33m{}[0m


# ETL Transform

In [252]:
from services.etl.transform import ETLTransformService
# The transform service is constructed with no arguments.
etl_transform_service = ETLTransformService()

In [253]:
# Turn the extracted profiles into `documents`, the input of the embedding step.
# The recorded log below reports 193796 profiles transformed.
documents = etl_transform_service.run(profiles=profiles)

[32m2025-07-30 17:29:59.281[0m | [1mINFO    [0m | [36mservices.etl.transform[0m:[36mrun[0m:[36m25[0m - [1mTransforming 193796 profiles...[0m
[32mJuly-30-2025[0m | [30m17:29:59[0m | [1mINFO[0m | [36mTransforming 193796 profiles...[0m | [35mservices.etl.transform:run:25[0m | [33m{}[0m
[32m2025-07-30 17:31:42.730[0m | [1mINFO    [0m | [36mservices.etl.transform[0m:[36mrun[0m:[36m27[0m - [1mTransformed 193796 profiles...[0m
[32mJuly-30-2025[0m | [30m17:31:42[0m | [1mINFO[0m | [36mTransformed 193796 profiles...[0m | [35mservices.etl.transform:run:27[0m | [33m{}[0m


# Vector Embedding of profiles

In [249]:
from services.vector_store.embedding import VectorStoreEmbeddingService

# Embedding service configuration: model "voyage-3", at most 100 concurrent
# requests, progress reporting on, and saving enabled with "embeddings" as the
# output directory (presumably the same directory the "Load Embeddings" cell
# reads back -- confirm).
vector_store_embedding_service = VectorStoreEmbeddingService(
    model_name="voyage-3",
    max_concurrent_requests=100,
    use_progress=True,
    save_embeddings=True,
    embeddings_dir="embeddings",
)

# Top-level `await` relies on IPython/Jupyter autoawait support.
# NOTE(review): the recorded run below logged "Object of type ndarray is not JSON
# serializable" from save_embeddings_to_json and ended with CancelledError, so
# `embeddings` may not have been assigned by this cell -- confirm before relying on it.
embeddings = await vector_store_embedding_service.run_async(documents)

[32m2025-07-30 17:25:12.804[0m | [1mINFO    [0m | [36mservices.vector_store.embedding[0m:[36m__init__[0m:[36m51[0m - [1mInitialized VectorStoreEmbeddingService with max 100 concurrent requests[0m


[32mJuly-30-2025[0m | [30m17:25:12[0m | [1mINFO[0m | [36mInitialized VectorStoreEmbeddingService with max 100 concurrent requests[0m | [35mservices.vector_store.embedding:__init__:51[0m | [33m{}[0m
Embedding texts:   0%|          | 0/711 [00:00<?, ?it/s][32m2025-07-30 17:25:22.625[0m | [31m[1mERROR   [0m | [36mservices.vector_store.embedding[0m:[36msave_embeddings_to_json[0m:[36m187[0m - [31m[1mError saving embeddings to JSON: Object of type ndarray is not JSON serializable[0m
[32mJuly-30-2025[0m | [30m17:25:22[0m | [31m[1mERROR[0m | [36mError saving embeddings to JSON: Object of type ndarray is not JSON serializable[0m | [35mservices.vector_store.embedding:save_embeddings_to_json:187[0m | [33m{}[0m
Embedding texts:  14%|█▍        | 100/711 [00:09<00:59, 10.26it/s][32m2025-07-30 17:25:36.619[0m | [31m[1mERROR   [0m | [36mservices.vector_store.embedding[0m:[36msave_embeddings_to_json[0m:[36m187[0m - [31m[1mError saving embeddings to JS

CancelledError: 

# Load Embeddings

In [None]:
import os
import json

# os.listdir returns names in arbitrary order, so sort them: the row order of
# `embeddings` is then the same on every run.
# NOTE(review): the sort is lexicographic; if the file names carry an unpadded
# numeric batch index this is not numeric batch order -- confirm against how
# the files are written.
files = sorted(os.listdir("embeddings"))
existing_dataset = []
for file in files:
    path = os.path.join("embeddings", file)
    if not os.path.isfile(path):
        # Skip sub-directories; open() would raise on them.
        continue
    with open(path, "r") as f:
        d = json.load(f)
    # Drop None entries and keep the remaining records in file order.
    existing_dataset.extend(record for record in d if record is not None)

# A bare expression in the middle of a cell is not displayed, so report the
# record count explicitly.
print(f"Loaded {len(existing_dataset)} embedding records")

# One array per record, taken from its "embedding" field.
embeddings = [np.array(data.get("embedding")) for data in existing_dataset]

# K-Means Clustering on embeddings

In [247]:
from services.clustering.kmean import KMeansClusteringService

In [None]:
# Cluster the stacked embeddings into 14 groups; `run` returns a pair that is
# unpacked here as (labels, centers).
kmeans_clustering_service = KMeansClusteringService(n_clusters=14)
kmean_cluster_labels, cluster_centers = kmeans_clustering_service.run(
    np.array(embeddings)
)

[32m2025-07-30 14:27:30.354[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m161[0m - [1mCreating 14 clusters...[0m
[32mJuly-30-2025[0m | [30m14:27:30[0m | [1mINFO[0m | [36mCreating 14 clusters...[0m | [35m__main__:run:161[0m | [33m{}[0m


# Write indices to vector store

In [243]:
from services.vector_store.write import VectorStoreWriteService

In [244]:
# Hand the stacked embeddings and their K-means labels to the write service;
# its result is kept as `faiss_indices` for the save step.
vector_store_write_service = VectorStoreWriteService()
faiss_indices = vector_store_write_service.run(
    cluster_labels=kmean_cluster_labels,
    embeddings=np.array(embeddings),
)

[32m2025-07-30 17:17:41.875[0m | [1mINFO    [0m | [36mservices.vector_store.write[0m:[36mrun[0m:[36m87[0m - [1mRunning vector store write service for 193796 clusters[0m
[32mJuly-30-2025[0m | [30m17:17:41[0m | [1mINFO[0m | [36mRunning vector store write service for 193796 clusters[0m | [35mservices.vector_store.write:run:87[0m | [33m{}[0m
[32m2025-07-30 17:17:41.889[0m | [1mINFO    [0m | [36mservices.vector_store.write[0m:[36mrun[0m:[36m93[0m - [1mWriting embeddings for cluster 0[0m
[32mJuly-30-2025[0m | [30m17:17:41[0m | [1mINFO[0m | [36mWriting embeddings for cluster 0[0m | [35mservices.vector_store.write:run:93[0m | [33m{}[0m
[32m2025-07-30 17:17:41.961[0m | [1mINFO    [0m | [36mservices.vector_store.write[0m:[36mwrite[0m:[36m41[0m - [1mCluster 0: 17728 profiles, 17728 profile indices[0m
[32mJuly-30-2025[0m | [30m17:17:41[0m | [1mINFO[0m | [36mCluster 0: 17728 profiles, 17728 profile indices[0m | [35mservices.vector

# Save System

In [272]:
def save_system(
    profiles: List[ProfileData],
    clusters: np.ndarray,
    cluster_centers: np.ndarray,
    embeddings: np.ndarray,
    faiss_indices: Dict[int, Dict[str, Any]],
    cluster_labels: np.ndarray,
    filepath: str,
) -> None:
    """
    Save the clustering system to disk.

    Writes one pickle file at ``filepath`` containing every argument except
    ``filepath`` itself, then writes each cluster's FAISS index to its own
    ``{filepath}_cluster_{cluster_id}.faiss`` file.

    Args:
        profiles: Profile records, stored under the 'profiles' key.
        clusters: Stored unchanged under the 'clusters' key.
        cluster_centers: Stored unchanged under the 'cluster_centers' key.
        embeddings: Stored unchanged under the 'embeddings' key.
        faiss_indices: Mapping of cluster id to a dict that must contain an
            'index' entry, which is passed to ``faiss.write_index``. The whole
            mapping is also stored in the pickle under 'faiss_indices'.
        cluster_labels: Stored unchanged under the 'cluster_labels' key.
        filepath: Destination of the pickle file. Missing parent directories
            are created.

    Raises:
        KeyError: If an entry of ``faiss_indices`` has no 'index' key.
    """
    # Create the target directory up front so that neither the pickle nor the
    # per-cluster index files fail when the directory does not exist yet.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    system_data = {
        'profiles': profiles,
        'clusters': clusters,
        'cluster_centers': cluster_centers,
        'embeddings': embeddings,
        'faiss_indices': faiss_indices,
        'cluster_labels': cluster_labels,
    }

    with open(filepath, 'wb') as f:
        pickle.dump(system_data, f)

    # Save FAISS indices separately, one file per cluster next to the pickle.
    for cluster_id, cluster_info in faiss_indices.items():
        faiss.write_index(cluster_info['index'], f"{filepath}_cluster_{cluster_id}.faiss")

    print(f"System saved to {filepath}")

In [274]:
# Persist everything produced above. The K-means labels are passed twice because
# save_system stores them under both the 'clusters' and 'cluster_labels' keys.
system_path = "data/clustering_system_v2.pkl"
embedding_matrix = np.array(embeddings)
save_system(
    profiles=profiles,
    clusters=kmean_cluster_labels,
    cluster_centers=kmeans_clustering_service.cluster_centers,
    embeddings=embedding_matrix,
    faiss_indices=faiss_indices,
    cluster_labels=kmean_cluster_labels,
    filepath=system_path,
)

System saved to data/clustering_system_v2.pkl
