In [51]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams, ScoredPoint, Filter, FieldCondition, MatchValue
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import hashlib
from datetime import datetime, timezone
import os
import uuid
import numpy as np
from bson import ObjectId
import torch
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr

#### Connections

In [52]:
from pymongo import MongoClient

# mongo conection

mongo_uri = "mongodb://localhost:27017/" #compass
mongo_client = MongoClient(mongo_uri)
db = mongo_client['pipelineos']

# loading the mongodb into variables:

icp_profiles = db["icp_profiles"]
discovered_companies = db["discovered_companies"]
scored_companies = db['scored_companies']

#### Qdrant connections

In [53]:
# qdrant connection via https

from qdrant_client.models import VectorParams, Distance
qdrant = QdrantClient(
    host="localhost",
    port=6333)
#using qdrants https method as used 6333 port and gRPC is :
# qdrant = QdrantClient(
#     host="localhost", 
#     port=6334, 
#     grpc=True
# )

#### Run the Qdrant

In [54]:
# quadrant setup via docker:
# run this in ubuntu terminal:

# docker pull qdrant/qdrant
# docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant

#### Models

In [55]:
# # vectorize setup:

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #384-dim vector
# model = SentenceTransformer("BAAI/bge-small-en")  # 384 D
model = SentenceTransformer("all-MiniLM-L12-v2") # 384 D
# model = SentenceTransformer("all-mpnet-base-v2") # 768 d
# model = SentenceTransformer("all-distilroberta-v1") # 768
# model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1") # 384 d

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/models/sentence-transformers/all-MiniLM-L12-v2/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7947f5d7ab60>: Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 3c920a45-4310-44fb-b192-cd63590f71e2)')

#### To fetch the latest ICP from the MongoDB

In [None]:
# Aggregation pipeline to get the latest active ICP
pipeline = [
    {"$match": {"active": True}},
    {"$addFields": {
        "effective_time": {
            "$cond": {
                "if": {"$gt": ["$updated_at", "$created_at"]},
                "then": "$updated_at",
                "else": "$created_at"
            }
        }
    }},
    {"$sort": {"effective_time": -1}},
    {"$limit": 1}
]

latest_icp = list(icp_profiles.aggregate(pipeline))
if latest_icp:
    latest_icp = latest_icp[0]
    print("Latest active ICP:")
    print(latest_icp)
else:
    print("No active ICP found.")

Latest active ICP:
{'_id': ObjectId('689584be825c9ee6d4573435'), 'name': 'Flowcast AI solutions', 'filters': {'locations': [], 'tech_stack': ['ai', 'AWS', 'LangChain', 'python'], 'keywords': ['ai', 'ml', 'gen'], 'designations': ['ceo', 'er'], 'department': ['Engineering', 'Product'], 'industry': ['FinTech', 'finance'], 'employee_count': {'min': 30, 'max': 20}, 'founded_after': 2019}, 'tags': ['tect'], 'active': True, 'created_at': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000), 'updated_at': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000), 'effective_time': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000)}


In [None]:
latest_icp

{'_id': ObjectId('689584be825c9ee6d4573435'),
 'name': 'Flowcast AI solutions',
 'filters': {'locations': [],
  'tech_stack': ['ai', 'AWS', 'LangChain', 'python'],
  'keywords': ['ai', 'ml', 'gen'],
  'designations': ['ceo', 'er'],
  'department': ['Engineering', 'Product'],
  'industry': ['FinTech', 'finance'],
  'employee_count': {'min': 30, 'max': 20},
  'founded_after': 2019},
 'tags': ['tect'],
 'active': True,
 'created_at': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000),
 'updated_at': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000),
 'effective_time': datetime.datetime(2025, 8, 8, 5, 1, 50, 228000)}

#### Delta Ware Filter

In [None]:
latest_icp = list(icp_profiles.aggregate(pipeline))
companies_to_score = []

for icp in latest_icp:
    icp_id = str(icp["_id"])
    icp_version = icp.get("version", 1)

    for company in discovered_companies.find({}):
        company_id = company["_id"]
        last_scraped = company["last_scraped"]


        # Look up the scored company metadata from separate collection
        score_doc = scored_companies.find_one({
            "company_id": company_id,
            "icp_id": icp_id
        })

        # CASE A: Never scored for this ICP
        if not score_doc:
            companies_to_score.append({"company": company, "icp": icp})
            continue

        # CASE B: Rescraped after last scoring
        last_scored = score_doc.get("last_scored")
        if last_scored is None or last_scored > last_scraped: # logic changed just to check, reverse it once the task is done
            companies_to_score.append({"company": company, "icp": icp})
            continue

        # CASE C: ICP version has changed
        scored_version = score_doc.get("icp_version")
        if scored_version != icp_version:
            companies_to_score.append({"company": company, "icp": icp})
            continue

print(f"Total <company, ICP> pairs to score: {len(companies_to_score)}")


Total <company, ICP> pairs to score: 5


### Collections Creation in Qdrant

In [None]:
# === First: Create collection if it doesn't exist ===


# if qdrant.collection_exists("icp_vectors"):
#     qdrant.delete_collection("icp_vectors")

# ICP_Vectors

if not qdrant.collection_exists("icp_vectors"):
    qdrant.create_collection(
        collection_name="icp_vectors",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE)  # 384 for MiniLM
    )



# if qdrant.collection_exists("company_vectors"):
#     qdrant.delete_collection("company_vectors")

# Company_Vectors

if not qdrant.collection_exists("company_vectors"):
    qdrant.create_collection(
        collection_name="company_vectors",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
    )

#### Company Vectorization and upsert with filter 

In [None]:
# import datetime
# import numpy as np
# import uuid
# from pymongo import UpdateOne
# from qdrant_client.models import PointStruct
# from bson import ObjectId

# def normalize_list(value):
#     if isinstance(value, str):
#         return [value.lower()]
#     if isinstance(value, list):
#         return list({v.strip().lower() for v in value if isinstance(v, str)})
#     return []

# def prepare_vector_text(company):
#     parts = []
#     parts += normalize_list(company.get("industries"))
#     parts += normalize_list(company.get("tech_stack"))

#     country = company.get("hq_location", {}).get("country", "").lower()
#     if country:
#         parts.append(country)

#     return " ".join(parts).strip()

# points = []
# bulk_updates = []

# for pair in companies_to_score:
#     company = pair["company"]
#     icp = pair["icp"]

#     domain = company.get("domain")
#     if not domain:
#         continue

#     # changed_at = company.get("changed_at")
#     # last_vectorized = company.get("last_vectorized")

#     # if changed_at and last_vectorized:
#     #     if changed_at < last_vectorized:
#     #         print(f"Skipping {domain}: No changes since last vectorization.")
#     #         print(f"[DEBUG] {domain} | changed_at: {changed_at}, last_vectorized: {last_vectorized}")

#     #         continue

#     vector_text = prepare_vector_text(company)
#     if not vector_text:
#         print(f"Skipping {domain}: No vector text.")
#         continue

#     try:
#         embedding = model.encode(vector_text)
#         if embedding is None or not isinstance(embedding, (list, np.ndarray)) or np.all(np.array(embedding) == 0):
#             print(f"Skipping {domain}: Invalid embedding.")
#             continue

#         embedding = np.array(embedding, dtype=np.float32).tolist()
#         point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"company:{domain}"))

#         point = PointStruct(
#             id=point_id,
#             vector=embedding,
#             payload={
#                 "company_id": str(company.get("_id")),
#                 "icp_id": str(company.get("icp_id", "")),
#                 "industry": normalize_list(company.get("industries")),
#                 "location": company.get("hq_location", {}).get("country", "").lower(),
#                 "tech_stack": normalize_list(company.get("tech_stack"))
#             }
#         )
#         points.append(point)

#         bulk_updates.append(UpdateOne(
#             {"_id": company["_id"]},
#             {"$set": {"last_vectorized": datetime.datetime.now(datetime.timezone.utc)}}
#         ))

#     except Exception as e:
#         print(f"Error vectorizing {domain}: {e}")

# if points:
#     qdrant.upsert(collection_name="company_vectors", points=points)
#     print(f"Upserted {len(points)} companies to Qdrant.")
# else:
#     print("No valid company vectors to upsert.")

# if bulk_updates:
#     db["discovered_companies"].bulk_write(bulk_updates)
#     print(f"Updated last_vectorized for {len(bulk_updates)} companies.")


#### ICP Vectorization and upsert with filter 

In [None]:
# from datetime import datetime
# from uuid import uuid5, NAMESPACE_DNS
# import numpy as np
# from qdrant_client.models import PointStruct

# # === Helper: Normalize and join values ===
# def normalize_list(value):
#     if isinstance(value, str):
#         return [value.lower()]
#     if isinstance(value, list):
#         return list({v.strip().lower() for v in value if isinstance(v, str)})
#     return []

# # === Load active ICPs ===
# latest_icp = list(icp_profiles.aggregate(pipeline))
# icp_points = []

# for icp in latest_icp:
#     icp_id = str(icp["_id"])
#     filters = icp.get("filters", {})
#     updated_at = icp.get("updated_at", datetime.min)
#     last_vectorized = icp.get("last_vectorized")  # May be None

#     # === Skip if already vectorized and not updated ===
#     # if last_vectorized:
#     #     if updated_at < last_vectorized:
#     #         print(f"Skipping ICP {icp_id}: No changes since last vectorization.")
#     #         continue

#     # === Normalize and build vector text ===
#     text_parts = []

#     industries = normalize_list(filters.get("industries", []))
#     locations = normalize_list(filters.get("locations", []))
#     tech_stack = normalize_list(icp.get("tech_stack", []))
#     keywords = normalize_list(icp.get("keywords", []))

#     text_parts += industries + locations + tech_stack + keywords
#     vector_text = " ".join(text_parts).strip()

#     if not vector_text:
#         continue

#     try:
#         embedding = model.encode(vector_text)
#         if embedding is None or not isinstance(embedding, (list, np.ndarray)) or np.all(np.array(embedding) == 0):
#             continue

#         embedding = np.array(embedding, dtype=np.float32).tolist()
#         point_id = str(uuid5(NAMESPACE_DNS, f"icp:{icp_id}"))

#         point = PointStruct(
#             id=point_id,
#             vector=embedding,
#             payload={
#                 "icp_id": icp_id,
#                 "industry": industries,
#                 "location": locations,
#                 "tech_stack": tech_stack,
#                 "keywords": keywords
#             }
#         )
#         icp_points.append(point)

#         # === Update last_vectorized timestamp in MongoDB ===
#         # icp_profiles.update_one(
#         #     {"_id": icp["_id"]},
#         #     {"$set": {"last_vectorized": datetime.utcnow()}}
#         # )

#     except Exception as e:
#         print(f"Error encoding ICP {icp_id}: {e}")

# # === Final upsert to Qdrant ===
# if icp_points:
#     qdrant.upsert(collection_name="icp_vectors", points=icp_points)
#     print(f"Upserted {len(icp_points)} ICP profiles to Qdrant.")
# else:
#     print("No valid ICP vectors to upsert.")


#### without filter ICP and Company Vectors:

In [None]:
# === Helper: Normalize and join string lists ===
def normalize_list(value):
    if isinstance(value, str):
        return [value.lower()]
    if isinstance(value, list):
        return list({v.strip().lower() for v in value if isinstance(v, str)})
    return []

# === Helper: Prepare vector text from normalized fields ===
def prepare_vector_text(company):
    parts = []

    industries = normalize_list(company.get("industries"))
    tech_stack = normalize_list(company.get("tech_stack"))
    country = company.get("hq_location", {}).get("country", "").lower()

    parts += industries
    parts += tech_stack
    if country:
        parts.append(country)

    return " ".join(parts).strip()

# === Vectorize and Upsert to Qdrant ===
points = []
for pair in companies_to_score:
    company = pair["company"]
    icp = pair["icp"]

    domain = company.get("domain")
    if not domain:
        continue

    # === Check vector freshness ===
    # changed_at = company.get("changed_at")
    # last_vectorized = company.get("last_vectorized")

    # if changed_at and last_vectorized:
    #     try:
    #         if isinstance(changed_at, str):
    #             changed_at = datetime.datetime(changed_at)
    #         if isinstance(last_vectorized, str):
    #             last_vectorized = datetime.datetime(last_vectorized)

    #         if changed_at <= last_vectorized:
    #             print(f"Skipping {domain}: No changes since last vectorization.")
    #             continue
    #     except Exception as e:
    #         print(f"Date parse error for {domain}: {e}")

    vector_text = prepare_vector_text(company)
    if not vector_text:
        print(f"Skipping {domain}: No vector text.")
        continue

    try:
        embedding = model.encode(vector_text)
        if embedding is None or not isinstance(embedding, (list, np.ndarray)) or np.all(np.array(embedding) == 0):
            print(f"Skipping {domain}: Invalid embedding.")
            continue

        embedding = np.array(embedding, dtype=np.float32).tolist()
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"company:{domain}"))

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload={
                "company_id": str(company.get("_id")),
                "icp_id": str(company.get("icp_id", "")),
                "industry": normalize_list(company.get("industries")),
                "location": company.get("hq_location", {}).get("country", "").lower(),
                "tech_stack": normalize_list(company.get("tech_stack"))
            }
        )
        points.append(point)
    except Exception as e:
        print(f"Error vectorizing {domain}: {e}")

# === Upsert all vectors into Qdrant ===
if points:
    qdrant.upsert(collection_name="company_vectors", points=points)
    print(f"Upserted {len(points)} companies to Qdrant.")
else:
    print("No valid company vectors to upsert.")


Upserted 5 companies to Qdrant.


In [None]:
# === Helper: Normalize and join values ===
def normalize_list(value):
    if isinstance(value, str):
        return [value.lower()]
    if isinstance(value, list):
        return list({v.strip().lower() for v in value if isinstance(v, str)})
    return []

# === Load active ICPs ===
latest_icp = list(icp_profiles.aggregate(pipeline))
icp_points = []

for icp in latest_icp:
    icp_id = str(icp["_id"])
    filters = icp.get("filters", {})
    text_parts = []

    # Normalize and extract fields
    # FIXED
    industry = normalize_list(filters.get("industry", []))
    locations = normalize_list(filters.get("locations", []))
    tech_stack = normalize_list(filters.get("tech_stack", []))


    text_parts += industry + locations + tech_stack
    vector_text = " ".join(text_parts).strip()

    if not vector_text:
        continue
    print(f"[{icp_id}] vector_text: '{vector_text}' | filters: {filters}")

    try:
        embedding = model.encode(vector_text)
        if embedding is None or not isinstance(embedding, (list, np.ndarray)) or np.all(np.array(embedding) == 0):
            continue

        embedding = np.array(embedding, dtype=np.float32).tolist()
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"icp:{icp_id}"))

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload={
                "icp_id": icp_id,
                "industry": industry,
                "location": locations,
                "tech_stack": tech_stack
            }
        )
        icp_points.append(point)

    except Exception as e:
        print(f"Error encoding ICP {icp_id}: {e}")

# === Upsert to Qdrant ===
if icp_points:
    qdrant.upsert(collection_name="icp_vectors", points=icp_points)
    print(f"Upserted {len(icp_points)} ICP profiles to Qdrant.")
else:
    print("No valid ICP vectors to upsert.")
    print(f"[{icp_id}] vector_text: '{vector_text}' | filters: {filters}")
    print(f"[{icp_id}] Embedding generated: {embedding is not None} | Zero vector: {np.all(np.array(embedding)==0) if embedding is not None else 'N/A'}")



[689584be825c9ee6d4573435] vector_text: 'fintech finance ai aws python langchain' | filters: {'locations': [], 'tech_stack': ['ai', 'AWS', 'LangChain', 'python'], 'keywords': ['ai', 'ml', 'gen'], 'designations': ['ceo', 'er'], 'department': ['Engineering', 'Product'], 'industry': ['FinTech', 'finance'], 'employee_count': {'min': 30, 'max': 20}, 'founded_after': 2019}
Upserted 1 ICP profiles to Qdrant.


In [None]:
industry

['fintech', 'finance']

In [None]:
text_parts

['fintech', 'finance', 'ai', 'aws', 'python', 'langchain']

#### cos_sim scoring:

In [None]:
# WEIGHTS = {
#     "industry": 20,
#     "employee_count": 15,
#     "location": 15,
#     "tech_stack": 25,
#     "keywords": 10,
#     "founded_year": 5,
#     "github_signal": 10
# }
 
# # --- 1. Get latest active ICP ---
# latest_icp = list(icp_profiles.aggregate(pipeline))
 
# if not latest_icp:
#     print("❌ No active ICP found.")
#     exit()
# icp = latest_icp[0]
# icp_id = str(icp["_id"])
# icp_version = icp.get("version", 1)
 
# # --- 2. Get ICP vector from Qdrant ---
 
# icp_vector_resp = qdrant.retrieve(
#     collection_name="icp_vectors",
#     ids= [str(uuid.uuid5(uuid.NAMESPACE_DNS, f"icp:{icp_id}"))],
#     with_vectors=True
# )
 
# if not icp_vector_resp or not icp_vector_resp[0].vector:
#     print("❌ ICP vector not found.")
#     exit()
 
# icp_embedding = icp_vector_resp[0].vector
 
# # --- 3. Get all company vectors from Qdrant ---
# search_result = qdrant.query_points(
#     collection_name="company_vectors",
#     query=icp_embedding,
#     limit=10000,
#     with_payload=True,
#     with_vectors=True
# )
# print(f"Total points returned from Qdrant: {len(search_result.points)}")
 
# icp_founded_after = icp.get("filters", {}).get("founded_after", 2015)
 
# # --- 4. Scoring Rule Function () ---
# def rule_score(company_doc, icp_tokens, icp_founded_after):
#     rule_score = 0
#     breakdown = {k: 0 for k in WEIGHTS}
#     def safe_list(field):
#         value = company_doc.get(field, [])
#         if isinstance(value, str): return [value]
#         if isinstance(value, list): return value
#         return []
    
#     # Match tokens in industry, location, tech stack, keywords
#     for field, key in [("industries", "industry"), ("hq_location", "location"),
#                        ("tech_stack", "tech_stack"), ("keywords", "keywords")]:
#         values = (
#             safe_list(field) if field != "hq_location"
#             else list(company_doc.get("hq_location", {}).values())
#         )
#         if any(token in str(v).lower() for v in values for token in icp_tokens):
#             rule_score += WEIGHTS[key]
#             breakdown[key] = WEIGHTS[key]
 
#     # Match employee count
#     emp = company_doc.get("employee_count_estimate", {})
#     if emp.get("min", 0) >= 10 and emp.get("max", 0) <= 500:
#         rule_score += WEIGHTS["employee_count"]
#         breakdown["employee_count"] = WEIGHTS["employee_count"]
 
#     # Founded year match
#     if isinstance(company_doc.get("founded_year"), int) and company_doc["founded_year"] >= icp_founded_after:
#         rule_score += WEIGHTS["founded_year"]
#         breakdown["founded_year"] = WEIGHTS["founded_year"]
 
#     # GitHub presence
#     urls = company_doc.get("source_urls", [])
#     if any("github" in url.lower() for url in urls):
#         rule_score += WEIGHTS["github_signal"]
#         breakdown["github_signal"] = WEIGHTS["github_signal"]
 
#     return rule_score, breakdown
 
# # --- 5. Tokenize ICP filters ---
# icp_tokens = [token.lower().strip() for token in f"{' '.join(icp.get('filters', {}).get('industries', []))} {' '.join(icp.get('filters', {}).get('locations', []))}".split()]
 
# # --- 6. Score companies using util.cos_sim() ---
# scored_results_util = []
# icp_tensor = torch.tensor(model.encode([' '.join(icp_tokens)]), dtype=torch.float32)  # Convert ICP to tensor
 
# for point in search_result.points:
#     payload = point.payload
#     company_id = payload.get("company_id")
#     company_vector = point.vector
    
#     # Calculate cosine similarity using util.cos_sim()
#     company_tensor = torch.tensor([company_vector], dtype=torch.float32)
#     similarity = util.cos_sim(icp_tensor, company_tensor)[0][0].item()
    
#     try:
#         if isinstance(company_id, str):
#             company_id = ObjectId(company_id)
#     except Exception as e:
#         print(f'{company_id} have {e}')
#         continue 
 
#     company = discovered_companies.find_one({"_id": company_id})
#     if not company:
#         continue
        
#     rule, breakdown = rule_score(company, icp_tokens, icp_founded_after)
#     if rule == 0:
#         final_score = round(similarity * 0.1, 4)
#     else:
#         final_score = round(similarity * rule, 4)
    
#     breakdown["vector_similarity"] = round(similarity, 4)
 
#     scored_doc = {
#         "company_id": company_id,
#         "icp_id": icp_id,
#         "icp_version": icp_version,
#         "final_score": final_score,
#         "breakdown": breakdown,
#         "weights": WEIGHTS,
#         "last_scored": datetime.now(timezone.utc),
#         "method": "util_cos_sim"
#     }
    
#     # Store in a different collection or add method flag
#     scored_companies.update_one(
#         {"company_id": company_id, "icp_id": icp_id, "method": "util_cos_sim"},
#         {"$set": scored_doc},
#         upsert=True
#     )
#     scored_results_util.append((company.get("domain", "unknown"), final_score))
 
# # --- 7. Print Top Results ---
# print("Scoring complete using util.cos_sim(). Top results:")
# for domain, score in sorted(scored_results_util, key=lambda x: x[1], reverse=True)[:20]:
#     print(f"{domain} → Score: {score}")
 

Total points returned from Qdrant: 5
Scoring complete using util.cos_sim(). Top results:
ssynthai.io → Score: 3.1059
flowcast.ai → Score: 2.2858
neogencloud.com → Score: 0.0691
retico.ai → Score: 0.0085
ddocstream.com → Score: -0.6692


## Test Code

In [None]:
WEIGHTS = {
    "industry": 20,
    "employee_count": 15,
    "location": 15,
    "tech_stack": 25,
    "keywords": 10,
    "founded_year": 5,
    "github_signal": 10
}
 
# --- 1. Get latest active ICP ---
latest_icp = list(icp_profiles.aggregate(pipeline))
 
if not latest_icp:
    print("❌ No active ICP found.")
    exit()
icp = latest_icp[0]
icp_id = str(icp["_id"])
icp_version = icp.get("version", 1)
 
# --- 2. Get ICP vector from Qdrant ---
 
icp_vector_resp = qdrant.retrieve(
    collection_name="icp_vectors",
    ids= [str(uuid.uuid5(uuid.NAMESPACE_DNS, f"icp:{icp_id}"))],
    with_vectors=True
)
 
if not icp_vector_resp or not icp_vector_resp[0].vector:
    print("❌ ICP vector not found.")
    exit()
 
icp_embedding = icp_vector_resp[0].vector
 
# --- 3. Get all company vectors from Qdrant ---
search_result = qdrant.query_points(
    collection_name="company_vectors",
    query=icp_embedding,
    limit=10000,
    with_payload=True,
    with_vectors=True
)
print(f"Total points returned from Qdrant: {len(search_result.points)}")
 
icp_founded_after = icp.get("filters", {}).get("founded_after", 2015)
 
# --- 4. Scoring Rule Function () ---
def rule_score(company_doc, icp_tokens, icp_founded_after):
    rule_score = 0
    breakdown = {k: 0 for k in WEIGHTS}
    def safe_list(field):
        value = company_doc.get(field, [])
        if isinstance(value, str): return [value]
        if isinstance(value, list): return value
        return []
    
    # Match tokens in industry, location, tech stack, keywords
    for field, key in [("industries", "industry"), ("hq_location", "location"),
                       ("tech_stack", "tech_stack"), ("keywords", "keywords")]:
        values = (
            safe_list(field) if field != "hq_location"
            else list(company_doc.get("hq_location", {}).values())
        )
        if any(token in str(v).lower() for v in values for token in icp_tokens):
            rule_score += WEIGHTS[key]
            breakdown[key] = WEIGHTS[key]
 
    # Match employee count
    emp = company_doc.get("employee_count_estimate", {})
    if emp.get("min", 0) >= 10 and emp.get("max", 0) <= 500:
        rule_score += WEIGHTS["employee_count"]
        breakdown["employee_count"] = WEIGHTS["employee_count"]
 
    # Founded year match
    if isinstance(company_doc.get("founded_year"), int) and company_doc["founded_year"] >= icp_founded_after:
        rule_score += WEIGHTS["founded_year"]
        breakdown["founded_year"] = WEIGHTS["founded_year"]
 
    # GitHub presence
    urls = company_doc.get("source_urls", [])
    if any("github" in url.lower() for url in urls):
        rule_score += WEIGHTS["github_signal"]
        breakdown["github_signal"] = WEIGHTS["github_signal"]
 
    return rule_score, breakdown
 
# --- 5. Tokenize ICP filters ---
icp_filters = icp.get('filters', {})
icp_tokens =[]

for field_name in ['industry', 'industries']:
    if field_name in icp_filters:
        industries = icp_filters[field_name]
        if isinstance(industries, list):
            for industry in industries:
                tokens = str(industry).lower().split()
                icp_tokens.extend(tokens)
        elif isinstance(industries, str):
            tokens = industries.lower().split()
            icp_tokens.extend(tokens)

# Extract tech stack tokens
if 'tech_stack' in icp_filters:
    tech_stack = icp_filters['tech_stack']
    if isinstance(tech_stack, list):
        for tech in tech_stack:
            # Keep full tech names for exact matching
            icp_tokens.append(str(tech).lower())
    elif isinstance(tech_stack, str):
        icp_tokens.append(tech_stack.lower())
 
# Extract keyword tokens
if 'keywords' in icp_filters:
    keywords = icp_filters['keywords']
    if isinstance(keywords, list):
        for keyword in keywords:
            icp_tokens.append(str(keyword).lower())
    elif isinstance(keywords, str):
        icp_tokens.append(keywords.lower())
 
# Clean up tokens
icp_tokens = [token.strip() for token in icp_tokens if token.strip()]
icp_tokens = list(set(icp_tokens))  # Remove duplicates
 
print(f"✅ Extracted ICP tokens: {icp_tokens}")
 

icp_tokens = list(set([token for token in icp_tokens if token.strip()]))

# --- 6. Score companies using util.cos_sim() ---
scored_results_util = []
icp_tensor = torch.tensor([icp_embedding], dtype=torch.float32)  # Convert ICP to tensor

for point in search_result.points:
    payload = point.payload
    company_id = payload.get("company_id")
    company_vector = point.vector

    # Calculate cosine similarity using util.cos_sim()
    company_tensor = torch.tensor([company_vector], dtype=torch.float32)
    raw_similarity = util.cos_sim(icp_tensor, company_tensor)[0][0].item()
    vector_similarity_score = ((raw_similarity + 1) / 2) * 100  # Normalize to 0–100

    try:
        if isinstance(company_id, str):
            company_id = ObjectId(company_id)
    except Exception as e:
        print(f'{company_id} has invalid ObjectId: {e}')
        continue 

    company = discovered_companies.find_one({"_id": company_id})
    if not company:
        continue

    # Apply rule-based scoring
    rule_score_total, breakdown = rule_score(company, icp_tokens, icp_founded_after)

    # Final score weights
    vector_weight = 0.4
    rule_weight = 0.6

    final_score = round(
        (vector_similarity_score * vector_weight) + (rule_score_total * rule_weight),
        2
    )

    breakdown["vector_similarity"] = round(vector_similarity_score, 2)

    scored_doc = {
        "company_id": company_id,
        "icp_id": icp_id,
        "icp_version": icp_version,
        "final_score": final_score,
        "breakdown": breakdown,
        "weights": WEIGHTS,
        "last_scored": datetime.now(timezone.utc),
        "method": "util_cos_sim"
    }

    scored_companies.update_one(
        {"company_id": company_id, "icp_id": icp_id, "method": "util_cos_sim"},
        {"$set": scored_doc},
        upsert=True
    )

    scored_results_util.append((company.get("domain", "unknown"), final_score))

# --- 7. Print Top Results ---
print(f"📈 Total companies scored: {len(scored_results_util)}")

sorted_results = sorted(scored_results_util, key=lambda x: x[1], reverse=True)

print("\n🏆 TOP RESULTS (Weighted Average Method):")
for i, (domain, score) in enumerate(sorted_results[:10], 1):
    print(f"{i:2d}. {domain:20s} → {score:6.2f}%")


Total points returned from Qdrant: 5
✅ Extracted ICP tokens: ['fintech', 'aws', 'ai', 'langchain', 'gen', 'ml', 'python', 'finance']
📈 Total companies scored: 5

🏆 TOP RESULTS (Weighted Average Method):
 1. flowcast.ai          →  76.58%
 2. ssynthai.io          →  69.80%
 3. neogencloud.com      →  61.05%
 4. retico.ai            →  58.12%
 5. ddocstream.com       →  49.36%


## test code 2

In [None]:
WEIGHTS = {
    "industry": 20,
    "employee_count": 15,
    "location": 15,
    "tech_stack": 25,
    "keywords": 10,
    "founded_year": 5,
    "github_signal": 10
}
 
# --- 1. Get latest active ICP ---
latest_icp = list(icp_profiles.aggregate(pipeline))
 
if not latest_icp:
    print("❌ No active ICP found.")
    exit()
icp = latest_icp[0]
icp_id = str(icp["_id"])
icp_version = icp.get("version", 1)
 
# --- 2. Get ICP vector from Qdrant ---
 
icp_vector_resp = qdrant.retrieve(
    collection_name="icp_vectors",
    ids= [str(uuid.uuid5(uuid.NAMESPACE_DNS, f"icp:{icp_id}"))],
    with_vectors=True
)
 
if not icp_vector_resp or not icp_vector_resp[0].vector:
    print("❌ ICP vector not found.")
    exit()
 
icp_embedding = icp_vector_resp[0].vector
 
# --- 3. Get all company vectors from Qdrant ---
search_result = qdrant.query_points(
    collection_name="company_vectors",
    query=icp_embedding,
    limit=10000,
    with_payload=True,
    with_vectors=True
)
print(f"Total points returned from Qdrant: {len(search_result.points)}")
 
icp_founded_after = icp.get("filters", {}).get("founded_after", 2015)
 
# --- 4. Scoring Rule Function () ---
def rule_score(company_doc, icp_tokens, icp_founded_after, icp_field_tokens=None):
    rule_score = 0
    breakdown = {k: 0 for k in WEIGHTS}
    def safe_list(field):
        value = company_doc.get(field, [])
        if isinstance(value, str): return [value]
        if isinstance(value, list): return value
        return []
    
    # 1. INDUSTRY MATCHING
    if icp_field_tokens and 'industry' in icp_field_tokens:
        industry_tokens = icp_field_tokens['industry']
        company_industries = safe_list("industries")
        if any(token in str(v).lower() for v in company_industries for token in industry_tokens):
            rule_score += WEIGHTS["industry"]
            breakdown["industry"] = WEIGHTS["industry"]
    
    # 2. LOCATION MATCHING  
    if icp_field_tokens and 'location' in icp_field_tokens:
        location_tokens = icp_field_tokens['location']
        company_location_values = list(company_doc.get("hq_location", {}).values())
        if any(token in str(v).lower() for v in company_location_values for token in location_tokens):
            rule_score += WEIGHTS["location"]
            breakdown["location"] = WEIGHTS["location"]
    
    # 3. TECH STACK MATCHING
    if icp_field_tokens and 'tech_stack' in icp_field_tokens:
        tech_tokens = icp_field_tokens['tech_stack']
        company_tech = safe_list("tech_stack")
        if any(token in str(v).lower() for v in company_tech for token in tech_tokens):
            rule_score += WEIGHTS["tech_stack"]
            breakdown["tech_stack"] = WEIGHTS["tech_stack"]
    
    # 4. KEYWORDS MATCHING
    if icp_field_tokens and 'keywords' in icp_field_tokens:
        keyword_tokens = icp_field_tokens['keywords']
        text_fields = [
            company_doc.get("description", ""),
            company_doc.get("name", ""),
            " ".join(safe_list("industries")),
            " ".join(safe_list("tech_stack"))
        ]
        company_text = " ".join(text_fields).lower()
        if any(token in company_text for token in keyword_tokens):
            rule_score += WEIGHTS["keywords"]
            breakdown["keywords"] = WEIGHTS["keywords"]
    # 5. EMPLOYEE COUNT MATCHING (DYNAMIC)
    emp = company_doc.get("employee_count_estimate", {})
    emp_min = emp.get("min", 0)
    emp_max = emp.get("max", float('inf'))
    
    if icp_field_tokens and 'employee_count' in icp_field_tokens:
        icp_emp = icp_field_tokens['employee_count']
        if isinstance(icp_emp, dict):
            icp_min = icp_emp.get('min')
            icp_max = icp_emp.get('max')
            
            # Check if company employee count falls within the ICP range
            if not (emp_max< icp_min or emp_min > icp_max):
                rule_score += WEIGHTS["employee_count"]
                breakdown["employee_count"] = WEIGHTS["employee_count"]
    # 6. FOUNDED YEAR MATCHING (DYNAMIC)
    founded_year = company_doc.get("founded_year")
    
    if icp_field_tokens and 'founded_year' in icp_field_tokens:
        year_threshold = icp_field_tokens['founded_year']
        
        if isinstance(founded_year, int) and founded_year >= year_threshold:
            rule_score += WEIGHTS["founded_year"]
            breakdown["founded_year"] = WEIGHTS["founded_year"]
    # 7. GITHUB SIGNAL MATCHING
    urls = company_doc.get("source_urls", [])
    if any("github" in url.lower() for url in urls):
        rule_score += WEIGHTS["github_signal"]
        breakdown["github_signal"] = WEIGHTS["github_signal"]
    return rule_score, breakdown
 

# --- 5. Tokenize ICP filters ---
icp_filters = icp.get('filters', {})
icp_tokens = []
icp_field_tokens = {}  # Track tokens by field for better debugging
 
# Extract industry tokens
for field_name in ['industry', 'industries']:
    if field_name in icp_filters:
        industries = icp_filters[field_name]
        field_tokens = []
        if isinstance(industries, list):
            for industry in industries:
                tokens = str(industry).lower().split()
                field_tokens.extend(tokens)
                icp_tokens.extend(tokens)
        elif isinstance(industries, str):
            tokens = industries.lower().split()
            field_tokens.extend(tokens)
            icp_tokens.extend(tokens)
        icp_field_tokens['industry'] = field_tokens
 
# Extract location tokens
for field_name in ['location', 'locations']:
    if field_name in icp_filters:
        locations = icp_filters[field_name]
        field_tokens = []
        if isinstance(locations, list):
            for location in locations:
                tokens = str(location).lower().split()
                field_tokens.extend(tokens)
                icp_tokens.extend(tokens)
        elif isinstance(locations, str):
            tokens = locations.lower().split()
            field_tokens.extend(tokens)
            icp_tokens.extend(tokens)
        icp_field_tokens['location'] = field_tokens
 
# Extract tech stack tokens
if 'tech_stack' in icp_filters:\n    tech_stack = icp_filters['tech_stack']\n    field_tokens = []
    if isinstance(tech_stack, list):
        for tech in tech_stack:
            # Keep full tech names for exact matching
            token = str(tech).lower()
            field_tokens.append(token)
            icp_tokens.append(token)
    elif isinstance(tech_stack, str):
        token = tech_stack.lower()
        field_tokens.append(token)
        icp_tokens.append(token)
    icp_field_tokens['tech_stack'] = field_tokens
 
# Extract keyword tokens
if 'keywords' in icp_filters:\n    keywords = icp_filters['keywords']\n    field_tokens = []
    if isinstance(keywords, list):
        for keyword in keywords:
            token = str(keyword).lower()
            field_tokens.append(token)
            icp_tokens.append(token)
    elif isinstance(keywords, str):
        token = keywords.lower()
        field_tokens.append(token)
        icp_tokens.append(token)
    icp_field_tokens['keywords'] = field_tokens
 
# Extract employee count range (for matching logic)
if 'employee_count' in icp_filters:
    emp_filter = icp_filters['employee_count']
    icp_field_tokens['employee_count'] = emp_filter  # Store the range dict
 
# Extract founded year (for matching logic)
if 'founded_after' in icp_filters:\n    icp_field_tokens['founded_year'] = icp_filters['founded_after']\nelif 'founded_after' in icp:\n    icp_field_tokens['founded_year'] = icp['founded_after']
 
# Clean up tokens
icp_tokens = [token.strip() for token in icp_tokens if token.strip()]
icp_tokens = list(set(icp_tokens))  # Remove duplicates
 
print(f"✅ Extracted ICP tokens: {icp_tokens}")
print(f"✅ Field-specific tokens: {icp_field_tokens}")
 
 

icp_tokens = list(set([token for token in icp_tokens if token.strip()]))

# --- 6. Score companies using util.cos_sim() ---
scored_results_util = []
icp_tensor = torch.tensor([icp_embedding], dtype=torch.float32)  # Convert ICP to tensor

for point in search_result.points:
    payload = point.payload
    company_id = payload.get("company_id")
    company_vector = point.vector

    # Calculate cosine similarity using util.cos_sim()
    company_tensor = torch.tensor([company_vector], dtype=torch.float32)
    raw_similarity = util.cos_sim(icp_tensor, company_tensor)[0][0].item()
    vector_similarity_score = ((raw_similarity + 1) / 2) * 100  # Normalize to 0–100

    try:
        if isinstance(company_id, str):
            company_id = ObjectId(company_id)
    except Exception as e:
        print(f'{company_id} has invalid ObjectId: {e}')
        continue 

    company = discovered_companies.find_one({"_id": company_id})
    if not company:
        continue

    # Apply rule-based scoring
    rule_score_total, breakdown = rule_score(company, icp_tokens, icp_founded_after, icp_field_tokens)

    # Final score weights
    vector_weight = 0.4
    rule_weight = 0.6

    final_score = round(
        (vector_similarity_score * vector_weight) + (rule_score_total * rule_weight),
        2
    )

    breakdown["vector_similarity"] = round(vector_similarity_score, 2)

    scored_doc = {
        "company_id": company_id,
        "icp_id": icp_id,
        "icp_version": icp_version,
        "final_score": final_score,
        "breakdown": breakdown,
        "weights": WEIGHTS,
        "last_scored": datetime.now(timezone.utc),
        "method": "util_cos_sim"
    }

    scored_companies.update_one(
        {"company_id": company_id, "icp_id": icp_id, "method": "util_cos_sim"},
        {"$set": scored_doc},
        upsert=True
    )

    scored_results_util.append((company.get("domain", "unknown"), final_score))

# --- 7. Print Top Results ---
print(f"📈 Total companies scored: {len(scored_results_util)}")

sorted_results = sorted(scored_results_util, key=lambda x: x[1], reverse=True)

print("\n🏆 TOP RESULTS (Weighted Average Method):")
for i, (domain, score) in enumerate(sorted_results[:10], 1):
    print(f"{i:2d}. {domain:20s} → {score:6.2f}%")


Total points returned from Qdrant: 5
✅ Extracted ICP tokens: ['fintech', 'finance']
✅ Field-specific tokens: {'industry': ['fintech', 'finance'], 'location': [], 'employee_count': {'min': 30, 'max': 20}, 'founded_year': 2019}
📈 Total companies scored: 5

🏆 TOP RESULTS (Weighted Average Method):
 1. flowcast.ai          →  52.58%
 2. neogencloud.com      →  43.05%
 3. ssynthai.io          →  42.80%
 4. retico.ai            →  31.12%
 5. ddocstream.com       →  25.36%


#### Store to MongoDB :

In [None]:


scored_companies = db['scored_companies']

# Call this after scoring each company
def store_scored_company(company, score_tuple, icp_id, icp_version=1):
    score, breakdown, similarity = score_tuple

    company_oid = ObjectId(company["_id"])
    icp_oid = ObjectId(icp_id)

    doc = {
        "company_id": company_oid,
        "icp_id": icp_oid,
        "score": float(score),
        "breakdown": {
            **breakdown,
            "vector_similarity": float(similarity)
        },
        "last_scored": datetime.now(timezone.utc),
        "icp_version": icp_version
    }

    # 🛠 Upsert safely by matching ObjectId fields
    scored_companies.update_one(
        {"company_id": company_oid, "icp_id": icp_oid},
        {"$set": doc},
        upsert=True
    )

# Store in a different collection or add method flag
    scored_companies.update_one(
        {"company_id": company_id, "icp_id": icp_id, "method": "util_cos_sim"},
        {"$set": scored_doc},
        upsert=True
    )

In [None]:
final_scores = []
similarities = []

for doc in scored_companies.find():
    similarities.append(doc["breakdown"]["vector_similarity"])
    final_scores.append(doc["final_score"])

corr, _ = pearsonr(similarities, final_scores)
print(f"Pearson correlation (similarity vs. final score): {round(corr, 3)}")



Pearson correlation (similarity vs. final score): 0.917


In [None]:
final_scores = []
similarities = []

for doc in scored_companies.find():
    similarities.append(doc["breakdown"]["vector_similarity"])
    final_scores.append(doc["final_score"])
    
corr, _ = spearmanr(similarities, final_scores)
print(f"Spearman correlation (similarity vs. final score): {round(corr, 3)}")

Spearman correlation (similarity vs. final score): 0.935
