In [7]:
#!python -m spacy download en_core_web_sm

In [8]:
#%pip install symspellpy phonetics rapidfuzz spacy openai fuzzy


In [9]:
# %%
import sys, os
try:
    # ✅ Running from a Python script (.py file)
    TOOLS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..",))
except NameError:
    # ✅ Running from a Jupyter notebook (__file__ is not defined)
    TOOLS_PATH = os.path.abspath(os.path.join(os.getcwd(), ".."))

SRC_PATH = os.path.join(TOOLS_PATH)

if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
    print(f"✅ SRC path added: {SRC_PATH}")
else:
    print(f"🔁 SRC path already in sys.path: {SRC_PATH}")

✅ SRC path added: /home/prashant-agrawal/projects/company_talk2data/src


In [10]:
import os
import requests
from symspellpy.symspellpy import SymSpell, Verbosity
from rapidfuzz import process, fuzz
from phonetics import dmetaphone
from typing import List, Dict

from utils.path_config import get_dictionary_path

# %% 📁 Paths
Dict_PATH = get_dictionary_path

In [11]:
# Custom domain vocabulary (your FILTERABLE_FIELDS)
#FILTERABLE_FIELDS = {   
#    "fintech", "saas", "healthtech", "crm", "b2b", "b2c", "d2c",
#    "cred", "zoho", "flipkart", "founder", "funding", "revenue",
#    "valuation", "series a", "seed", "bangalore", "bengaluru", "mumbai", "hiring",
#    "employees", "team", "growth", "unicorn", "bootstrap", "zoho"
#}

# ✅ Extensive filterable fields (flattened set for priority)
FILTERABLE_FIELDS = set([
    # 🌍 Locations
    "bengaluru", "bangalore", "mumbai", "delhi", "noida", "gurgaon", "hyderabad", "chennai", "pune", "kolkata",
    "india", "remote", "usa", "new york", "london", "singapore", "dubai",

    # 🏭 Industries
    "fintech", "saas", "healthtech", "edtech", "agritech", "cleantech", "ecommerce", "logistics", "traveltech",
    "retailtech", "cybersecurity", "medtech", "insurtech", "govtech", "spacetech", "web3", "blockchain", "crm",

    # 📦 Tech & Products
    "mobile app", "web app", "api", "platform", "software", "cloud", "dashboard", "plugin", "extension", "erp",
    "analytics", "microservices", "serverless", "paas", "saas", "open source",

    # 👥 People
    "founder", "cofounder", "ceo", "cto", "cxo", "team", "employees", "staff", "leadership",

    # 💼 Company Activity
    "hiring", "layoffs", "ipo", "acquisition", "merger", "pivot", "shutdown", "exit",

    # 💰 Finance
    "funding", "valuation", "revenue", "profit", "loss", "ebitda", "runway", "investors", "bootstrap", "unicorn",
    "series a", "series b", "seed", "angel", "growth",

    # 🔖 Tags
    "high growth", "market leader", "top startup", "soonicorn", "early stage", "late stage", "yc backed",

    # 📈 Metrics
    "users", "downloads", "retention", "engagement", "mrr", "arr", "ltv", "cac", "churn", "arpu"
])


# ✅ Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

       
# ✅ Load main dictionary
dict_path = os.path.join(Dict_PATH())
print(f"🔍 Loading SymSpell dictionary from: {dict_path}")

# Check if the dictionary file exists
loaded = sym_spell.load_dictionary(dict_path, term_index=0, count_index=1)
if not loaded:
    raise RuntimeError("❌ Could not load main dictionary")

for word in FILTERABLE_FIELDS:
    word_clean = word.lower().strip()
    if word_clean and word_clean not in sym_spell.words:
        sym_spell.create_dictionary_entry(word_clean, 99999)

print("✅ Custom terms loaded into SymSpell:", len(FILTERABLE_FIELDS))

# Verify loaded dictionary (optional)
print("Loaded custom words:", list(sym_spell.words)[:10])

🔍 Loading SymSpell dictionary from: /home/prashant-agrawal/projects/company_talk2data/src/Data/frequency_dictionary_en_82_765.txt
✅ Custom terms loaded into SymSpell: 99
Loaded custom words: ['\ufeffthe', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that']


In [12]:
def symspell_correct(query: str) -> str:
    suggestion = sym_spell.lookup_compound(query, max_edit_distance=2)
    return suggestion[0].term if suggestion else query

# Example usage
query = "healtytech dcd2f compnies in oindia"
corrected_query = symspell_correct(query)
print("✅ SymSpell Corrected Query:", corrected_query)

✅ SymSpell Corrected Query: healthtech did of companies in india


In [13]:
# %pip install fuzzy
from fuzzy import DMetaphone

dmetaphone = DMetaphone()

def build_phonetic_dict(fields: set) -> dict:
    phonetic_dict = {}
    for word in fields:
        ascii_word = ''.join([c for c in word.lower() if ord(c) < 128])
        codes = dmetaphone(ascii_word)
        code = codes[0] if codes and codes[0] else None
        if code and code not in phonetic_dict:
            phonetic_dict[code] = ascii_word
    return phonetic_dict

phonetic_dict = build_phonetic_dict(FILTERABLE_FIELDS)

def phonetic_correction(query: str) -> str:
    corrected = []
    for word in query.split():
        try:
            # Clean to ASCII-only before applying phonetic
            word_ascii = word.encode("ascii", "ignore").decode("ascii").lower()
            code = dmetaphone(word_ascii)
            corrected_word = phonetic_dict.get(code, word)
            corrected.append(corrected_word)
        except Exception:
            corrected.append(word)  # fallback to original word on error
    return ' '.join(corrected)

In [14]:
# %pip install rapidfuzz
from rapidfuzz import process, fuzz

fuzzy_choices = list(FILTERABLE_FIELDS)

def fuzzy_correction(query: str) -> str:
    words = query.split()
    corrected = []
    for word in words:
        match, score, _ = process.extractOne(word, fuzzy_choices, scorer=fuzz.WRatio)
        corrected.append(match if score > 80 else word)  # Threshold = 80
    return ' '.join(corrected)

# Usage Example
#fuzzy_corrected_query = fuzzy_correction(corrected_query)
#print("✅ Fuzzy Matched Query:", fuzzy_corrected_query)

In [15]:
def dynamic_dict_expansion(new_terms: List[str]):
    for term in new_terms:
        term = term.lower().strip()
        if term not in sym_spell.words:
            sym_spell.create_dictionary_entry(term, 5000)
            print(f"📥 Added new term: {term}")

# Example
#dynamic_dict_expansion(["unacademy", "nykaa"])

In [16]:
# %pip install python-dotenv  # Uncomment if not installed
from dotenv import load_dotenv
import os

# ✅ Automatically load environment variables from .env
load_dotenv(dotenv_path="./env.env")

TOGETHER_API_KEY = os.getenv("together_ai_api_key")
print("🔑 TOGETHER_API_KEY set to:", TOGETHER_API_KEY)

🔑 TOGETHER_API_KEY set to: tgp_v1_PoHC0z_NQ9z7XluCXtUez9Jc-mV8fjDz8hmwVBccGBI


In [17]:
import requests
import os

def together_chat_mistral(query: str) -> str:
    url = "https://api.together.xyz/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {TOGETHER_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a query rephraser. Your ONLY task is to rephrase the given query "
                    "into clean English without adding, removing, or guessing any content. "
                    "Preserve all terms exactly. No tags. No classifications. No explanations."
                )
            },
            {"role": "user", "content": f"{query}"}
        ],
        "temperature": 0.3,
        "max_tokens": 100,
        "top_p": 0.9
    }

    res = requests.post(url, headers=headers, json=payload)
    if res.status_code == 200:
        return res.json()['choices'][0]['message']['content'].strip()
    else:
        return f"[ERROR {res.status_code}]: {res.text}"

In [18]:
from typing import Dict

def run_pipeline(query: str):
    print(f"\n🟢 RAW_QUERY: {query}")

    corrected_symspell = symspell_correct(query)
    print(f"🔤 SYMSPELL_CORRECTED: {corrected_symspell}")

    corrected_phonetic = phonetic_correction(corrected_symspell)
    print(f"🔊 PHONETIC_CORRECTED: {corrected_phonetic}")

    corrected_fuzzy = fuzzy_correction(corrected_phonetic)
    print(f"🔎 FUZZY_CORRECTED: {corrected_fuzzy}")

    llm_result = together_chat_mistral(corrected_fuzzy)
    print(f"🤖 LLM_NORMALIZED:\n{llm_result}\n")

    return {
        "raw_query": query,
        "symspell_corrected": corrected_symspell,
        "phonetic_corrected": corrected_phonetic,
        "fuzzy_corrected": corrected_fuzzy,
        "llm_normalized": llm_result
    }


In [19]:
test_query = "fnd sars compny byjuz in benguluru"
output = run_pipeline(test_query)

for stage, result in output.items():
    print(f"{stage.upper():<25}: {result}")

print("\n🔄 Pipeline completed successfully!")


🟢 RAW_QUERY: fnd sars compny byjuz in benguluru
🔤 SYMSPELL_CORRECTED: and cars company by us in bengaluru
🔊 PHONETIC_CORRECTED: and cars company by us in bengaluru
🔎 FUZZY_CORRECTED: and cars company by users hiring bengaluru
🤖 LLM_NORMALIZED:
Companies in Bangalore that offer car hiring services, as chosen by users.

RAW_QUERY                : fnd sars compny byjuz in benguluru
SYMSPELL_CORRECTED       : and cars company by us in bengaluru
PHONETIC_CORRECTED       : and cars company by us in bengaluru
FUZZY_CORRECTED          : and cars company by users hiring bengaluru
LLM_NORMALIZED           : Companies in Bangalore that offer car hiring services, as chosen by users.

🔄 Pipeline completed successfully!


In [20]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

query1 = "Find SAS companies in India"
query2 = "Find SaaS startups in India"

# Get embeddings
emb1 = model.encode(query1, convert_to_tensor=True)
emb2 = model.encode(query2, convert_to_tensor=True)

# Check similarity
similarity = util.pytorch_cos_sim(emb1, emb2)
print("Similarity Score:", similarity.item())

Similarity Score: 0.6872326135635376


In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

def rewrite_query(user_query):
    prompt = f"Fix and rewrite the following user search query into correct English and with proper keywords:\n'{user_query}'"

    inputs = tokenizer(prompt, return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=30)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Example
query = "Fidn companeis in Delhi for healt"
bert_query = rewrite_query(query)
sym_query = symspell_correct(query)
print("✅ SymSpell Corrected Query:", sym_query)
print("Rewritten Query:", bert_query)


✅ SymSpell Corrected Query: find companies in delhi for health
Rewritten Query: 'Fidn companeis in Delhi for healt'
