In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from sklearn.preprocessing import normalize
from collections import Counter

# Load the labelled reference data (text in "data_point", label in "domain")
df = pd.read_excel("domain_data_points.xlsx")

# Rows missing either the text or the label cannot be embedded or voted on.
# Drop them together so texts[i] and labels[i] always describe the same row,
# and coerce the text to str because the encoder only accepts strings.
labelled = df.dropna(subset=["data_point", "domain"])
texts = labelled["data_point"].astype(str).tolist()
labels = labelled["domain"].tolist()
if not texts:
    raise ValueError("domain_data_points.xlsx contains no usable (data_point, domain) rows")

# Load model and compute embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)
# With unit-length vectors the inner product used by the index below
# is the cosine similarity.
normalized_embeddings = normalize(embeddings, norm="l2")

# Create FAISS index (exact inner-product search)
dimension = normalized_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(normalized_embeddings.astype("float32"))

# Map vector ID (insertion position in the index) to domain
id_to_domain = dict(enumerate(labels))


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [4]:
# Encode and normalize the query
query = "HIV Aids"
query_embedding = model.encode([query])
normalized_query = normalize(query_embedding, norm="l2")

# Search top-k matches
k = 5
D, I = index.search(normalized_query.astype("float32"), k=k)

# Show all top-k results (may include repeated domains)
print(f"Top-{k} matched data points:")
top_domains = []
for rank, (idx, similarity) in enumerate(zip(I[0], D[0]), start=1):
    # FAISS reports -1 as the id when the index holds fewer than k vectors;
    # such padding entries have no domain to look up.
    if idx < 0:
        continue
    domain = id_to_domain[int(idx)]
    top_domains.append(domain)
    print(f"{rank}. Domain: {domain}, Similarity: {similarity:.4f}")

# Determine most frequent domain among the top-k hits
if top_domains:
    domain_counts = Counter(top_domains)
    most_common_domain, count = domain_counts.most_common(1)[0]
    print(f"\nMost frequent domain in top-{k}:", most_common_domain)
else:
    print("\nNo matches found in the index.")


Top-5 matched data points:
1. Domain: Healthcare, Similarity: 0.2324
2. Domain: Healthcare, Similarity: 0.2315
3. Domain: Healthcare, Similarity: 0.2279
4. Domain: Government Services, Similarity: 0.2273
5. Domain: Healthcare, Similarity: 0.2272

Most frequent domain in top-5: Healthcare


In [8]:
# Load the attribute catalogue (semicolon-delimited CSV)
df = pd.read_csv("Attributes_2019-20.csv", delimiter=";")

# Distinct non-null descriptions
terms = df["Description"].dropna().unique()

# Normalise each term once: trim surrounding whitespace, remove commas, lower-case
cleaned_terms = [t.strip().replace(",", "").lower() for t in terms]

# Up to 300 distinct cleaned terms. dict.fromkeys de-duplicates while keeping
# insertion order, so the sample is identical on every run; slicing a set of
# strings is not, because its iteration order depends on per-process hash
# randomisation.
sampled_terms = list(dict.fromkeys(cleaned_terms))[:300]

In [10]:
# List the columns of df to see which fields are available
df.columns

Index(['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name',
       'Description'],
      dtype='object')

In [12]:
# Pull the two columns used downstream out of the frame as plain Python lists
cols = df["Description"].tolist()
tab = df["Table_name"].tolist()

In [None]:
# Assign each attribute description the majority domain among its k nearest
# reference data points in the FAISS index.
k = 5  # neighbours inspected per description

# Missing descriptions (NaN) cannot be tokenized by the encoder, so skip them
# and coerce everything else to text.
queries = [str(description) for description in cols if pd.notna(description)]

if queries:
    # One batched encode and one batched search replace a model call per row.
    query_embeddings = model.encode(queries, show_progress_bar=True)
    normalized_queries = normalize(query_embeddings, norm="l2")
    all_similarities, all_ids = index.search(normalized_queries.astype("float32"), k=k)
else:
    # Nothing to classify; the reporting loop below then does not run.
    all_similarities, all_ids = [], []

for query, similarities, neighbour_ids in zip(queries, all_similarities, all_ids):
    # Show all top-k results (may include repeated domains)
    print(f"Top-{k} matched data points:")
    top_domains = []
    for rank, (idx, similarity) in enumerate(zip(neighbour_ids, similarities), start=1):
        # FAISS reports -1 as the id when the index holds fewer than k vectors;
        # such padding entries have no domain to look up.
        if idx < 0:
            continue
        domain = id_to_domain[int(idx)]
        top_domains.append(domain)
        print(f"{rank}. Domain: {domain}, Similarity: {similarity:.4f}")

    # Determine most frequent domain among the top-k hits
    if top_domains:
        domain_counts = Counter(top_domains)
        most_common_domain, count = domain_counts.most_common(1)[0]
        print(query, f"\nMost frequent domain in top-{k}:", most_common_domain)
    else:
        print(query, "\nNo matches found in the index.")
    print("----------\n\n")


Top-5 matched data points:
1. Domain: Finance & Banking, Similarity: 0.2434
2. Domain: Travel & Hospitality, Similarity: 0.2379
3. Domain: Travel & Hospitality, Similarity: 0.2158
4. Domain: E-commerce & Retail, Similarity: 0.2148
5. Domain: Government Services, Similarity: 0.2146
Taluks 
Most frequent domain in top-5: Travel & Hospitality
----------


Top-5 matched data points:
1. Domain: E-commerce & Retail, Similarity: 0.2351
2. Domain: E-commerce & Retail, Similarity: 0.2308
3. Domain: E-commerce & Retail, Similarity: 0.2155
4. Domain: E-commerce & Retail, Similarity: 0.2046
5. Domain: Startups and IT Services, Similarity: 0.2026
Hoblies 
Most frequent domain in top-5: E-commerce & Retail
----------


Top-5 matched data points:
1. Domain: Government Services, Similarity: 0.2786
2. Domain: Startups and IT Services, Similarity: 0.2669
3. Domain: Finance & Banking, Similarity: 0.2650
4. Domain: Government Services, Similarity: 0.2522
5. Domain: Employment & HR Tech, Similarity: 0.2405