In [6]:
import pandas as pd
from fuzzywuzzy import fuzz, process
from sentence_transformers import SentenceTransformer, util
import torch
namaste_df = pd.read_excel("NAMASTE_sampled_100_each.xlsx").fillna("")
icd_df = pd.read_csv("SimpleTabulation-ICD-11-MMS-en.csv").fillna("")
# -------------------------------
# Step 1: Fuzzy Matching
# -------------------------------
def fuzzy_map(term, choices, threshold=80):
    best_match, score = process.extractOne(term, choices, scorer=fuzz.token_sort_ratio)
    if score >= threshold:
        return best_match, score
    return None, None

fuzzy_results = []
for term in namaste_df["NAMC_TERM"].str.lower().str.strip():
    match, score = fuzzy_map(term, icd_df["Title"].str.lower().str.strip().tolist(), threshold=80)
    if match:
        icd_code = icd_df.loc[icd_df["Title"].str.lower().str.strip() == match, "Code"].values[0]
        fuzzy_results.append((term, match, icd_code, score))

fuzzy_df = pd.DataFrame(fuzzy_results, columns=["NAMASTE_term", "ICD_match", "ICD_code", "Score"])
fuzzy_df["Confidence"] = fuzzy_df["Score"].astype(str) + "% (Fuzzy)"
# Keep track of NAMASTE terms already matched by fuzzy
fuzzy_terms = set(fuzzy_df["NAMASTE_term"].str.lower())
# -------------------------------
# Step 2: Semantic Embeddings
# -------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")

namaste_df["description"] = (
    namaste_df["Short_definition"].fillna("") + " " + namaste_df["Long_definition"].fillna("")
).str.strip()
icd_df["description"] = icd_df["Title"].fillna("").str.strip()

namaste_embeddings = model.encode(namaste_df["description"].tolist(), convert_to_tensor=True)
icd_embeddings = model.encode(icd_df["description"].tolist(), convert_to_tensor=True)

semantic_results = []
for i, nam_term in enumerate(namaste_df["description"]):
    term_lower = namaste_df.iloc[i]["NAMC_TERM"].lower()
    if term_lower in fuzzy_terms:
        continue  # skip terms already matched by fuzzy

    sims = util.cos_sim(namaste_embeddings[i], icd_embeddings)[0]
    best_idx = torch.argmax(sims).item()
    score = sims[best_idx].item()
    if score > 0.7:
        semantic_results.append((
            namaste_df.iloc[i]["NAMC_TERM"],
            icd_df.iloc[best_idx]["Title"],
            icd_df.iloc[best_idx]["Code"],
            round(score, 3)
        ))

semantic_df = pd.DataFrame(semantic_results, columns=["NAMASTE_term", "ICD_match", "ICD_code", "Similarity"])
semantic_df["Confidence"] = (semantic_df["Similarity"]*100).astype(int).astype(str) + "% (Semantic)"
# -------------------------------
# Step 3: Ontology Fallback
# -------------------------------
def get_parent_code(code):
    if isinstance(code, str) and "." in code:
        return code.split(".")[0]
    return code

ontology_df = semantic_df.copy()
ontology_df["Parent_code"] = ontology_df["ICD_code"].apply(get_parent_code)

# -------------------------------
# Final Combined Output
# -------------------------------
final_mapping = pd.concat([
    fuzzy_df[["NAMASTE_term", "ICD_match", "ICD_code", "Confidence"]],
    semantic_df[["NAMASTE_term", "ICD_match", "ICD_code", "Confidence"]],
    ontology_df[["NAMASTE_term", "ICD_match", "Parent_code"]].rename(columns={"Parent_code": "ICD_code"})
], ignore_index=True)

final_mapping.to_excel("NAMASTE_ICD_Mappings_noExact.xlsx", index=False)
print("Final mappings saved:", final_mapping.shape)
import os
print("Current working directory:", os.getcwd())



Final mappings saved: (58, 4)
Current working directory: /Users/rahulmendes/Desktop/sih 1/fuzzyfuzzy
