# performed precomputing process

developed a precoputing layer where the preprocessed data is converted embeddings and then found the cosine similarity.

In [1]:
# precomputing before sharing symptoms
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# model loading
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    return SentenceTransformer(model_name)
# loading of data and generate embeddings
def compute_embeddings(csv_path: str,
                       text_column: str,
                       limit: int = 600):

    # Clean accidental quotes in path
    csv_path = csv_path.strip().strip('"').strip("'")

    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in dataset.")

    # Select first 600 subjects
    df_subset = df.head(limit)

    texts = df_subset[text_column].astype(str).tolist()

    print(f"\nGenerating embeddings for {len(texts)} subjects...\n")

    model = load_embedding_model()

    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # IMPORTANT: Normalize embeddings for proper cosine similarity
    embeddings = normalize(embeddings, norm="l2")

    return df_subset, embeddings
# computing cosine similarity
def compute_similarity_matrix(embeddings: np.ndarray):
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

# main execution
if __name__ == "__main__":

    # Get User Inputs 
    csv_path = input("Enter full path to preprocessed CSV file:\n>> ")
    text_column = input("Enter text column name (e.g., note_preprocessed):\n>> ")

    # Step 1: Generate Embeddings 
    patient_df, stored_embeddings = compute_embeddings(
        csv_path=csv_path,
        text_column=text_column,
        limit=600
    )

    print("\nEmbedding Matrix Shape:", stored_embeddings.shape)

    # ---- Step 2: Compute Cosine Similarity ----
    print("\nComputing cosine similarity matrix...\n")

    similarity_matrix = compute_similarity_matrix(stored_embeddings)

    print("Cosine Similarity Matrix Shape:", similarity_matrix.shape)

    print("\nProcess Completed Successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Enter full path to preprocessed CSV file:
>>  C:\Users\rajak\Downloads\AI Internship\newdataset.csv
Enter text column name (e.g., note_preprocessed):
>>  note_preprocessed



Generating embeddings for 600 subjects...



Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 292.14it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 19/19 [00:20<00:00,  1.07s/it]


Embedding Matrix Shape: (600, 384)

Computing cosine similarity matrix...

Cosine Similarity Matrix Shape: (600, 600)

Process Completed Successfully.





performed the precomputing process. where the prerocessed file have beeen computed embeddings and then find the cosine similarity using the single reusable code for the existing data.

# Insight genertion layer

creating the insight generation layer 

In [2]:
# creating the insight generation layer
import json
import re
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
# clinical keywords for symptoms, treatment and otcome
SYMPTOM_KEYWORDS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness of breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress",
    "ards", "respiratory failure", "oxygen desaturation", "hypoxia","hypoxemia","tachypnea",
    "respiratory distress", "acute respiratory distress syndrome", "ards", "chest tightness",
    "wheezing", "productive cough", "hemoptysis"
]

TREATMENT_KEYWORDS = [
   "oxygen therapy",
    "supplemental oxygen",
    "high flow oxygen",
    "high flow nasal cannula",
    "hfno",
    "non invasive ventilation",
    "niv",
    "cpap",
    "bipap",
    "mechanical ventilation",
    "intubation",
    "ventilator support",
    "prone positioning",
    "respiratory support",
]

OUTCOME_KEYWORDS = {
    "discharged": ["discharged","home discharge"],
    "rehabilitation": ["rehabilitation","rehab clinic"],
    "critical_monitoring": ["icu","intensive care","critical"]
}
# similarity based function
def retrieve_similar_cases(new_embedding, top_n=5):

    global stored_embeddings

    if new_embedding.ndim == 1:
        new_embedding = new_embedding.reshape(1, -1)

    similarity_scores = cosine_similarity(
        new_embedding,
        stored_embeddings
    )[0]

    self_index = np.argmax(similarity_scores)
    similarity_scores[self_index] = -1

    ranked_indices = np.argsort(similarity_scores)[::-1]

    return ranked_indices[:top_n]

# insight generation function
def generate_case_insight(similar_cases, query_text):

    similar_texts = patient_df.iloc[similar_cases]["note_preprocessed"].tolist()

    # ---- Shared Symptoms ----
    query_words = set(query_text.split())
    symptom_counter = Counter()

    for text in similar_texts:
        words = set(text.split())
        shared = query_words.intersection(words)
        shared_symptoms = [w for w in shared if w in SYMPTOM_KEYWORDS]
        symptom_counter.update(shared_symptoms)

    most_common_symptoms = [sym for sym, _ in symptom_counter.most_common(5)]

    # ---- Treatment Extraction ----
    treatment_counter = Counter()

    for text in similar_texts:
        for treatment in TREATMENT_KEYWORDS:
            if treatment in text:
                treatment_counter.update([treatment])

    most_common_treatments = [t for t, _ in treatment_counter.most_common(3)]

    # ---- Outcome Trend ----
    outcome_trend = "Not clearly mentioned"

    for outcome, keywords in OUTCOME_KEYWORDS.items():
        count = sum(any(k in text for k in keywords) for text in similar_texts)
        if count >= 2:
            outcome_trend = outcome
            break

    # ---- Recovery Trend ----
    recovery_mentions = [
        re.findall(r'\b\d+\s*(day|week|month)s?\b', text)
        for text in similar_texts
    ]

    recovery_flat = [item for sublist in recovery_mentions for item in sublist]

    recovery_trend = "Not specified"
    if recovery_flat:
        recovery_trend = "Recovery period mentioned in similar cases"

    insight = {
        "shared_symptoms": most_common_symptoms,
        "suggested_treatment": most_common_treatments,
        "outcome_trend": outcome_trend,
        "recovery_trend": recovery_trend
    }

    return insight

# main function
if __name__ == "__main__":

    print("\nGenerating Insights for First 5 Patients\n")

    for patient_idx in range(5):

        print(f"\nPatient Index: {patient_idx}")
        print("-" * 50)

        query_embedding = stored_embeddings[patient_idx]
        query_text = patient_df.iloc[patient_idx]["note_preprocessed"]

        similar_cases = retrieve_similar_cases(
            new_embedding=query_embedding,
            top_n=5
        )

        insight = generate_case_insight(similar_cases, query_text)

        print(json.dumps(insight, indent=4))


Generating Insights for First 5 Patients


Patient Index: 0
--------------------------------------------------
{
    "shared_symptoms": [
        "cough",
        "dyspnea",
        "fever"
    ],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "supplemental oxygen"
    ],
    "outcome_trend": "discharged",
    "recovery_trend": "Not specified"
}

Patient Index: 1
--------------------------------------------------
{
    "shared_symptoms": [
        "cough",
        "dyspnea",
        "fever"
    ],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "oxygen therapy"
    ],
    "outcome_trend": "discharged",
    "recovery_trend": "Not specified"
}

Patient Index: 2
--------------------------------------------------
{
    "shared_symptoms": [],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "supplemental oxygen"
    ],
    "outcome_trend": "discharged",
    "recovery_tren

created the insight generation layer which provide the similar patients shared symptom, treatment, recovery trend and the outcome trend of the patient in the json file format. We have used the function name provided by you an the outputs are provided in the json file format.

# tresting with input samples

In [8]:
# testing the insight generation layer

import re
import json
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize
# clinical keywords
SYMPTOM_KEYWORDS = [
    "fever","dry cough","cough","fatigue","weakness",
    "dyspnea","shortness of breath","breathlessness",
    "oxygen desaturation","hypoxia","tachypnea",
    "respiratory distress","acute respiratory distress syndrome",
    "chest pain","headache","nausea","vomiting",
    "diarrhea","loss of smell","anosmia","loss of taste",
    "ageusia","confusion","delirium","cyanosis",
    "respiratory failure","shock"
]
TREATMENT_KEYWORDS = [
    "oxygen therapy","supplemental oxygen","high flow oxygen",
    "non invasive ventilation","mechanical ventilation",
    "intubation","ventilator support","prone positioning",
    "remdesivir","antiviral therapy","dexamethasone",
    "methylprednisolone","steroid therapy",
    "antibiotics","azithromycin","ceftriaxone",
    "heparin","anticoagulation therapy",
    "bronchodilator","nebulization",
    "physiotherapy","pulmonary rehabilitation",
    "breathing exercise","icu admission",
    "critical care monitoring"
]

OUTCOME_KEYWORDS = {
    "discharged": ["discharged","home discharge"],
    "rehabilitation": ["rehabilitation","rehab clinic"],
    "critical_monitoring": ["icu","intensive care","critical"]
}
# pre-processing of the data
IMPORTANT_WORDS = {"no","not","without","with","before","after","during","since"}
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS.difference(IMPORTANT_WORDS)

def preprocess_text(text):

    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    words = [w for w in words if w not in CUSTOM_STOPWORDS]

    return " ".join(words)
# loading the embedding module
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")
# similarity case module
def retrieve_similar_cases(new_embedding, top_n=3):

    global stored_embeddings

    if new_embedding.ndim == 1:
        new_embedding = new_embedding.reshape(1, -1)

    new_embedding = normalize(new_embedding, norm="l2")

    similarity_scores = cosine_similarity(
        new_embedding,
        stored_embeddings
    )[0]

    ranked_indices = np.argsort(similarity_scores)[::-1]

    top_indices = ranked_indices[:top_n]
    top_scores = similarity_scores[top_indices]

    return top_indices, top_scores

# main function
if __name__ == "__main__":

    #  USER INPUT 
    symptoms_input = input("Enter symptoms:\n>> ")
    notes_input = input("\nEnter clinical notes:\n>> ")

    combined_input = symptoms_input + " " + notes_input
    processed_text = preprocess_text(combined_input)

    #  EMBEDDING 
    model = load_embedding_model()

    query_embedding = model.encode(
        [processed_text],
        convert_to_numpy=True
    )[0]

    # RETRIEVE TOP 3 
    top_indices, top_scores = retrieve_similar_cases(
        new_embedding=query_embedding,
        top_n=3
    )

    #  MOST SIMILAR PATIENT 
    most_similar_index = top_indices[0]
    most_similar_text = patient_df.iloc[most_similar_index]["note_preprocessed"]

    # SHARED SYMPTOMS 
    query_words = set(processed_text.split())
    similar_words = set(most_similar_text.split())

    shared_symptoms = [
        word for word in query_words.intersection(similar_words)
        if word in SYMPTOM_KEYWORDS
    ]

    # TREATMENT EXTRACTION 
    treatments_found = [
        treatment for treatment in TREATMENT_KEYWORDS
        if re.search(rf"\b{re.escape(treatment)}\b", most_similar_text)
    ]

    #  OUTCOME EXTRACTION 
    outcome = "Not clearly mentioned"

    for key, keywords in OUTCOME_KEYWORDS.items():
        if any(k in most_similar_text for k in keywords):
            outcome = key
            break

    #  RECOVERY TREND 
    recovery_matches = re.findall(
        r'\b\d+\s*(day|week|month)s?\b',
        most_similar_text
    )

    recovery_trend = "Not specified"
    if recovery_matches:
        recovery_trend = "Recovery duration mentioned in similar case"

    #  JSON OUTPUT FORMATTING
    result = {
        "input_processed": processed_text,
        "nearest_similar_patients": [
            {
                "patient_id": patient_df.iloc[idx]["patient_uid"]
                if "patient_uid" in patient_df.columns else int(idx),
                "similarity_score": round(float(score), 4)
            }
            for idx, score in zip(top_indices, top_scores)
        ],
        "most_similar_patient_insight": {
            "shared_symptoms": shared_symptoms,
            "treatments": treatments_found,
            "outcome": outcome,
            "recovery_trend": recovery_trend
        }
    }

    print("\nFinal Output (JSON Format):\n")
    print(json.dumps(result, indent=4))

Enter symptoms:
>>  fever, dry cough, shortness of breath

Enter clinical notes:
>>  Patient admitted with four day history of fever and persistent dry cough associated with progressive shortness of breath. Oxygen saturation mildly reduced requiring supplemental oxygen therapy. No prior chronic respiratory illness reported.


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 550.88it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Final Output (JSON Format):

{
    "input_processed": "fever dry cough shortness breath patient admitted with day history fever persistent dry cough associated with progressive shortness breath oxygen saturation mildly reduced requiring supplemental oxygen therapy no prior chronic respiratory illness reported",
    "nearest_similar_patients": [
        {
            "patient_id": 60,
            "similarity_score": 0.6396
        },
        {
            "patient_id": 59,
            "similarity_score": 0.598
        },
        {
            "patient_id": 9,
            "similarity_score": 0.572
        }
    ],
    "most_similar_patient_insight": {
        "shared_symptoms": [
            "cough",
            "fever"
        ],
        "treatments": [
            "oxygen therapy",
            "high flow oxygen",
            "methylprednisolone",
            "azithromycin"
        ],
        "outcome": "discharged",
        "recovery_trend": "Not specified"
    }
}


tested the developed insight generation module with the new input sample. This insight generation module finds the top three similar patients, most similar shared symptoms, treatment, outcome and recovery trend in the JSON file format.

testing with the input sample 2

In [9]:
# testing the insight generation layer

import re
import json
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize
# clinical keywords
SYMPTOM_KEYWORDS = [
    "fever","dry cough","cough","fatigue","weakness",
    "dyspnea","shortness of breath","breathlessness",
    "oxygen desaturation","hypoxia","tachypnea",
    "respiratory distress","acute respiratory distress syndrome",
    "chest pain","headache","nausea","vomiting",
    "diarrhea","loss of smell","anosmia","loss of taste",
    "ageusia","confusion","delirium","cyanosis",
    "respiratory failure","shock"
]
TREATMENT_KEYWORDS = [
    "oxygen therapy","supplemental oxygen","high flow oxygen",
    "non invasive ventilation","mechanical ventilation",
    "intubation","ventilator support","prone positioning",
    "remdesivir","antiviral therapy","dexamethasone",
    "methylprednisolone","steroid therapy",
    "antibiotics","azithromycin","ceftriaxone",
    "heparin","anticoagulation therapy",
    "bronchodilator","nebulization",
    "physiotherapy","pulmonary rehabilitation",
    "breathing exercise","icu admission",
    "critical care monitoring"
]

OUTCOME_KEYWORDS = {
    "discharged": ["discharged","home discharge"],
    "rehabilitation": ["rehabilitation","rehab clinic"],
    "critical_monitoring": ["icu","intensive care","critical"]
}
# pre-processing of the data
IMPORTANT_WORDS = {"no","not","without","with","before","after","during","since"}
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS.difference(IMPORTANT_WORDS)

def preprocess_text(text):

    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    words = [w for w in words if w not in CUSTOM_STOPWORDS]

    return " ".join(words)
# loading the embedding module
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")
# similarity case module
def retrieve_similar_cases(new_embedding, top_n=3):

    global stored_embeddings

    if new_embedding.ndim == 1:
        new_embedding = new_embedding.reshape(1, -1)

    new_embedding = normalize(new_embedding, norm="l2")

    similarity_scores = cosine_similarity(
        new_embedding,
        stored_embeddings
    )[0]

    ranked_indices = np.argsort(similarity_scores)[::-1]

    top_indices = ranked_indices[:top_n]
    top_scores = similarity_scores[top_indices]

    return top_indices, top_scores

# main function
if __name__ == "__main__":

    #  USER INPUT 
    symptoms_input = input("Enter symptoms:\n>> ")
    notes_input = input("\nEnter clinical notes:\n>> ")

    combined_input = symptoms_input + " " + notes_input
    processed_text = preprocess_text(combined_input)

    #  EMBEDDING 
    model = load_embedding_model()

    query_embedding = model.encode(
        [processed_text],
        convert_to_numpy=True
    )[0]

    # RETRIEVE TOP 3 
    top_indices, top_scores = retrieve_similar_cases(
        new_embedding=query_embedding,
        top_n=3
    )

    #  MOST SIMILAR PATIENT 
    most_similar_index = top_indices[0]
    most_similar_text = patient_df.iloc[most_similar_index]["note_preprocessed"]

    # SHARED SYMPTOMS 
    query_words = set(processed_text.split())
    similar_words = set(most_similar_text.split())

    shared_symptoms = [
        word for word in query_words.intersection(similar_words)
        if word in SYMPTOM_KEYWORDS
    ]

    # TREATMENT EXTRACTION 
    treatments_found = [
        treatment for treatment in TREATMENT_KEYWORDS
        if re.search(rf"\b{re.escape(treatment)}\b", most_similar_text)
    ]

    #  OUTCOME EXTRACTION 
    outcome = "Not clearly mentioned"

    for key, keywords in OUTCOME_KEYWORDS.items():
        if any(k in most_similar_text for k in keywords):
            outcome = key
            break

    #  RECOVERY TREND 
    recovery_matches = re.findall(
        r'\b\d+\s*(day|week|month)s?\b',
        most_similar_text
    )

    recovery_trend = "Not specified"
    if recovery_matches:
        recovery_trend = "Recovery duration mentioned in similar case"

    #  JSON OUTPUT FORMATTING
    result = {
        "input_processed": processed_text,
        "nearest_similar_patients": [
            {
                "patient_id": patient_df.iloc[idx]["patient_uid"]
                if "patient_uid" in patient_df.columns else int(idx),
                "similarity_score": round(float(score), 4)
            }
            for idx, score in zip(top_indices, top_scores)
        ],
        "most_similar_patient_insight": {
            "shared_symptoms": shared_symptoms,
            "treatments": treatments_found,
            "outcome": outcome,
            "recovery_trend": recovery_trend
        }
    }

    print("\nFinal Output (JSON Format):\n")
    print(json.dumps(result, indent=4))

Enter symptoms:
>>  oxygen desaturation, dyspnea, fatigue

Enter clinical notes:
>>  Elderly male presented with worsening dyspnea and oxygen desaturation over the past two days. Patient required high flow oxygen support and close monitoring in intensive care unit. Fatigue and respiratory distress observed during minimal exertion.


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 249.14it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Final Output (JSON Format):

{
    "input_processed": "oxygen desaturation dyspnea fatigue elderly male presented with worsening dyspnea oxygen desaturation past days patient required high flow oxygen support close monitoring intensive care unit fatigue respiratory distress observed during minimal exertion",
    "nearest_similar_patients": [
        {
            "patient_id": 9,
            "similarity_score": 0.6081
        },
        {
            "patient_id": 8,
            "similarity_score": 0.5709
        },
        {
            "patient_id": 6,
            "similarity_score": 0.5533
        }
    ],
    "most_similar_patient_insight": {
        "shared_symptoms": [
            "dyspnea",
            "fatigue"
        ],
        "treatments": [
            "oxygen therapy"
        ],
        "outcome": "critical_monitoring",
        "recovery_trend": "Not specified"
    }
}


For sample 2, the developed insight generation model also produces the proper output conists of the top 3 similar patients with a more similar score, the treatment suggestion, outcome, and recovery trend of the new patient.

Testing with the input sample 3

In [10]:
# testing the insight generation layer

import re
import json
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize
# clinical keywords
SYMPTOM_KEYWORDS = [
    "fever","dry cough","cough","fatigue","weakness",
    "dyspnea","shortness of breath","breathlessness",
    "oxygen desaturation","hypoxia","tachypnea",
    "respiratory distress","acute respiratory distress syndrome",
    "chest pain","headache","nausea","vomiting",
    "diarrhea","loss of smell","anosmia","loss of taste",
    "ageusia","confusion","delirium","cyanosis",
    "respiratory failure","shock"
]
TREATMENT_KEYWORDS = [
    "oxygen therapy","supplemental oxygen","high flow oxygen",
    "non invasive ventilation","mechanical ventilation",
    "intubation","ventilator support","prone positioning",
    "remdesivir","antiviral therapy","dexamethasone",
    "methylprednisolone","steroid therapy",
    "antibiotics","azithromycin","ceftriaxone",
    "heparin","anticoagulation therapy",
    "bronchodilator","nebulization",
    "physiotherapy","pulmonary rehabilitation",
    "breathing exercise","icu admission",
    "critical care monitoring"
]

OUTCOME_KEYWORDS = {
    "discharged": ["discharged","home discharge"],
    "rehabilitation": ["rehabilitation","rehab clinic"],
    "critical_monitoring": ["icu","intensive care","critical"]
}
# pre-processing of the data
IMPORTANT_WORDS = {"no","not","without","with","before","after","during","since"}
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS.difference(IMPORTANT_WORDS)

def preprocess_text(text):

    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    words = [w for w in words if w not in CUSTOM_STOPWORDS]

    return " ".join(words)
# loading the embedding module
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")
# similarity case module
def retrieve_similar_cases(new_embedding, top_n=3):

    global stored_embeddings

    if new_embedding.ndim == 1:
        new_embedding = new_embedding.reshape(1, -1)

    new_embedding = normalize(new_embedding, norm="l2")

    similarity_scores = cosine_similarity(
        new_embedding,
        stored_embeddings
    )[0]

    ranked_indices = np.argsort(similarity_scores)[::-1]

    top_indices = ranked_indices[:top_n]
    top_scores = similarity_scores[top_indices]

    return top_indices, top_scores

# main function
if __name__ == "__main__":

    #  USER INPUT 
    symptoms_input = input("Enter symptoms:\n>> ")
    notes_input = input("\nEnter clinical notes:\n>> ")

    combined_input = symptoms_input + " " + notes_input
    processed_text = preprocess_text(combined_input)

    #  EMBEDDING 
    model = load_embedding_model()

    query_embedding = model.encode(
        [processed_text],
        convert_to_numpy=True
    )[0]

    # RETRIEVE TOP 3 
    top_indices, top_scores = retrieve_similar_cases(
        new_embedding=query_embedding,
        top_n=3
    )

    #  MOST SIMILAR PATIENT 
    most_similar_index = top_indices[0]
    most_similar_text = patient_df.iloc[most_similar_index]["note_preprocessed"]

    # SHARED SYMPTOMS 
    query_words = set(processed_text.split())
    similar_words = set(most_similar_text.split())

    shared_symptoms = [
        word for word in query_words.intersection(similar_words)
        if word in SYMPTOM_KEYWORDS
    ]

    # TREATMENT EXTRACTION 
    treatments_found = [
        treatment for treatment in TREATMENT_KEYWORDS
        if re.search(rf"\b{re.escape(treatment)}\b", most_similar_text)
    ]

    #  OUTCOME EXTRACTION 
    outcome = "Not clearly mentioned"

    for key, keywords in OUTCOME_KEYWORDS.items():
        if any(k in most_similar_text for k in keywords):
            outcome = key
            break

    #  RECOVERY TREND 
    recovery_matches = re.findall(
        r'\b\d+\s*(day|week|month)s?\b',
        most_similar_text
    )

    recovery_trend = "Not specified"
    if recovery_matches:
        recovery_trend = "Recovery duration mentioned in similar case"

    #  JSON OUTPUT FORMATTING
    result = {
        "input_processed": processed_text,
        "nearest_similar_patients": [
            {
                "patient_id": patient_df.iloc[idx]["patient_uid"]
                if "patient_uid" in patient_df.columns else int(idx),
                "similarity_score": round(float(score), 4)
            }
            for idx, score in zip(top_indices, top_scores)
        ],
        "most_similar_patient_insight": {
            "shared_symptoms": shared_symptoms,
            "treatments": treatments_found,
            "outcome": outcome,
            "recovery_trend": recovery_trend
        }
    }

    print("\nFinal Output (JSON Format):\n")
    print(json.dumps(result, indent=4))

Enter symptoms:
>>  fever, headache, loss of smell

Enter clinical notes:
>>  Patient reported low grade fever with persistent headache and recent loss of smell. Respiratory status stable without need for mechanical ventilation. Managed conservatively with monitoring and supportive care.


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 294.49it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Final Output (JSON Format):

{
    "input_processed": "fever headache loss smell patient reported low grade fever with persistent headache recent loss smell respiratory status stable without need mechanical ventilation managed conservatively with monitoring supportive care",
    "nearest_similar_patients": [
        {
            "patient_id": 188,
            "similarity_score": 0.5236
        },
        {
            "patient_id": 7,
            "similarity_score": 0.5226
        },
        {
            "patient_id": 186,
            "similarity_score": 0.495
        }
    ],
    "most_similar_patient_insight": {
        "shared_symptoms": [
            "headache",
            "fever"
        ],
        "treatments": [
            "mechanical ventilation",
            "methylprednisolone"
        ],
        "outcome": "discharged",
        "recovery_trend": "Not specified"
    }
}


The generated reusable code have been successfully tested with the new three sample patients. for the tested samples it efficiently produces the top three similar patients, shared symptoms, treatment, outcome and recovery trend have been produced for the sample input.