# performed precomputing process

developed a precoputing layer where the preprocessed data is converted embeddings and then found the cosine similarity.

In [1]:
# precomputing before sharing symptoms
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# model loading
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    return SentenceTransformer(model_name)
# loading of data and generate embeddings
def compute_embeddings(csv_path: str,
                       text_column: str,
                       limit: int = 600):

    # Clean accidental quotes in path
    csv_path = csv_path.strip().strip('"').strip("'")

    df = pd.read_csv(csv_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in dataset.")

    # Select first 600 subjects
    df_subset = df.head(limit)

    texts = df_subset[text_column].astype(str).tolist()

    print(f"\nGenerating embeddings for {len(texts)} subjects...\n")

    model = load_embedding_model()

    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # IMPORTANT: Normalize embeddings for proper cosine similarity
    embeddings = normalize(embeddings, norm="l2")

    return df_subset, embeddings
# computing cosine similarity
def compute_similarity_matrix(embeddings: np.ndarray):
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

# main execution
if __name__ == "__main__":

    # Get User Inputs 
    csv_path = input("Enter full path to preprocessed CSV file:\n>> ")
    text_column = input("Enter text column name (e.g., note_preprocessed):\n>> ")

    # Step 1: Generate Embeddings 
    patient_df, stored_embeddings = compute_embeddings(
        csv_path=csv_path,
        text_column=text_column,
        limit=600
    )

    print("\nEmbedding Matrix Shape:", stored_embeddings.shape)

    # ---- Step 2: Compute Cosine Similarity ----
    print("\nComputing cosine similarity matrix...\n")

    similarity_matrix = compute_similarity_matrix(stored_embeddings)

    print("Cosine Similarity Matrix Shape:", similarity_matrix.shape)

    print("\nProcess Completed Successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Enter full path to preprocessed CSV file:
>>  C:\Users\rajak\Downloads\AI Internship\newdataset.csv
Enter text column name (e.g., note_preprocessed):
>>  note_preprocessed



Generating embeddings for 600 subjects...



Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 221.85it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 19/19 [00:35<00:00,  1.85s/it]


Embedding Matrix Shape: (600, 384)

Computing cosine similarity matrix...

Cosine Similarity Matrix Shape: (600, 600)

Process Completed Successfully.





performed the precomputing process. where the prerocessed file have beeen computed embeddings and then find the cosine similarity using the single reusable code for the existing data.

# Insight genertion layer

creating the insight generation layer 

In [4]:
# creating the insight generation layer
import json
import re
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
# clinical keywords for symptoms, treatment and otcome
SYMPTOM_KEYWORDS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness of breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress",
    "ards", "respiratory failure", "oxygen desaturation", "hypoxia","hypoxemia","tachypnea",
    "respiratory distress", "acute respiratory distress syndrome", "ards", "chest tightness",
    "wheezing", "productive cough", "hemoptysis"
]

TREATMENT_KEYWORDS = [
   "oxygen therapy",
    "supplemental oxygen",
    "high flow oxygen",
    "high flow nasal cannula",
    "hfno",
    "non invasive ventilation",
    "niv",
    "cpap",
    "bipap",
    "mechanical ventilation",
    "intubation",
    "ventilator support",
    "prone positioning",
    "respiratory support",
]

OUTCOME_KEYWORDS = {
    "discharged": ["discharged","home discharge"],
    "rehabilitation": ["rehabilitation","rehab clinic"],
    "critical_monitoring": ["icu","intensive care","critical"]
}
# similarity based function
def retrieve_similar_cases(new_embedding, top_n=5):

    global stored_embeddings

    if new_embedding.ndim == 1:
        new_embedding = new_embedding.reshape(1, -1)

    similarity_scores = cosine_similarity(
        new_embedding,
        stored_embeddings
    )[0]

    self_index = np.argmax(similarity_scores)
    similarity_scores[self_index] = -1

    ranked_indices = np.argsort(similarity_scores)[::-1]

    return ranked_indices[:top_n]

# insight generation function
def generate_case_insight(similar_cases, query_text):

    similar_texts = patient_df.iloc[similar_cases]["note_preprocessed"].tolist()

    # ---- Shared Symptoms ----
    query_words = set(query_text.split())
    symptom_counter = Counter()

    for text in similar_texts:
        words = set(text.split())
        shared = query_words.intersection(words)
        shared_symptoms = [w for w in shared if w in SYMPTOM_KEYWORDS]
        symptom_counter.update(shared_symptoms)

    most_common_symptoms = [sym for sym, _ in symptom_counter.most_common(5)]

    # ---- Treatment Extraction ----
    treatment_counter = Counter()

    for text in similar_texts:
        for treatment in TREATMENT_KEYWORDS:
            if treatment in text:
                treatment_counter.update([treatment])

    most_common_treatments = [t for t, _ in treatment_counter.most_common(3)]

    # ---- Outcome Trend ----
    outcome_trend = "Not clearly mentioned"

    for outcome, keywords in OUTCOME_KEYWORDS.items():
        count = sum(any(k in text for k in keywords) for text in similar_texts)
        if count >= 2:
            outcome_trend = outcome
            break

    # ---- Recovery Trend ----
    recovery_mentions = [
        re.findall(r'\b\d+\s*(day|week|month)s?\b', text)
        for text in similar_texts
    ]

    recovery_flat = [item for sublist in recovery_mentions for item in sublist]

    recovery_trend = "Not specified"
    if recovery_flat:
        recovery_trend = "Recovery period mentioned in similar cases"

    insight = {
        "shared_symptoms": most_common_symptoms,
        "suggested_treatment": most_common_treatments,
        "outcome_trend": outcome_trend,
        "recovery_trend": recovery_trend
    }

    return insight

# main function
if __name__ == "__main__":

    print("\nGenerating Insights for First 5 Patients\n")

    for patient_idx in range(5):

        print(f"\nPatient Index: {patient_idx}")
        print("-" * 50)

        query_embedding = stored_embeddings[patient_idx]
        query_text = patient_df.iloc[patient_idx]["note_preprocessed"]

        similar_cases = retrieve_similar_cases(
            new_embedding=query_embedding,
            top_n=5
        )

        insight = generate_case_insight(similar_cases, query_text)

        print(json.dumps(insight, indent=4))


Generating Insights for First 5 Patients


Patient Index: 0
--------------------------------------------------
{
    "shared_symptoms": [
        "cough",
        "dyspnea",
        "fever"
    ],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "supplemental oxygen"
    ],
    "outcome_trend": "discharged",
    "recovery_trend": "Not specified"
}

Patient Index: 1
--------------------------------------------------
{
    "shared_symptoms": [
        "cough",
        "dyspnea",
        "fever"
    ],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "oxygen therapy"
    ],
    "outcome_trend": "discharged",
    "recovery_trend": "Not specified"
}

Patient Index: 2
--------------------------------------------------
{
    "shared_symptoms": [],
    "suggested_treatment": [
        "intubation",
        "prone positioning",
        "supplemental oxygen"
    ],
    "outcome_trend": "discharged",
    "recovery_tren

created the insight generation layer which provide the similar patients shared symptom, treatment, recovery trend and the outcome trend of the patient in the json file format. We have used the function name provided by you an the outputs are provided in the json file format.

# tresting with input samples