# Grouping of data

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# loading of pre-processed data
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
# creating a unique patient ID
df = df.reset_index(drop=True)
df["patient_uid"] = ["P{:05d}".format(i) for i in range(len(df))]
# TF-IDF vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# marked the severity indicators 
severity_indicators = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]
severity_indices = [
    i for i, term in enumerate(feature_names)
    if any(key in term for key in severity_indicators)
]
# Compute severity score per patient
severity_scores = tfidf_vectors[:, severity_indices].sum(axis=1).A1
df["severity_score"] = severity_scores
# clustering severity into three groups
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(
    df[["severity_score"]]
)
# mapping clusters into labels 
cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)
severity_mapping = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_mapping)
# final output
print("\n SEVERITY DISTRIBUTION \n")
print(df["severity_level"].value_counts())

print("\n SAMPLE PATIENT SEVERITY \n")
print(df[["patient_uid", "severity_score", "severity_level"]].head(10))



 SEVERITY DISTRIBUTION 

severity_level
Low       489
Medium     82
High       29
Name: count, dtype: int64

 SAMPLE PATIENT SEVERITY 

  patient_uid  severity_score severity_level
0      P00000        0.571965           High
1      P00001        0.532564           High
2      P00002        0.983853           High
3      P00003        0.460651           High
4      P00004        0.504159           High
5      P00005        0.000000            Low
6      P00006        0.425007           High
7      P00007        0.470549           High
8      P00008        0.370591           High
9      P00009        0.546997           High


from this we have the severity level of the each subject based on the clinical terms. Then based on the severity level the patients have been grouped into high severity, medium severity and low severity. This helps to predict the severity level of the new from the existing case.

# loading the cosine similarity

In [7]:
import pandas as pd
import numpy as np
# loading the cosine similarity matrix
similarity_df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv",
    index_col=0
)
# Ensure indices are strings
similarity_df.index = similarity_df.index.astype(str)
similarity_df.columns = similarity_df.columns.astype(str)
# removing the self-similarity
np.fill_diagonal(similarity_df.values, np.nan)
# converting to the dataframe
pairs = (
    similarity_df
    .stack()
    .reset_index()
)
pairs.columns = ["patient_1", "patient_2", "cosine_similarity"]
# removing the duplicate pairs
pairs["pair_key"] = pairs.apply(
    lambda x: "_".join(sorted([x["patient_1"], x["patient_2"]])),
    axis=1
)
pairs = pairs.drop_duplicates(subset="pair_key")
pairs = pairs.drop(columns="pair_key")
# top 10 most similar pairs
top_10_similar = pairs.sort_values(
    by="cosine_similarity",
    ascending=False
).head(10)
# top 10 dissimilar pairs
top_10_dissimilar = pairs.sort_values(
    by="cosine_similarity",
    ascending=True
).head(10)
# displaying results
print("\n TOP 10 MOST SIMILAR PATIENT PAIRS \n")
print(top_10_similar)
print("\n TOP 10 MOST DISIMILAR PATIENT PAIRS \n")
print(top_10_dissimilar)


 TOP 10 MOST SIMILAR PATIENT PAIRS 

       patient_1 patient_2  cosine_similarity
258004       452       457           0.781737
258000       452       453           0.709339
44400         79        80           0.701907
256805       450       456           0.688043
258603       453       457           0.660064
43801         78        80           0.653648
43800         78        79           0.646245
112201       198       200           0.631032
32400         58        59           0.628265
347400       612       613           0.622893

 TOP 10 MOST DISIMILAR PATIENT PAIRS 

       patient_1 patient_2  cosine_similarity
139427       244       486           0.005071
139457       244       520           0.006475
139401       244       456           0.006761
89100        156       475           0.006802
294039       520       562           0.007074
92547        162       317           0.007128
222679       391       477           0.007155
100481       178       475           0.007320
44

Loaded the existing cosine similarity score file and found the top similar and disimilar pairs. Based on this similarity value we can be able to easily find the most similar existing patient to the ne patient.

# New patient 1

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# loading of pre-processed data
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns, "Missing 'note_preprocessed' column"
# Create unique patient ID
df = df.reset_index(drop=True)
df["patient_uid"] = ["P{:05d}".format(i) for i in range(len(df))]
# vectorisation of the existing data
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_existing = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# getting the user input to process
new_case_text = input("\nEnter new patient clinical terms:\n")
# displaying the original input
print("\n  ORIGINAL INPUT \n")
print(new_case_text)
# Match dataset assumptions
new_case_text = new_case_text.lower()
# Vectorize input using SAME TF-IDF space
tfidf_input = vectorizer.transform([new_case_text])
# finding the most similar existing patient
similarity_scores = cosine_similarity(tfidf_input, tfidf_existing)[0]
most_similar_index = similarity_scores.argmax()
most_similar_score = similarity_scores[most_similar_index]
most_similar_patient_id = df.loc[most_similar_index, "patient_uid"]
most_similar_note = df.loc[most_similar_index, "note_preprocessed"]
# extracting the clinical shared medical terms
def get_top_tfidf_terms(tfidf_vector, top_n=15):
    dense_vec = tfidf_vector.toarray().flatten()
    top_indices = dense_vec.argsort()[-top_n:][::-1]
    return set(feature_names[top_indices])
# Top TF-IDF terms from input and similar patient
input_top_terms = get_top_tfidf_terms(tfidf_input, top_n=15)
patient_top_terms = get_top_tfidf_terms(
    tfidf_existing[most_similar_index], top_n=15
)
# getting medically meaningful terms
shared_clinical_terms = sorted(
    input_top_terms.intersection(patient_top_terms)
)
# displaying the result
print("\n  MOST SIMILAR EXISTING PATIENT ")
print("Patient ID:", most_similar_patient_id)
print("Cosine Similarity Score:", round(most_similar_score, 4))

print("\n SHARED CLINICAL TERMS (TF-IDF FILTERED) ")
print(", ".join(shared_clinical_terms))



Enter new patient clinical terms:
 A 60 year-old male presented with fever, persistent dry cough, and shortness of breath. The patient experienced oxygen desaturation on exertion and was diagnosed with viral pneumonia.



  ORIGINAL INPUT 

A 60 year-old male presented with fever, persistent dry cough, and shortness of breath. The patient experienced oxygen desaturation on exertion and was diagnosed with viral pneumonia.

  MOST SIMILAR EXISTING PATIENT 
Patient ID: P00000
Cosine Similarity Score: 0.2423

 SHARED CLINICAL TERMS (TF-IDF FILTERED) 
desaturation, oxygen desaturation


finding the most similar existing patient for the new given input. from this we also find the most important shared common symptoms.

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
# loading of pre-processed data
df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
)
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns, "note_preprocessed column missing"
# creating a unique patient ID
df = df.reset_index(drop=True)
df["patient_uid"] = ["P{:05d}".format(i) for i in range(len(df))]
# Tf IDF vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# severity scoring 
severity_indicators = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]

severity_indices = [
    i for i, term in enumerate(feature_names)
    if any(key in term for key in severity_indicators)
]

severity_scores = tfidf_vectors[:, severity_indices].sum(axis=1).A1
df["severity_score"] = severity_scores
# clustering the subjects based on the severity level
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(df[["severity_score"]])

cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)

severity_mapping = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}

df["severity_level"] = df["severity_cluster"].map(severity_mapping)
# providing user input
print("\nEnter new patient clinical description:")
new_patient_text = input().lower()

tfidf_input = vectorizer.transform([new_patient_text])
# finding top 5 existing similar patient
similarity_scores = cosine_similarity(tfidf_input, tfidf_vectors)[0]
top_5_indices = similarity_scores.argsort()[-5:][::-1]
top_5_similar = df.iloc[top_5_indices].copy()
top_5_similar["cosine_similarity"] = similarity_scores[top_5_indices]
# zinfering the severity level for the new patient
severity_weighted = {}

for _, row in top_5_similar.iterrows():
    severity_weighted[row["severity_level"]] = (
        severity_weighted.get(row["severity_level"], 0)
        + row["cosine_similarity"]
    )

predicted_severity = max(severity_weighted, key=severity_weighted.get)
# displaying result
print("\n TOP 5 MOST SIMILAR PATIENTS \n")
print(
    top_5_similar[
        ["patient_uid", "cosine_similarity"]
    ].to_string(index=False)
)

print("\n INFERRED SEVERITY FOR NEW PATIENT ")
print("Predicted Severity Level:", predicted_severity)



Enter new patient clinical description:


 A 60 year-old male presented with fever, persistent dry cough, and shortness of breath. The patient experienced oxygen desaturation on exertion and was diagnosed with viral pneumonia.



 TOP 5 MOST SIMILAR PATIENTS 

patient_uid  cosine_similarity
     P00000           0.208419
     P00005           0.189355
     P00131           0.163337
     P00191           0.142388
     P00002           0.142220

 INFERRED SEVERITY FOR NEW PATIENT 
Predicted Severity Level: High


from these code we have found the top 5 similar existing patient and the severity level of the new patient. This severity level would be the indicator for the clinicians.

In [9]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
# loading of the pre-processed data
df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
)
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
df = df.reset_index(drop=True)
df["patient_uid"] = [f"P{i:05d}" for i in range(len(df))]
# vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# key words indicating severity
severity_terms = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]

severity_idx = [
    i for i, term in enumerate(feature_names)
    if any(k in term for k in severity_terms)
]

df["severity_score"] = tfidf_vectors[:, severity_idx].sum(axis=1).A1

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(df[["severity_score"]])

cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)

severity_map = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_map)
# user input 
print("\nEnter new patient clinical terms:")
new_text = input().lower()
tfidf_input = vectorizer.transform([new_text])
# Top 5 similar patient
sim_scores = cosine_similarity(tfidf_input, tfidf_vectors)[0]
top_5_idx = sim_scores.argsort()[-5:][::-1]
top_5 = df.iloc[top_5_idx].copy()
# symptom extraction
SYMPTOM_TERMS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress"
]

def extract_symptoms(text):
    found = []
    for term in SYMPTOM_TERMS:
        if term in text:
            found.append(term)
    return found

symptoms = []
for idx in top_5_idx:
    symptoms.extend(extract_symptoms(df.loc[idx, "note_preprocessed"]))

top_symptoms = [s for s, _ in Counter(symptoms).most_common(3)]
# treatment extraction
TREATMENT_TERMS = [
    "oxygen therapy", "mechanical ventilation", "ventilation",
    "intubation", "ecmo",
    "physiotherapy", "physical therapy",
    "breathing exercise", "rehabilitation",
    "corticosteroid", "steroid",
    "antibiotic", "antiviral",
    "dialysis", "supportive care"
]

def extract_treatments(text):
    return [t for t in TREATMENT_TERMS if t in text]

treatments = []
for idx in top_5_idx:
    treatments.extend(extract_treatments(df.loc[idx, "note_preprocessed"]))

top_treatments = [t for t, _ in Counter(treatments).most_common(3)]
# recovery time 
def extract_days(text):
    return [int(x) for x in re.findall(r'day\s*(\d+)', text)]

days = []
for idx in top_5_idx:
    days.extend(extract_days(df.loc[idx, "note_preprocessed"]))

estimated_recovery = (
    f"{int(np.median(days))} days"
    if days else "Not explicitly stated"
)
# final output
print("\n MOST SIMILAR PATIENTS ")
print(top_5[["patient_uid" ]])

print("\n SHARED CLINICAL SYMPTOMS ")
for s in top_symptoms:
    print("-", s)

print("\n TREATMENT ADOPTED IN SIMILAR CASES")
for t in top_treatments:
    print("-", t)

print("\n ESTIMATED RECOVERY PERIOD")
print(estimated_recovery)



Enter new patient clinical terms:


 A 60 year-old male presented with fever, persistent dry cough, and shortness of breath. The patient experienced oxygen desaturation on exertion and was diagnosed with viral pneumonia.



 MOST SIMILAR PATIENTS 
    patient_uid
0        P00000
5        P00005
131      P00131
191      P00191
2        P00002

 SHARED CLINICAL SYMPTOMS 
- cough
- fever
- dry cough

 TREATMENT ADOPTED IN SIMILAR CASES
- rehabilitation
- physical therapy
- breathing exercise

 ESTIMATED RECOVERY PERIOD
Not explicitly stated


From the above given code we have found the the treatment that need to be provided to the new patient based on the existing patient records. Then also found the recovery period and shared symptoms among the existing patients.

Created a full system which find the similar existing patients, severity level, treatment adopted in similar cases and suggests the extimated recovery period for the new subject.

# Testing of full system

Testing with case 1

In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
# loading of the pre-processed data
df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
)
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
df = df.reset_index(drop=True)
df["patient_uid"] = [f"P{i:05d}" for i in range(len(df))]
# vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# key words indicating severity
severity_terms = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]

severity_idx = [
    i for i, term in enumerate(feature_names)
    if any(k in term for k in severity_terms)
]

df["severity_score"] = tfidf_vectors[:, severity_idx].sum(axis=1).A1

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(df[["severity_score"]])

cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)

severity_map = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_map)
# user input 
print("\nEnter new patient clinical terms:")
new_text = input().lower()
tfidf_input = vectorizer.transform([new_text])
# Top 5 similar patient
sim_scores = cosine_similarity(tfidf_input, tfidf_vectors)[0]
top_5_idx = sim_scores.argsort()[-5:][::-1]
top_5 = df.iloc[top_5_idx].copy()
# severity level indications
severity_weighted = {}
for i in top_5_idx:
    level = df.loc[i, "severity_level"]
    score = sim_scores[i]
    severity_weighted[level] = severity_weighted.get(level, 0) + score
predicted_severity = max(severity_weighted, key=severity_weighted.get)

# symptom extraction
SYMPTOM_TERMS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress"
]

def extract_symptoms(text):
    found = []
    for term in SYMPTOM_TERMS:
        if term in text:
            found.append(term)
    return found

symptoms = []
for idx in top_5_idx:
    symptoms.extend(extract_symptoms(df.loc[idx, "note_preprocessed"]))

top_symptoms = [s for s, _ in Counter(symptoms).most_common(3)]
# treatment extraction
TREATMENT_TERMS = [
    "oxygen therapy", "mechanical ventilation", "ventilation",
    "intubation", "ecmo",
    "physiotherapy", "physical therapy",
    "breathing exercise", "rehabilitation",
    "corticosteroid", "steroid",
    "antibiotic", "antiviral",
    "dialysis", "supportive care"
]

def extract_treatments(text):
    return [t for t in TREATMENT_TERMS if t in text]

treatments = []
for idx in top_5_idx:
    treatments.extend(extract_treatments(df.loc[idx, "note_preprocessed"]))

top_treatments = [t for t, _ in Counter(treatments).most_common(3)]
# recovery time 
def extract_days(text):
    return [int(x) for x in re.findall(r'day\s*(\d+)', text)]

days = []
for idx in top_5_idx:
    days.extend(extract_days(df.loc[idx, "note_preprocessed"]))

estimated_recovery = (
    f"{int(np.median(days))} days"
    if days else "Not explicitly stated"
)
# final output
print("\n MOST SIMILAR PATIENTS ")
print(top_5[["patient_uid" ]])

print("\n SHARED CLINICAL SYMPTOMS ")
for s in top_symptoms:
    print("-", s)
    
print("\n INFERRED SEVERITY LEVEL FOR NEW PATIENT")
print(predicted_severity)

print("\n TREATMENT ADOPTED IN SIMILAR CASES")
for t in top_treatments:
    print("-", t)

print("\n ESTIMATED RECOVERY PERIOD")
print(estimated_recovery)


Enter new patient clinical terms:


 The patient developed worsening shortness of breath with persistent cough and fever, accompanied by hypoxia and oxygen desaturation requiring close respiratory monitoring.



 MOST SIMILAR PATIENTS 
    patient_uid
0        P00000
2        P00002
59       P00059
1        P00001
129      P00129

 SHARED CLINICAL SYMPTOMS 
- fever
- cough
- dry cough

 INFERRED SEVERITY LEVEL FOR NEW PATIENT
High

 TREATMENT ADOPTED IN SIMILAR CASES
- physical therapy
- breathing exercise
- rehabilitation

 ESTIMATED RECOVERY PERIOD
Not explicitly stated


Based on the input provided the NLP model produces a proper most similar patients, shared clinical symptoms among the existing patient, severity level of the new patient, treatment suggestions based on the existing cases and estimated recovery period.

Tested with the new sample case 2

In [3]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
# loading of the pre-processed data
df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
)
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
df = df.reset_index(drop=True)
df["patient_uid"] = [f"P{i:05d}" for i in range(len(df))]
# vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# key words indicating severity
severity_terms = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]

severity_idx = [
    i for i, term in enumerate(feature_names)
    if any(k in term for k in severity_terms)
]

df["severity_score"] = tfidf_vectors[:, severity_idx].sum(axis=1).A1

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(df[["severity_score"]])

cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)

severity_map = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_map)
# user input 
print("\nEnter new patient clinical terms:")
new_text = input().lower()
tfidf_input = vectorizer.transform([new_text])
# Top 5 similar patient
sim_scores = cosine_similarity(tfidf_input, tfidf_vectors)[0]
top_5_idx = sim_scores.argsort()[-5:][::-1]
top_5 = df.iloc[top_5_idx].copy()
# severity level indications
severity_weighted = {}
for i in top_5_idx:
    level = df.loc[i, "severity_level"]
    score = sim_scores[i]
    severity_weighted[level] = severity_weighted.get(level, 0) + score
predicted_severity = max(severity_weighted, key=severity_weighted.get)

# symptom extraction
SYMPTOM_TERMS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress"
]

def extract_symptoms(text):
    found = []
    for term in SYMPTOM_TERMS:
        if term in text:
            found.append(term)
    return found

symptoms = []
for idx in top_5_idx:
    symptoms.extend(extract_symptoms(df.loc[idx, "note_preprocessed"]))

top_symptoms = [s for s, _ in Counter(symptoms).most_common(3)]
# treatment extraction
TREATMENT_TERMS = [
    "oxygen therapy", "mechanical ventilation", "ventilation",
    "intubation", "ecmo",
    "physiotherapy", "physical therapy",
    "breathing exercise", "rehabilitation",
    "corticosteroid", "steroid",
    "antibiotic", "antiviral",
    "dialysis", "supportive care"
]

def extract_treatments(text):
    return [t for t in TREATMENT_TERMS if t in text]

treatments = []
for idx in top_5_idx:
    treatments.extend(extract_treatments(df.loc[idx, "note_preprocessed"]))

top_treatments = [t for t, _ in Counter(treatments).most_common(3)]
# recovery time 
def extract_days(text):
    return [int(x) for x in re.findall(r'day\s*(\d+)', text)]

days = []
for idx in top_5_idx:
    days.extend(extract_days(df.loc[idx, "note_preprocessed"]))

estimated_recovery = (
    f"{int(np.median(days))} days"
    if days else "Not explicitly stated"
)
# final output
print("\n MOST SIMILAR PATIENTS ")
print(top_5[["patient_uid" ]])

print("\n SHARED CLINICAL SYMPTOMS ")
for s in top_symptoms:
    print("-", s)
    
print("\n INFERRED SEVERITY LEVEL FOR NEW PATIENT")
print(predicted_severity)

print("\n TREATMENT ADOPTED IN SIMILAR CASES")
for t in top_treatments:
    print("-", t)

print("\n ESTIMATED RECOVERY PERIOD")
print(estimated_recovery)


Enter new patient clinical terms:


 The patient presented with acute respiratory distress marked by severe dyspnea, tachypnea, oxygen desaturation, and signs of respiratory failure necessitating intensive respiratory support.



 MOST SIMILAR PATIENTS 
    patient_uid
0        P00000
6        P00006
149      P00149
132      P00132
9        P00009

 SHARED CLINICAL SYMPTOMS 
- respiratory distress
- fever
- cough

 INFERRED SEVERITY LEVEL FOR NEW PATIENT
High

 TREATMENT ADOPTED IN SIMILAR CASES
- rehabilitation
- physical therapy
- intubation

 ESTIMATED RECOVERY PERIOD
Not explicitly stated


The provided input has marked as the severity level of high. The treatment needs to be provided to the new patient based on the symptoms shared are rehabilitation, physical therapy and intubation. The recovery period is not mentioned for the existing patient.

Tested with sample case 3

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
# loading of the pre-processed data
df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
)
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
df = df.reset_index(drop=True)
df["patient_uid"] = [f"P{i:05d}" for i in range(len(df))]
# vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# key words indicating severity
severity_terms = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]

severity_idx = [
    i for i, term in enumerate(feature_names)
    if any(k in term for k in severity_terms)
]

df["severity_score"] = tfidf_vectors[:, severity_idx].sum(axis=1).A1

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(df[["severity_score"]])

cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)

severity_map = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_map)
# user input 
print("\nEnter new patient clinical terms:")
new_text = input().lower()
tfidf_input = vectorizer.transform([new_text])
# Top 5 similar patient
sim_scores = cosine_similarity(tfidf_input, tfidf_vectors)[0]
top_5_idx = sim_scores.argsort()[-5:][::-1]
top_5 = df.iloc[top_5_idx].copy()
# severity level indications
severity_weighted = {}
for i in top_5_idx:
    level = df.loc[i, "severity_level"]
    score = sim_scores[i]
    severity_weighted[level] = severity_weighted.get(level, 0) + score
predicted_severity = max(severity_weighted, key=severity_weighted.get)

# symptom extraction
SYMPTOM_TERMS = [
    "fever", "cough", "dry cough", "dyspnea",
    "shortness breath", "hypoxia", "fatigue",
    "chest pain", "oxygen desaturation",
    "tachypnea", "respiratory distress"
]

def extract_symptoms(text):
    found = []
    for term in SYMPTOM_TERMS:
        if term in text:
            found.append(term)
    return found

symptoms = []
for idx in top_5_idx:
    symptoms.extend(extract_symptoms(df.loc[idx, "note_preprocessed"]))

top_symptoms = [s for s, _ in Counter(symptoms).most_common(3)]
# treatment extraction
TREATMENT_TERMS = [
    "oxygen therapy", "mechanical ventilation", "ventilation",
    "intubation", "ecmo",
    "physiotherapy", "physical therapy",
    "breathing exercise", "rehabilitation",
    "corticosteroid", "steroid",
    "antibiotic", "antiviral",
    "dialysis", "supportive care"
]

def extract_treatments(text):
    return [t for t in TREATMENT_TERMS if t in text]

treatments = []
for idx in top_5_idx:
    treatments.extend(extract_treatments(df.loc[idx, "note_preprocessed"]))

top_treatments = [t for t, _ in Counter(treatments).most_common(3)]
# recovery time 
def extract_days(text):
    return [int(x) for x in re.findall(r'day\s*(\d+)', text)]

days = []
for idx in top_5_idx:
    days.extend(extract_days(df.loc[idx, "note_preprocessed"]))

estimated_recovery = (
    f"{int(np.median(days))} days"
    if days else "Not explicitly stated"
)
# final output
print("\n MOST SIMILAR PATIENTS ")
print(top_5[["patient_uid" ]])

print("\n SHARED CLINICAL SYMPTOMS ")
for s in top_symptoms:
    print("-", s)
    
print("\n INFERRED SEVERITY LEVEL FOR NEW PATIENT")
print(predicted_severity)

print("\n TREATMENT ADOPTED IN SIMILAR CASES")
for t in top_treatments:
    print("-", t)

print("\n ESTIMATED RECOVERY PERIOD")
print(estimated_recovery)


Enter new patient clinical terms:


 The patient showed mild respiratory symptoms including low-grade fever, cough, and shortness of breath on exertion, without persistent hypoxia.



 MOST SIMILAR PATIENTS 
    patient_uid
425      P00425
342      P00342
131      P00131
59       P00059
243      P00243

 SHARED CLINICAL SYMPTOMS 
- fever
- shortness breath
- cough

 INFERRED SEVERITY LEVEL FOR NEW PATIENT
Low

 TREATMENT ADOPTED IN SIMILAR CASES
- antibiotic
- rehabilitation
- intubation

 ESTIMATED RECOVERY PERIOD
Not explicitly stated


The provided input has marked as the severity level of low. The treatment needs to be provided for the new patient based on the existing with the shared symptoms are antibiotic, rehabilitation and intubation. The recovery time is not found on the existing patient.