# Visualising the similar patients based on cosine distance

In the single unified performed the pre-processed data to cosine similarity. Making like this unified code and storing the step by results would reduce the time for initialising the code from the begining all the time for this case.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# loading of pre-processed data 
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
# Ensure required column exists
assert "note_preprocessed" in df.columns, "Column 'note_preprocessed' not found"
# selecting first 600 patients alone 
df_600 = df.iloc[:600].copy()
# Handle patient IDs
if "patient_id" in df_600.columns:
    patient_ids = df_600["patient_id"].astype(str)
else:
    patient_ids = df_600.index.astype(str)
# TF-IDF vectorizer 
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    norm="l2"
)
tfidf_vectors = vectorizer.fit_transform(
    df_600["note_preprocessed"].astype(str)
)
print("TF-IDF shape:", tfidf_vectors.shape)  # (600, features)
# cosine similarity computation
cosine_sim_matrix = cosine_similarity(tfidf_vectors)
# Convert to DataFrame
cosine_similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    index=patient_ids,
    columns=patient_ids
)
# saving cosine similarity matrix
output_file =r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv"
cosine_similarity_df.to_csv(output_file)
print(f"Cosine similarity matrix saved as: {output_file}")
# displaying the results head
cosine_similarity_df.head()


TF-IDF shape: (600, 3000)
Cosine similarity matrix saved as: C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv


patient_id,0,1,2,3,4,5,6,7,8,9,...,624,625,626,627,628,629,630,631,633,634
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.295018,0.286272,0.233355,0.146855,0.341031,0.417966,0.212645,0.229525,0.290407,...,0.090365,0.038137,0.064574,0.063097,0.067862,0.054088,0.045444,0.044407,0.025074,0.05517
1,0.295018,1.0,0.391625,0.115043,0.168605,0.180826,0.209542,0.200359,0.166427,0.196865,...,0.056607,0.029917,0.044105,0.099691,0.045316,0.067767,0.061307,0.074283,0.039219,0.067189
2,0.286272,0.391625,1.0,0.260915,0.165014,0.191636,0.25566,0.262918,0.236332,0.240311,...,0.075397,0.03001,0.076855,0.059636,0.099281,0.058341,0.104525,0.055934,0.059065,0.061023
3,0.233355,0.115043,0.260915,1.0,0.147604,0.293505,0.340738,0.31058,0.28703,0.300308,...,0.080737,0.031779,0.069619,0.123005,0.101059,0.106779,0.061965,0.077166,0.073236,0.05902
4,0.146855,0.168605,0.165014,0.147604,1.0,0.149002,0.134768,0.15698,0.206732,0.131699,...,0.049843,0.020568,0.043537,0.034077,0.033196,0.035601,0.038146,0.034278,0.040669,0.074965


visualising the top 20 patients cosine similarity values in the tabulated format for the better understanding.

In [5]:
import pandas as pd
import numpy as np

# Load cosine similarity matrix
similarity_df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv",
    index_col=0
)
# Ensure index consistency
similarity_df.index = similarity_df.index.astype(str)
similarity_df.columns = similarity_df.columns.astype(str)
# Removing the self similarity
np.fill_diagonal(similarity_df.values, np.nan)
# Convert matrix to pairwise format
similarity_long = similarity_df.stack().reset_index()
similarity_long.columns = ["patient_id_1", "patient_id_2", "cosine_similarity"]
# Sort by similarity score 
top20_pairs = similarity_long.sort_values(
    by="cosine_similarity",
    ascending=False
).head(20)
# Display top 20 most similar pairs
top20_pairs


Unnamed: 0,patient_id_1,patient_id_2,cosine_similarity
260995,457,452,0.781737
258004,452,457,0.781737
258599,453,452,0.709339
258000,452,453,0.709339
44999,80,79,0.701907
44400,79,80,0.701907
256805,450,456,0.688043
260394,456,450,0.688043
260996,457,453,0.660064
258603,453,457,0.660064


From this code we have found the top 20 most similar patient pairs from the given dataset.

# Creating unique patient_ID

In [2]:
import pandas as pd
# loading cosine similarity matrix
similarity_df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv",
    index_col=0
)
# creating unique patient ID
num_patients = similarity_df.shape[0]
patient_uids = [f"P{i+1:04d}" for i in range(num_patients)]
# Assigning unique patient ID
similarity_df.index = patient_uids
similarity_df.columns = patient_uids
# display the results
similarity_df.head()


Unnamed: 0,P0001,P0002,P0003,P0004,P0005,P0006,P0007,P0008,P0009,P0010,...,P0591,P0592,P0593,P0594,P0595,P0596,P0597,P0598,P0599,P0600
P0001,1.0,0.295018,0.286272,0.233355,0.146855,0.341031,0.417966,0.212645,0.229525,0.290407,...,0.090365,0.038137,0.064574,0.063097,0.067862,0.054088,0.045444,0.044407,0.025074,0.05517
P0002,0.295018,1.0,0.391625,0.115043,0.168605,0.180826,0.209542,0.200359,0.166427,0.196865,...,0.056607,0.029917,0.044105,0.099691,0.045316,0.067767,0.061307,0.074283,0.039219,0.067189
P0003,0.286272,0.391625,1.0,0.260915,0.165014,0.191636,0.25566,0.262918,0.236332,0.240311,...,0.075397,0.03001,0.076855,0.059636,0.099281,0.058341,0.104525,0.055934,0.059065,0.061023
P0004,0.233355,0.115043,0.260915,1.0,0.147604,0.293505,0.340738,0.31058,0.28703,0.300308,...,0.080737,0.031779,0.069619,0.123005,0.101059,0.106779,0.061965,0.077166,0.073236,0.05902
P0005,0.146855,0.168605,0.165014,0.147604,1.0,0.149002,0.134768,0.15698,0.206732,0.131699,...,0.049843,0.020568,0.043537,0.034077,0.033196,0.035601,0.038146,0.034278,0.040669,0.074965


This unique patient ID helps in accurate identification of patients and easier comparisons based on the similarity levels.

# Ranking similar cases

In [9]:
import pandas as pd
results = []
# selecting top 5 patients
first_5_patients = similarity_df.index[:5]
for patient_id in first_5_patients:
    # getting similarity 
    scores = similarity_df.loc[patient_id].copy()
    # Remove self-similarity
    scores.loc[patient_id] = -1
    # Get top 5 similar patients
    top5 = scores.sort_values(ascending=False).head(5)
    # Store ranked results
    for rank, (similar_patient_id, sim_score) in enumerate(top5.items(), start=1):
        results.append({
            "source_patient_id": patient_id,
            "rank": rank,
            "similar_patient_id": similar_patient_id,
            "cosine_similarity": sim_score
        })
# Convert to DataFrame
top5_similarity_ranking_df = pd.DataFrame(results)
# Display result
top5_similarity_ranking_df

Unnamed: 0,source_patient_id,rank,similar_patient_id,cosine_similarity
0,P0001,1,P0007,0.417966
1,P0001,2,P0006,0.341031
2,P0001,3,P0133,0.331201
3,P0001,4,P0150,0.297609
4,P0001,5,P0002,0.295018
5,P0002,1,P0003,0.391625
6,P0002,2,P0001,0.295018
7,P0002,3,P0274,0.230639
8,P0002,4,P0029,0.224842
9,P0002,5,P0049,0.223943


This above given code ranks the top 5 similar cases for the first five patients. The cosine similarity score and their respective ranks are listed along with the table.

In [11]:
import numpy as np
import pandas as pd
# Copy to avoid the changes
sim_df = similarity_df.copy()
# Remove self similarity
np.fill_diagonal(sim_df.values, np.nan)
# Convert upper triangle only prevents duplicating
upper_triangle = sim_df.where(
    np.triu(np.ones(sim_df.shape), k=1).astype(bool)
)
# Convert to long format
similarity_pairs = (
    upper_triangle
    .stack()
    .reset_index()
)
similarity_pairs.columns = [
    "patient_id_1",
    "patient_id_2",
    "cosine_similarity"
]
# Sort and select the top 10 most similar pairs
top10_similar_pairs = similarity_pairs.sort_values(
    by="cosine_similarity",
    ascending=False
).head(10)
# Display result
top10_similar_pairs

Unnamed: 0,patient_id_1,patient_id_2,cosine_similarity
165339,P0431,P0436,0.781737
165335,P0431,P0432,0.709339
41625,P0075,P0076,0.701907
164999,P0429,P0435,0.688043
165507,P0432,P0436,0.660064
41100,P0074,P0076,0.653648
41099,P0074,P0075,0.646245
94623,P0188,P0190,0.631032
30915,P0055,P0056,0.628265
179490,P0580,P0581,0.622893


Ranking the top 10 similarity patient ID based on their respective symptoms.

# Testing with input

Testing with respiratory disease based input

In [7]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
# loading the existing data
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
# processed for first 600 patient data
df = df.iloc[:600].copy()
# creating a unique patient ID
df["patient_uid"] = [f"P{i+1:04d}" for i in range(len(df))]
patient_ids = df["patient_uid"].values
print("✅ Loaded 600 historical patient records")
# vectorisation for the existing classes
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    norm="l2"
)
historical_vectors = vectorizer.fit_transform(
    df["note_preprocessed"].astype(str)
)
print("✅ TF-IDF fitted on historical data")
print("Historical vector shape:", historical_vectors.shape)
# user input new patient case
user_input = input("\nEnter the new clinical case text:\n")
print("\n ORIGINAL INPUT\n")
print(user_input)
# pre-processing of input
lower_text = user_input.lower()
print("\n AFTER LOWERCASE \n")
print(lower_text)
# Tokenization (regex-based)
tokens = re.findall(r"[a-z]+", lower_text)
print("\n TOKENS \n")
print(tokens)
# Stopword removal
filtered_tokens = [
    token for token in tokens if token not in ENGLISH_STOP_WORDS
]
print("\n AFTER STOPWORD REMOVAL \n")
print(filtered_tokens)
# Reconstruct text
processed_input = " ".join(filtered_tokens)
print("\n FINAL PREPROCESSED INPUT \n")
print(processed_input)
# vectorising input case
input_vector = vectorizer.transform([processed_input])
print("\nInput vector shape:", input_vector.shape)
# computing cosine similarity 
similarity_scores = cosine_similarity(
    input_vector,
    historical_vectors
)[0]

similarity_series = pd.Series(
    similarity_scores,
    index=patient_ids
)
# Rank the top 5 patient IDs for the given input 
TOP_N = 5
top_5_similar_patients = similarity_series.sort_values(
    ascending=False
).head(TOP_N)
print("\n TOP 5 SIMILAR PATIENTS \n")
print(top_5_similar_patients)


✅ Loaded 600 historical patient records
✅ TF-IDF fitted on historical data
Historical vector shape: (600, 3000)



Enter the new clinical case text:
 The Male patient presented with fever, dry cough, and shortness of breath. Diagnosed with viral pneumonia and oxygen desaturation. Underwent respiratory physiotherapy and was discharged for rehabilitation.



 ORIGINAL INPUT

The Male patient presented with fever, dry cough, and shortness of breath. Diagnosed with viral pneumonia and oxygen desaturation. Underwent respiratory physiotherapy and was discharged for rehabilitation.

 AFTER LOWERCASE 

the male patient presented with fever, dry cough, and shortness of breath. diagnosed with viral pneumonia and oxygen desaturation. underwent respiratory physiotherapy and was discharged for rehabilitation.

 TOKENS 

['the', 'male', 'patient', 'presented', 'with', 'fever', 'dry', 'cough', 'and', 'shortness', 'of', 'breath', 'diagnosed', 'with', 'viral', 'pneumonia', 'and', 'oxygen', 'desaturation', 'underwent', 'respiratory', 'physiotherapy', 'and', 'was', 'discharged', 'for', 'rehabilitation']

 AFTER STOPWORD REMOVAL 

['male', 'patient', 'presented', 'fever', 'dry', 'cough', 'shortness', 'breath', 'diagnosed', 'viral', 'pneumonia', 'oxygen', 'desaturation', 'underwent', 'respiratory', 'physiotherapy', 'discharged', 'rehabilitation']

 FINAL PR

In the above case a new clinical input is provided. The provided input is compared with the existing cases. This enables the model to find the clinically similar past cases using vector based similarity. From this we can get the similar patients treatment and recovery period data.

Finding similar terms between the given input and other similar patient ID

In [8]:
# finding why the retrived cases are similar
def explain_similarity(input_text, historical_text, top_k_terms=8):
    """
    Explain similarity based on shared important terms
    """
    # Tokenize
    input_tokens = set(re.findall(r"[a-z]+", input_text.lower()))
    historical_tokens = set(re.findall(r"[a-z]+", historical_text.lower()))
    # Find common terms
    common_terms = input_tokens.intersection(historical_tokens)
    # Return top common terms
    return list(common_terms)[:top_k_terms]
# similarity words among top similar patients
print("\n SIMILARITY EXPLANATION \n")
for patient_id in top_5_similar_patients.index:
    patient_note = df.loc[df["patient_uid"] == patient_id, "note_preprocessed"].values[0]
    shared_terms = explain_similarity(processed_input, patient_note)
    print(f"Patient {patient_id}:")
    print(f"Similarity Score: {top_5_similar_patients[patient_id]:.4f}")
    print("Shared clinical terms:", ", ".join(shared_terms))
    print("-" * 50)



 SIMILARITY EXPLANATION 

Patient P0133:
Similarity Score: 0.2740
Shared clinical terms: patient, discharged, viral, pneumonia, fever, respiratory, diagnosed, oxygen
--------------------------------------------------
Patient P0006:
Similarity Score: 0.2264
Shared clinical terms: patient, discharged, dry, pneumonia, fever, respiratory, diagnosed, cough
--------------------------------------------------
Patient P0001:
Similarity Score: 0.2243
Shared clinical terms: patient, discharged, desaturation, dry, fever, respiratory, oxygen, cough
--------------------------------------------------
Patient P0132:
Similarity Score: 0.2190
Shared clinical terms: patient, discharged, pneumonia, fever, diagnosed, shortness, cough, male
--------------------------------------------------
Patient P0192:
Similarity Score: 0.1919
Shared clinical terms: patient, discharged, viral, presented, pneumonia, respiratory, oxygen, underwent
--------------------------------------------------


From these we can be able to state the provided sample input shares the more similar symptoms with the above given patient ID. The treatment provided to the new sample can be adopted from the existing patient who has more common symptoms.

Finding the most common treatments applied to the most similar patient

In [13]:
import numpy as np
# getting feature names
feature_names = np.array(vectorizer.get_feature_names_out())
# sorting most similar patient
most_similar_patient_id = top_5_similar_patients.index[0]
most_similar_score = top_5_similar_patients.iloc[0]
# Find index in dataset
patient_idx = df.index[df["patient_uid"] == most_similar_patient_id][0]
# extract top vectorfrom the patient
patient_vector = historical_vectors[patient_idx].toarray().flatten()

top_indices = patient_vector.argsort()[-6:][::-1]
top_terms = feature_names[top_indices]
# display the results
print(" MOST SIMILAR PATIENT ")
print("Patient ID:", most_similar_patient_id)
print("Cosine Similarity:", round(most_similar_score, 4))

print("\n DATA-DRIVEN TREATMENT / INTERVENTION PHRASES ")
for term in top_terms[:4]:
    print("-", term)


 MOST SIMILAR PATIENT 
Patient ID: P0133
Cosine Similarity: 0.274

 DATA-DRIVEN TREATMENT / INTERVENTION PHRASES 
- respiratory
- pjp
- rehabilitation
- ecmo
