# Visualising the similar patients based on cosine distance

In the single unified performed the pre-processed data to cosine similarity. Making like this unified code and storing the step by results would reduce the time for initialising the code from the begining all the time for this case.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# loading of pre-processed data 
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
# Ensure required column exists
assert "note_preprocessed" in df.columns, "Column 'note_preprocessed' not found"
# selecting first 600 patients alone 
df_600 = df.iloc[:600].copy()
# Handle patient IDs
if "patient_id" in df_600.columns:
    patient_ids = df_600["patient_id"].astype(str)
else:
    patient_ids = df_600.index.astype(str)
# TF-IDF vectorizer 
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    norm="l2"
)
tfidf_vectors = vectorizer.fit_transform(
    df_600["note_preprocessed"].astype(str)
)
print("TF-IDF shape:", tfidf_vectors.shape)  # (600, features)
# cosine similarity computation
cosine_sim_matrix = cosine_similarity(tfidf_vectors)
# Convert to DataFrame
cosine_similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    index=patient_ids,
    columns=patient_ids
)
# saving cosine similarity matrix
output_file =r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv"
cosine_similarity_df.to_csv(output_file)
print(f"Cosine similarity matrix saved as: {output_file}")
# displaying the results head
cosine_similarity_df.head()


TF-IDF shape: (600, 3000)
Cosine similarity matrix saved as: C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv


patient_id,0,1,2,3,4,5,6,7,8,9,...,624,625,626,627,628,629,630,631,633,634
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.295018,0.286272,0.233355,0.146855,0.341031,0.417966,0.212645,0.229525,0.290407,...,0.090365,0.038137,0.064574,0.063097,0.067862,0.054088,0.045444,0.044407,0.025074,0.05517
1,0.295018,1.0,0.391625,0.115043,0.168605,0.180826,0.209542,0.200359,0.166427,0.196865,...,0.056607,0.029917,0.044105,0.099691,0.045316,0.067767,0.061307,0.074283,0.039219,0.067189
2,0.286272,0.391625,1.0,0.260915,0.165014,0.191636,0.25566,0.262918,0.236332,0.240311,...,0.075397,0.03001,0.076855,0.059636,0.099281,0.058341,0.104525,0.055934,0.059065,0.061023
3,0.233355,0.115043,0.260915,1.0,0.147604,0.293505,0.340738,0.31058,0.28703,0.300308,...,0.080737,0.031779,0.069619,0.123005,0.101059,0.106779,0.061965,0.077166,0.073236,0.05902
4,0.146855,0.168605,0.165014,0.147604,1.0,0.149002,0.134768,0.15698,0.206732,0.131699,...,0.049843,0.020568,0.043537,0.034077,0.033196,0.035601,0.038146,0.034278,0.040669,0.074965


visualising the top 20 patients cosine similarity values in the tabulated format for the better understanding.

In [5]:
import pandas as pd
import numpy as np

# Load cosine similarity matrix
similarity_df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv",
    index_col=0
)
# Ensure index consistency
similarity_df.index = similarity_df.index.astype(str)
similarity_df.columns = similarity_df.columns.astype(str)
# Removing the self similarity
np.fill_diagonal(similarity_df.values, np.nan)
# Convert matrix to pairwise format
similarity_long = similarity_df.stack().reset_index()
similarity_long.columns = ["patient_id_1", "patient_id_2", "cosine_similarity"]
# Sort by similarity score 
top20_pairs = similarity_long.sort_values(
    by="cosine_similarity",
    ascending=False
).head(20)
# Display top 20 most similar pairs
top20_pairs


Unnamed: 0,patient_id_1,patient_id_2,cosine_similarity
260995,457,452,0.781737
258004,452,457,0.781737
258599,453,452,0.709339
258000,452,453,0.709339
44999,80,79,0.701907
44400,79,80,0.701907
256805,450,456,0.688043
260394,456,450,0.688043
260996,457,453,0.660064
258603,453,457,0.660064


From this code we have found the top 20 most similar patient pairs from the given dataset.

# Creating unique patient_ID

In [8]:
import pandas as pd
# loading cosine similarity matrix
similarity_df = pd.read_csv(
    r"C:\Users\rajak\Downloads\AI Internship\cosinesimilarity.csv",
    index_col=0
)
# creating unique patient ID
num_patients = similarity_df.shape[0]
patient_uids = [f"P{i+1:04d}" for i in range(num_patients)]
# Assigning unique patient ID
similarity_df.index = patient_uids
similarity_df.columns = patient_uids
# display the results
similarity_df.head()


Unnamed: 0,P0001,P0002,P0003,P0004,P0005,P0006,P0007,P0008,P0009,P0010,...,P0591,P0592,P0593,P0594,P0595,P0596,P0597,P0598,P0599,P0600
P0001,1.0,0.295018,0.286272,0.233355,0.146855,0.341031,0.417966,0.212645,0.229525,0.290407,...,0.090365,0.038137,0.064574,0.063097,0.067862,0.054088,0.045444,0.044407,0.025074,0.05517
P0002,0.295018,1.0,0.391625,0.115043,0.168605,0.180826,0.209542,0.200359,0.166427,0.196865,...,0.056607,0.029917,0.044105,0.099691,0.045316,0.067767,0.061307,0.074283,0.039219,0.067189
P0003,0.286272,0.391625,1.0,0.260915,0.165014,0.191636,0.25566,0.262918,0.236332,0.240311,...,0.075397,0.03001,0.076855,0.059636,0.099281,0.058341,0.104525,0.055934,0.059065,0.061023
P0004,0.233355,0.115043,0.260915,1.0,0.147604,0.293505,0.340738,0.31058,0.28703,0.300308,...,0.080737,0.031779,0.069619,0.123005,0.101059,0.106779,0.061965,0.077166,0.073236,0.05902
P0005,0.146855,0.168605,0.165014,0.147604,1.0,0.149002,0.134768,0.15698,0.206732,0.131699,...,0.049843,0.020568,0.043537,0.034077,0.033196,0.035601,0.038146,0.034278,0.040669,0.074965


This unique patient helps in accurate identification of patients and easier comparisons based on the similarity levels.