# Grouping of data

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# loading of pre-processed data
df = pd.read_csv(r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv")
df = df.iloc[:600].copy()
assert "note_preprocessed" in df.columns
# creating a unique patient ID
df = df.reset_index(drop=True)
df["patient_uid"] = ["P{:05d}".format(i) for i in range(len(df))]
# TF-IDF vectorisation
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2
)
tfidf_vectors = vectorizer.fit_transform(df["note_preprocessed"])
feature_names = np.array(vectorizer.get_feature_names_out())
# marked the severity indicators 
severity_indicators = [
    "icu", "intubation", "ventilator", "ards",
    "respiratory failure", "oxygen", "desaturation",
    "dyspnea", "shortness breath", "critical"
]
severity_indices = [
    i for i, term in enumerate(feature_names)
    if any(key in term for key in severity_indicators)
]
# Compute severity score per patient
severity_scores = tfidf_vectors[:, severity_indices].sum(axis=1).A1
df["severity_score"] = severity_scores
# clustering severity into three groups
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["severity_cluster"] = kmeans.fit_predict(
    df[["severity_score"]]
)
# mapping clusters into labels 
cluster_means = (
    df.groupby("severity_cluster")["severity_score"]
    .mean()
    .sort_values()
)
severity_mapping = {
    cluster_means.index[0]: "Low",
    cluster_means.index[1]: "Medium",
    cluster_means.index[2]: "High"
}
df["severity_level"] = df["severity_cluster"].map(severity_mapping)
# final output
print("\n SEVERITY DISTRIBUTION \n")
print(df["severity_level"].value_counts())

print("\n SAMPLE PATIENT SEVERITY \n")
print(df[["patient_uid", "severity_score", "severity_level"]].head(10))



 SEVERITY DISTRIBUTION 

severity_level
Low       489
Medium     82
High       29
Name: count, dtype: int64

 SAMPLE PATIENT SEVERITY 

  patient_uid  severity_score severity_level
0      P00000        0.571965           High
1      P00001        0.532564           High
2      P00002        0.983853           High
3      P00003        0.460651           High
4      P00004        0.504159           High
5      P00005        0.000000            Low
6      P00006        0.425007           High
7      P00007        0.470549           High
8      P00008        0.370591           High
9      P00009        0.546997           High


from this we have the severity level of the each subject based on the clinical terms. Then based on the severity level the patients have been grouped into high severity, medium severity and low severity. This helps to predict the severity level of the new from the existing case.