# Input data

In [1]:
import pandas as pd
# Read the preprocessed clinical-notes CSV into a DataFrame
file_path = r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
df = pd.read_csv(file_path)
# Stop early when the expected text column is missing
if "note_preprocessed" not in df.columns:
    raise ValueError("Column 'note_preprocessed' not found in the CSV file.")
# Preview the first three preprocessed notes, each followed by a divider line
first_three_notes = df["note_preprocessed"].iloc[:3]
print("First Three Pre-Processed Notes:\n")
divider = "-" * 60
for idx, note in enumerate(first_three_notes, start=1):
    print(f"Note {idx}:", note, divider, sep="\n")


First Three Pre-Processed Notes:

Note 1:
discharge summary patient year old male moderate acute respiratory distress syndrome covid hospital course patient admitted hospital symptom fever dry cough dyspnea physical therapy acute ward patient experienced coughing attack induced oxygen desaturation dyspnea change position deep breathing avoid rapid deterioration respiratory failure step step approach used position change breathing exercise adapted avoid prolonged coughing oxygen desaturation close monitoring patient managed perform strength walking exercise low level exercise progression low initially increased daily hospital discharge rehabilitation clinic day clinical outcome patient discharged day rehabilitation clinic making satisfactory progress symptom resolved follow patient receive follow care rehabilitation clinic regular monitoring progress rehabilitation exercise full recovery new symptom concern reported clinic immediately overall impression patient responded well treatment 

This is the pre-processed input file used for embedding generation. It consists of fully pre-processed and normalised data.

# Text to vector

Converting the pre-processed data (the first 300 notes) to embedding vectors using reusable code.

In [1]:
# converting the pre-processed data to the vector
import numpy as np
import pandas as pd
from typing import List
from sentence_transformers import SentenceTransformer
# loading the embedding model
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """Build the sentence-embedding model.

    Args:
        model_name: Model identifier handed unchanged to ``SentenceTransformer``.

    Returns:
        The object constructed by ``SentenceTransformer(model_name)``.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model
# generating the embeddings
def generate_embeddings(texts: List[str], model) -> np.ndarray:
    """Encode a batch of texts with the given embedding model.

    Args:
        texts: Non-empty list of strings to embed.
        model: Object exposing ``encode(texts, convert_to_numpy=..., show_progress_bar=...)``.

    Returns:
        The value returned by ``model.encode`` when called with
        ``convert_to_numpy=True`` and ``show_progress_bar=True``.

    Raises:
        TypeError: If ``texts`` is not a list, or if any element is not a string.
        ValueError: If ``texts`` is an empty list.
    """
    if not isinstance(texts, list):
        raise TypeError("Input must be a list of strings.")
    if not texts:
        raise ValueError("Text list is empty.")

    # Reject non-string items here (e.g. NaN floats read from a pandas column)
    # so the caller gets a clear message instead of a failure inside the encoder.
    bad_index = next(
        (pos for pos, item in enumerate(texts) if not isinstance(item, str)),
        None,
    )
    if bad_index is not None:
        raise TypeError(
            "Input must be a list of strings; "
            f"item {bad_index} is {type(texts[bad_index]).__name__}."
        )

    return model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

if __name__ == "__main__":
    # Number of leading rows (patients) to embed
    NUM_PATIENTS = 300

    # Ask for the CSV location; drop stray whitespace or quotes from a pasted path
    csv_path = input("Enter path to preprocessed CSV file:\n>> ").strip().strip("\"'")
    # Load dataset
    df = pd.read_csv(csv_path)
    if "note_preprocessed" not in df.columns:
        raise ValueError("Column 'note_preprocessed' not found in dataset.")
    # Keep only the first NUM_PATIENTS rows
    df_subset = df.head(NUM_PATIENTS)
    # astype(str) turns missing values into the literal text "nan".
    # NOTE(review): confirm the column has no nulls, otherwise "nan" is embedded as a note.
    notes = df_subset["note_preprocessed"].astype(str).tolist()
    print(f"\nNumber of patients selected: {len(notes)}")
    # Load model
    model = load_embedding_model()
    # Report the actual number of notes so the message always matches the selection
    print(f"\nGenerating embeddings for first {len(notes)} patients...")
    embeddings = generate_embeddings(notes, model)
    print("\nEmbedding generation completed.")
    print(f"Embedding Matrix Shape: {embeddings.shape}")
    # Show at most three vectors rather than dumping the whole matrix
    print("\nDisplaying first 3 embedding vectors:\n")

    for patient_number, vector in enumerate(embeddings[:3], start=1):
        print(f"Patient {patient_number} Embedding Vector:")
        print(vector)
        print(f"Vector Shape: {vector.shape}")


  from .autonotebook import tqdm as notebook_tqdm


Enter path to preprocessed CSV file:
>>  C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv



Number of patients selected: 300


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 586.46it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Generating embeddings for first 600 patients...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.12it/s]


Embedding generation completed.
Embedding Matrix Shape: (300, 384)

Displaying first 3 embedding vectors:

Patient 1 Embedding Vector:
[-4.16854164e-03  4.55092341e-02 -1.61163844e-02 -1.49405552e-02
 -5.72580434e-02  6.43947944e-02  3.91317680e-02  2.90072206e-02
 -2.24398710e-02 -3.69700417e-02  2.08975151e-02 -1.04047395e-02
  1.82979535e-02  3.49009074e-02  4.13241573e-02 -3.91100235e-02
  3.36168706e-02 -5.90851251e-03 -3.28265727e-02 -1.18449600e-02
  2.17140168e-02  8.66404399e-02  5.44088008e-03  7.04913354e-03
 -7.02937022e-02 -6.36091735e-03 -5.53100109e-02 -4.76749148e-03
  1.02990851e-01 -3.71950865e-02  6.17636405e-02  3.25238779e-02
  9.16399360e-02  4.07861732e-02 -1.87928118e-02  6.62181675e-02
  1.64691340e-02  7.86282122e-02 -6.97186450e-03  2.06328463e-02
 -4.39944416e-02 -2.99381325e-04  2.34261416e-02 -6.53473437e-02
  1.46085257e-02  1.35365753e-02 -1.28772646e-01  3.11733373e-02
  5.33213951e-02  3.59467939e-02 -4.89928480e-03  2.13349354e-03
 -6.00546859e-02  3




This cell creates reusable Python code that converts the pre-processed text into vector (embedding) form.

# Text to vector sample cases

Testing with the sample case 1

In [2]:
# embedding module tested with sample case
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Clinical modifier words that must survive stopword removal
# NOTE(review): the sample case 2 and 3 cells define this set without "with" -
# confirm which variant is intended.
CLINICAL_IMPORTANT_WORDS = {
    "no",
    "not",
    "without",
    "with",
    "before",
    "after",
    "during",
    "since",
    "until",
    "prior",
    "post",
}

# English stopword list minus the clinical modifiers above
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS - CLINICAL_IMPORTANT_WORDS
# preprocessing of the data
def preprocess_text(text: str) -> str:
    """Normalise free text and drop stopwords.

    The text is lower-cased, every character other than a-z or whitespace is
    replaced by a space, whitespace runs are collapsed, and tokens present in
    ``CUSTOM_STOPWORDS`` are removed.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If ``text`` is empty or whitespace only.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")
    if not text.strip():
        raise ValueError("Input text cannot be empty.")

    # Lower-case, then blank out anything that is not a-z or whitespace
    # (digits and punctuation are removed by this step)
    cleaned = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Collapse whitespace runs and trim the ends
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Keep only tokens that are not in the customised stopword list
    kept_tokens = filter(lambda token: token not in CUSTOM_STOPWORDS, cleaned.split())
    return " ".join(kept_tokens)
# embedding module
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """Build the sentence-embedding model.

    Args:
        model_name: Model identifier handed unchanged to ``SentenceTransformer``.

    Returns:
        The object constructed by ``SentenceTransformer(model_name)``.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model
# generate embedding
def generate_embedding(text: str, model) -> np.ndarray:
    """Embed one text and return its vector.

    Args:
        text: The string to embed.
        model: Object exposing ``encode(list_of_texts, convert_to_numpy=...)``.

    Returns:
        The first entry of the batch that ``model.encode`` returns for ``[text]``.
    """
    # The text is wrapped in a one-item list, so the result is indexed at 0
    single_item_batch = [text]
    batch_vectors = model.encode(single_item_batch, convert_to_numpy=True)
    return batch_vectors[0]
# main execution
if __name__ == "__main__":

    # Stage 1: read the raw clinical text and echo it back
    user_input = input("Enter clinical text:\n>> ")
    print("\n Stage 1: Raw Input", user_input, sep="\n")

    # Stage 2: clean the text and show the result
    preprocessed_text = preprocess_text(user_input)
    print("\n Stage 2: Preprocessed Text", preprocessed_text, sep="\n")

    # Stage 3: load the model, embed the cleaned text, show vector and shape
    model = load_embedding_model()
    embedding_vector = generate_embedding(preprocessed_text, model)
    print("\n Stage 3: Final Embedding Vector ", embedding_vector, sep="\n")
    print(f"\nVector Shape: {embedding_vector.shape}")


Enter clinical text:
>>  Patient presents with persistent fever and dry cough for three days associated with mild dyspnea on exertion. Oxygen saturation slightly reduced but stable on room air. No prior history of chronic respiratory disease.



 Stage 1: Raw Input
Patient presents with persistent fever and dry cough for three days associated with mild dyspnea on exertion. Oxygen saturation slightly reduced but stable on room air. No prior history of chronic respiratory disease.

 Stage 2: Preprocessed Text
patient presents with persistent fever dry cough days associated with mild dyspnea exertion oxygen saturation slightly reduced stable room air no prior history chronic respiratory disease


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 264.57it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



 Stage 3: Final Embedding Vector 
[ 2.35406086e-02  2.99810860e-02  1.43062482e-02  9.49351937e-02
  4.30108309e-02 -6.85133561e-02  1.48214102e-02  3.03430241e-02
 -5.52415326e-02 -3.35704163e-02 -9.26392619e-03  5.31079173e-02
  5.73559701e-02  8.79758894e-02  3.46794911e-02  1.39573207e-02
 -5.83753586e-02 -3.18350270e-02 -3.05436775e-02  2.45348141e-02
 -2.73823049e-02  8.00394192e-02  7.27900211e-03 -2.96493229e-02
  1.13823712e-02 -1.55746611e-03  2.93601844e-02 -1.76130682e-02
  7.73883536e-02  2.61193123e-02 -7.77160516e-03  5.00847260e-03
 -1.28498729e-02  8.71417951e-03  5.35488762e-02 -1.02503961e-02
  4.23370712e-02  6.59679100e-02 -1.07338980e-01  9.20958538e-03
  1.58796250e-03  2.70311292e-02 -1.28201721e-03  1.18794106e-02
 -8.11511129e-02  5.06604053e-02 -1.09639198e-01  1.19216621e-01
  8.66734516e-03  3.28699425e-02 -2.73187924e-02  8.83784797e-03
 -3.71766612e-02  6.48464784e-02 -4.65396121e-02  3.74594964e-02
 -4.14177217e-02 -2.52656769e-02 -4.89167459e-02 -7.630

This reusable code pre-processes the given sample input and then converts the pre-processed text to vector format for further processing.

Testing with the sample case 2

In [4]:
# embedding module tested with sample case
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Clinical modifier words that must survive stopword removal
# NOTE(review): the sample case 1 cell also keeps "with" in this set -
# confirm which variant is intended.
CLINICAL_IMPORTANT_WORDS = {
    "no",
    "not",
    "without",
    "before",
    "after",
    "during",
    "since",
    "until",
    "prior",
    "post",
}

# English stopword list minus the clinical modifiers above
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS - CLINICAL_IMPORTANT_WORDS
# preprocessing of the data
def preprocess_text(text: str) -> str:
    """Normalise free text and drop stopwords.

    The text is lower-cased, every character other than a-z or whitespace is
    replaced by a space, whitespace runs are collapsed, and tokens present in
    ``CUSTOM_STOPWORDS`` are removed.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If ``text`` is empty or whitespace only.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")
    if not text.strip():
        raise ValueError("Input text cannot be empty.")

    # Lower-case, then blank out anything that is not a-z or whitespace
    # (digits and punctuation are removed by this step)
    cleaned = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Collapse whitespace runs and trim the ends
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Keep only tokens that are not in the customised stopword list
    kept_tokens = filter(lambda token: token not in CUSTOM_STOPWORDS, cleaned.split())
    return " ".join(kept_tokens)
# embedding module
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """Build the sentence-embedding model.

    Args:
        model_name: Model identifier handed unchanged to ``SentenceTransformer``.

    Returns:
        The object constructed by ``SentenceTransformer(model_name)``.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model
# generate embedding
def generate_embedding(text: str, model) -> np.ndarray:
    """Embed one text and return its vector.

    Args:
        text: The string to embed.
        model: Object exposing ``encode(list_of_texts, convert_to_numpy=...)``.

    Returns:
        The first entry of the batch that ``model.encode`` returns for ``[text]``.
    """
    # The text is wrapped in a one-item list, so the result is indexed at 0
    single_item_batch = [text]
    batch_vectors = model.encode(single_item_batch, convert_to_numpy=True)
    return batch_vectors[0]
# main execution
if __name__ == "__main__":

    # Stage 1: read the raw clinical text and echo it back
    user_input = input("Enter clinical text:\n>> ")
    print("\n Stage 1: Raw Input", user_input, sep="\n")

    # Stage 2: clean the text and show the result
    preprocessed_text = preprocess_text(user_input)
    print("\n Stage 2: Preprocessed Text", preprocessed_text, sep="\n")

    # Stage 3: load the model, embed the cleaned text, show vector and shape
    model = load_embedding_model()
    embedding_vector = generate_embedding(preprocessed_text, model)
    print("\n Stage 3: Final Embedding Vector ", embedding_vector, sep="\n")
    print(f"\nVector Shape: {embedding_vector.shape}")


Enter clinical text:
>>  The patient reports severe abdominal pain with nausea and vomiting since last night. No history of similar episodes before. Physical examination suggests possible acute gastritis requiring further evaluation.



 Stage 1: Raw Input
The patient reports severe abdominal pain with nausea and vomiting since last night. No history of similar episodes before. Physical examination suggests possible acute gastritis requiring further evaluation.

 Stage 2: Preprocessed Text
patient reports severe abdominal pain nausea vomiting since night no history similar episodes before physical examination suggests possible acute gastritis requiring evaluation


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 246.64it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



 Stage 3: Final Embedding Vector 
[ 1.11375153e-01 -1.60383116e-02 -3.32994051e-02  6.52174056e-02
  2.75049619e-02 -6.34794906e-02  3.22295725e-02  3.48705202e-02
  1.72248911e-02 -7.96386898e-02 -4.86117825e-02 -1.72321044e-03
 -1.12197483e-02  6.45298287e-02 -4.21318747e-02 -8.11157301e-02
  5.12350760e-02 -1.11746714e-01  3.63144800e-02  3.35723050e-02
 -8.90935361e-02  5.68694621e-03  1.43328626e-02  6.38177060e-03
  2.71473490e-02  2.32532565e-02  6.29463568e-02  2.17642845e-03
  7.41925240e-02  1.08204847e-02  1.47466594e-02  1.16136596e-02
  2.03897636e-02  3.15638781e-02  6.23575784e-02 -2.07195804e-02
  2.69148790e-04  4.16753776e-02  1.87664013e-02 -1.39209889e-02
  3.05603631e-03 -2.89836712e-02  7.76842888e-03 -1.85814686e-02
 -5.83120100e-02  1.99338775e-02 -3.66925038e-02  3.01455744e-02
  5.55362366e-02  5.21220341e-02 -2.91344635e-02 -4.04964127e-02
  5.43578900e-02 -4.84597199e-02  9.29622538e-03 -2.75302120e-02
  3.58666666e-02 -8.08277726e-02 -2.68223807e-02 -6.212

Converted the sample input 2 to the vector format using the reusable embedding module.

Testing with the sample case 3

In [8]:
# embedding module tested with sample case
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Clinical modifier words that must survive stopword removal
# NOTE(review): the sample case 1 cell also keeps "with" in this set -
# confirm which variant is intended.
CLINICAL_IMPORTANT_WORDS = {
    "no",
    "not",
    "without",
    "before",
    "after",
    "during",
    "since",
    "until",
    "prior",
    "post",
}

# English stopword list minus the clinical modifiers above
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS - CLINICAL_IMPORTANT_WORDS
# preprocessing of the data
def preprocess_text(text: str) -> str:
    """Normalise free text and drop stopwords.

    The text is lower-cased, every character other than a-z or whitespace is
    replaced by a space, whitespace runs are collapsed, and tokens present in
    ``CUSTOM_STOPWORDS`` are removed.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If ``text`` is empty or whitespace only.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")
    if not text.strip():
        raise ValueError("Input text cannot be empty.")

    # Lower-case, then blank out anything that is not a-z or whitespace
    # (digits and punctuation are removed by this step)
    cleaned = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Collapse whitespace runs and trim the ends
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Keep only tokens that are not in the customised stopword list
    kept_tokens = filter(lambda token: token not in CUSTOM_STOPWORDS, cleaned.split())
    return " ".join(kept_tokens)
# embedding module
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """Build the sentence-embedding model.

    Args:
        model_name: Model identifier handed unchanged to ``SentenceTransformer``.

    Returns:
        The object constructed by ``SentenceTransformer(model_name)``.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model
# generate embedding
def generate_embedding(text: str, model) -> np.ndarray:
    """Embed one text and return its vector.

    Args:
        text: The string to embed.
        model: Object exposing ``encode(list_of_texts, convert_to_numpy=...)``.

    Returns:
        The first entry of the batch that ``model.encode`` returns for ``[text]``.
    """
    # The text is wrapped in a one-item list, so the result is indexed at 0
    single_item_batch = [text]
    batch_vectors = model.encode(single_item_batch, convert_to_numpy=True)
    return batch_vectors[0]
# main execution
if __name__ == "__main__":

    # Stage 1: read the raw clinical text and echo it back
    user_input = input("Enter clinical text:\n>> ")
    print("\n Stage 1: Raw Input", user_input, sep="\n")

    # Stage 2: clean the text and show the result
    preprocessed_text = preprocess_text(user_input)
    print("\n Stage 2: Preprocessed Text", preprocessed_text, sep="\n")

    # Stage 3: load the model, embed the cleaned text, show vector and shape
    model = load_embedding_model()
    embedding_vector = generate_embedding(preprocessed_text, model)
    print("\n Stage 3: Final Embedding Vector ", embedding_vector, sep="\n")
    print(f"\nVector Shape: {embedding_vector.shape}")


Enter clinical text:
>>  Elderly male admitted with worsening shortness of breath and bilateral lower limb swelling. History of hypertension and cardiac disease. Requires supplemental oxygen and close cardiovascular monitoring.



 Stage 1: Raw Input
Elderly male admitted with worsening shortness of breath and bilateral lower limb swelling. History of hypertension and cardiac disease. Requires supplemental oxygen and close cardiovascular monitoring.

 Stage 2: Preprocessed Text
elderly male admitted worsening shortness breath bilateral lower limb swelling history hypertension cardiac disease requires supplemental oxygen close cardiovascular monitoring


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 276.35it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



 Stage 3: Final Embedding Vector 
[-9.31834243e-03  1.14988618e-01  3.22798081e-02  7.90367499e-02
  5.46633005e-02 -5.45721129e-02 -4.59567718e-02  2.72279233e-02
 -8.78242031e-02 -7.74210244e-02  1.06846420e-02  1.84786953e-02
  1.28574148e-02  4.51404005e-02  3.62702087e-02 -3.40889357e-02
  6.82067405e-03  2.61165071e-02 -5.28505258e-03  4.36312109e-02
  5.91573678e-02  1.03293717e-01 -6.10554516e-02 -1.22511219e-02
 -3.73211913e-02 -7.95715488e-03 -4.80167530e-02 -2.25494355e-02
  9.11034793e-02  8.40187073e-02  5.62361181e-02  7.58312270e-02
  1.05095565e-01  4.54920866e-02 -3.17314975e-02 -2.31926609e-02
  3.78940403e-02  1.66803841e-02 -1.04235955e-01  1.63156365e-03
 -1.52146518e-02 -5.13833053e-02 -1.27072664e-04  4.30919789e-02
 -6.99032396e-02  8.52872431e-02 -3.86306681e-02 -6.18399307e-02
  1.48615486e-03  4.53891978e-02 -1.03155486e-02 -5.68525344e-02
  1.31862648e-02  5.47442306e-03 -1.67999994e-02 -1.49589097e-02
 -7.13739917e-02 -8.59741569e-02 -4.37314771e-02 -3.249

From this embedding module code we can see that it effectively pre-processes the provided text and properly converts the provided input to vector format.