# Input data

In [1]:
import pandas as pd
# Read the preprocessed clinical-notes CSV into a DataFrame
file_path = r"C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv"
df = pd.read_csv(file_path)
# Stop early if the expected text column is missing
if "note_preprocessed" not in df.columns:
    raise ValueError("Column 'note_preprocessed' not found in the CSV file.")
# Preview the first three preprocessed notes, each followed by a 60-dash divider
first_three_notes = df["note_preprocessed"].iloc[:3]
print("First Three Pre-Processed Notes:\n")
for idx, note in enumerate(first_three_notes, start=1):
    # One print call emits the header, the note text and the divider on separate lines
    print(f"Note {idx}:", note, "-" * 60, sep="\n")


First Three Pre-Processed Notes:

Note 1:
discharge summary patient year old male moderate acute respiratory distress syndrome covid hospital course patient admitted hospital symptom fever dry cough dyspnea physical therapy acute ward patient experienced coughing attack induced oxygen desaturation dyspnea change position deep breathing avoid rapid deterioration respiratory failure step step approach used position change breathing exercise adapted avoid prolonged coughing oxygen desaturation close monitoring patient managed perform strength walking exercise low level exercise progression low initially increased daily hospital discharge rehabilitation clinic day clinical outcome patient discharged day rehabilitation clinic making satisfactory progress symptom resolved follow patient receive follow care rehabilitation clinic regular monitoring progress rehabilitation exercise full recovery new symptom concern reported clinic immediately overall impression patient responded well treatment 

This is the pre-processed input file used for embedding generation. It consists of fully pre-processed and normalised data.

# Text to vector

Converting the pre-processed data to vectors using reusable code.

In [1]:
# Converting the pre-processed data to vectors
import numpy as np
import pandas as pd
from typing import List
from sentence_transformers import SentenceTransformer
# loading the embedding model
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """Instantiate the sentence-embedding model identified by ``model_name``.

    Args:
        model_name: Identifier passed unchanged to ``SentenceTransformer``.
            Defaults to ``"all-MiniLM-L6-v2"``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    embedding_model = SentenceTransformer(model_name)
    return embedding_model
# generating the embeddings
def generate_embeddings(
    texts: List[str],
    model,
    show_progress_bar: bool = True,
) -> np.ndarray:
    """Encode a list of text strings into embedding vectors.

    Args:
        texts: Non-empty list of strings to embed.
        model: Any object exposing ``encode(texts, convert_to_numpy=...,
            show_progress_bar=...)``, e.g. the value returned by
            ``load_embedding_model``.
        show_progress_bar: Forwarded to ``model.encode``. Defaults to ``True``,
            which matches the previously hard-coded behaviour.

    Returns:
        Whatever ``model.encode`` returns for ``texts`` when called with
        ``convert_to_numpy=True``.

    Raises:
        TypeError: If ``texts`` is not a list, or if any element is not a
            ``str`` (for example a NaN left over from a CSV column that was
            not cast to string).
        ValueError: If ``texts`` is empty.
    """
    if not isinstance(texts, list):
        raise TypeError("Input must be a list of strings.")
    if len(texts) == 0:
        raise ValueError("Text list is empty.")
    # Validate elements up front so bad rows fail with a clear message that
    # names the offending position, instead of an error from inside the model.
    for position, text in enumerate(texts):
        if not isinstance(text, str):
            raise TypeError(
                "Input must be a list of strings; "
                f"element {position} is {type(text).__name__}."
            )

    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=show_progress_bar,
    )
    return embeddings

if __name__ == "__main__":
    # Number of leading rows (patients) to embed. Kept in one place so the
    # slice and the status messages cannot drift apart.
    num_patients = 300
    # Get CSV path from user; strip surrounding whitespace and quotes, which
    # are common when a path is pasted (e.g. Windows "Copy as path").
    csv_path = input("Enter path to preprocessed CSV file:\n>> ").strip().strip("\"'")
    # Load dataset
    df = pd.read_csv(csv_path)
    if "note_preprocessed" not in df.columns:
        raise ValueError("Column 'note_preprocessed' not found in dataset.")
    # Select the first `num_patients` patients (fewer if the file is shorter)
    df_subset = df.head(num_patients)
    # Convert to list of strings. astype(str) guarantees string elements;
    # note that missing values become the literal text "nan".
    notes = df_subset["note_preprocessed"].astype(str).tolist()
    print(f"\nNumber of patients selected: {len(notes)}")
    # Load model
    model = load_embedding_model()
    # Generate embeddings; report the real count rather than a hard-coded one
    # (the message previously said 600 while only 300 rows were selected).
    print(f"\nGenerating embeddings for first {len(notes)} patients...")
    embeddings = generate_embeddings(notes, model)
    print("\nEmbedding generation completed.")
    print(f"Embedding Matrix Shape: {embeddings.shape}")
    # Display first 3 embedding vectors
    print("\nDisplaying first 3 embedding vectors:\n")

    for i in range(min(3, len(embeddings))):
        print(f"Patient {i+1} Embedding Vector:")
        print(embeddings[i])
        print(f"Vector Shape: {embeddings[i].shape}")


  from .autonotebook import tqdm as notebook_tqdm


Enter path to preprocessed CSV file:
>>  C:\Users\rajak\Downloads\AI Internship\clinical_notes_preprocessed_no_spellcheck.csv



Number of patients selected: 300


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 586.46it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Generating embeddings for first 600 patients...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.12it/s]


Embedding generation completed.
Embedding Matrix Shape: (300, 384)

Displaying first 3 embedding vectors:

Patient 1 Embedding Vector:
[-4.16854164e-03  4.55092341e-02 -1.61163844e-02 -1.49405552e-02
 -5.72580434e-02  6.43947944e-02  3.91317680e-02  2.90072206e-02
 -2.24398710e-02 -3.69700417e-02  2.08975151e-02 -1.04047395e-02
  1.82979535e-02  3.49009074e-02  4.13241573e-02 -3.91100235e-02
  3.36168706e-02 -5.90851251e-03 -3.28265727e-02 -1.18449600e-02
  2.17140168e-02  8.66404399e-02  5.44088008e-03  7.04913354e-03
 -7.02937022e-02 -6.36091735e-03 -5.53100109e-02 -4.76749148e-03
  1.02990851e-01 -3.71950865e-02  6.17636405e-02  3.25238779e-02
  9.16399360e-02  4.07861732e-02 -1.87928118e-02  6.62181675e-02
  1.64691340e-02  7.86282122e-02 -6.97186450e-03  2.06328463e-02
 -4.39944416e-02 -2.99381325e-04  2.34261416e-02 -6.53473437e-02
  1.46085257e-02  1.35365753e-02 -1.28772646e-01  3.11733373e-02
  5.33213951e-02  3.59467939e-02 -4.89928480e-03  2.13349354e-03
 -6.00546859e-02  3




Create reusable Python code for converting the preprocessed text data into vector (embedding) form.