In [None]:
!pip install langchain-community
!pip install chromadb
!pip install transformers accelerate torch


In [None]:
import os
import pandas as pd
import re
from tabulate import tabulate
from tqdm import tqdm
from langchain_classic.chains import RetrievalQA
from langchain_classic.llms import HuggingFacePipeline
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_classic.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [None]:
van_feature_csv = ['admissions.csv.gz', 'labevents.csv.gz']


def preprocessingCSV(csv_files):
    dfs = []
    for file in csv_files:
        compression = 'gzip' if file.endswith('.gz') else None
        df = pd.read_csv(file, compression=compression)
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    lang_docs = []

    for row in tqdm(combined_df.itertuples(index=False), total=combined_df.shape[0], desc='CSV → LangChain Docs'):
        row_dict = row._asdict()
        subject_id = str(row_dict.get('subject_id', ''))
        hadm_id = str(row_dict.get('hadm_id', ''))

        lab_info = {}
        for k in row_dict:
            if k.startswith('lab') or k in ['value', 'valueuom', 'flag', 'valuenum', 'itemid']:
                lab_info[k] = row_dict[k]

        content = f"Patient {subject_id} had admission {hadm_id}. Data: {lab_info}"

        metadata = {
            'subject_id': subject_id,
            'hadm_id': hadm_id
        }

        lang_docs.append(Document(page_content=content, metadata=metadata))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=200,
        add_start_index=False
    )

    chunked_docs = []
    for doc in lang_docs:
        chunks = text_splitter.split_documents([doc])
        chunked_docs.extend(chunks)

    return chunked_docs



In [None]:
print(preprocessingCSV(van_feature_csv)[:20])

CSV → LangChain Docs: 100%|██████████| 108002/108002 [00:06<00:00, 15774.38it/s]


[Document(metadata={'subject_id': '10004235', 'hadm_id': '24181354.0'}, page_content="Patient 10004235 had admission 24181354.0. Data: {'labevent_id': nan, 'itemid': nan, 'value': nan, 'valuenum': nan, 'valueuom': nan, 'flag': nan}"), Document(metadata={'subject_id': '10009628', 'hadm_id': '25926192.0'}, page_content="Patient 10009628 had admission 25926192.0. Data: {'labevent_id': nan, 'itemid': nan, 'value': nan, 'valuenum': nan, 'valueuom': nan, 'flag': nan}"), Document(metadata={'subject_id': '10018081', 'hadm_id': '23983182.0'}, page_content="Patient 10018081 had admission 23983182.0. Data: {'labevent_id': nan, 'itemid': nan, 'value': nan, 'valuenum': nan, 'valueuom': nan, 'flag': nan}"), Document(metadata={'subject_id': '10006053', 'hadm_id': '22942076.0'}, page_content="Patient 10006053 had admission 22942076.0. Data: {'labevent_id': nan, 'itemid': nan, 'value': nan, 'valuenum': nan, 'valueuom': nan, 'flag': nan}"), Document(metadata={'subject_id': '10031404', 'hadm_id': '216062

In [None]:
def convertToEmbeddings(chunked_documents,embeddings,batch_size=32): ## EMBEDDINGS

    content_txt = [doc.page_content for doc in chunked_documents]
    embeddings_list = []

    for i in tqdm(range(0, len(content_txt), batch_size), desc="Generating embeddings..."):
        batch = content_txt[i:i+batch_size]
        batch_emb = embeddings.embed_documents(batch)
        embeddings_list.extend(batch_emb)

    print("Dim:", len(embeddings_list[0]))
    print("Num Embeddings:", len(embeddings_list))
    return embeddings_list



In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
embs = convertToEmbeddings(preprocessingCSV(van_feature_csv), embeddings)
print("✅ Done generating embeddings")
print("Vectors:", len(embs))
print("Dim:", len(embs[0]))


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

CSV → LangChain Docs: 100%|██████████| 108002/108002 [00:04<00:00, 23645.63it/s]
Generating embeddings...: 100%|██████████| 3376/3376 [01:50<00:00, 30.45it/s]

Dim: 384
Num Embeddings: 108002
✅ Done generating embeddings
Vectors: 108002
Dim: 384





In [None]:
def store_in_chroma_vectorDB(chunked_docs_lst, embeddings_lst, batch_size=5000):
  vectorDB = Chroma(
      collection_name='nlp_DB',
      embedding_function=embeddings,
      persist_directory="./chroma_db"
  )
  print(vectorDB._collection.count())

  print("Storing embeddings into Chroma DB...")
  for i in tqdm(range(0, len(chunked_docs_lst), batch_size), desc="Adding documents to Chroma", unit="batch"):
    end_idx = min(i + batch_size, len(chunked_docs_lst))
    docs = chunked_docs_lst[i:end_idx]
    embs = embeddings_lst[i:end_idx]
    vectorDB.add_documents(docs)



  vectorDB.persist()
  print("Count:", vectorDB._collection.count())
  print('Did connect and store embeddings into chroma DB')
  return vectorDB

In [None]:
vector_em = store_in_chroma_vectorDB(preprocessingCSV(van_feature_csv), embs) # VECTOR DB

CSV → LangChain Docs: 100%|██████████| 108002/108002 [00:02<00:00, 38278.04it/s]
  vectorDB = Chroma(


0
Storing embeddings into Chroma DB...


Adding documents to Chroma: 100%|██████████| 22/22 [03:12<00:00,  8.75s/batch]

Count: 108002
Did connect and store embeddings into chroma DB



  vectorDB.persist()


In [None]:
print(vector_em.get(ids=['0'], include=['embeddings', 'documents', 'metadatas']))

{'ids': [], 'embeddings': array([], dtype=float64), 'documents': [], 'uris': None, 'included': ['embeddings', 'documents', 'metadatas'], 'data': None, 'metadatas': []}


In [None]:
chroma_retreiver = vector_em.as_retriever(search_type='similarity', search_kwargs={"k": 5}) # Retrieval

In [None]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct" # LLM
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', dtype='auto', trust_remote_code=True)

llm_pipeline = pipeline(
    task = "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True

)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
rag_llm_pipeline = HuggingFacePipeline(pipeline=llm_pipeline)

  rag_llm_pipeline = HuggingFacePipeline(pipeline=llm_pipeline)


In [None]:
rag_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are a clinical assistant. Use the context below to answer the question.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "If the lab events are provided in the context, summarize them clearly as 'Lab Event ID', 'Value', 'Units', and 'Flag'. "
        "If not enough information is available, say 'I don't know.'\n\n"
        "Answer:"
    )
)

retrievalQA = RetrievalQA.from_chain_type(
    llm=rag_llm_pipeline,
    chain_type="stuff",
    chain_type_kwargs={"prompt": rag_template},
    retriever=chroma_retreiver,
    return_source_documents=True
)

retrievalQA


RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are a clinical assistant. Use the context below to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}\n\nIf the lab events are provided in the context, summarize them clearly as 'Lab Event ID', 'Value', 'Units', and 'Flag'. If not enough information is available, say 'I don't know.'\n\nAnswer:"), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7d0c05fff110>), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorst

In [None]:
docs = chroma_retreiver.invoke("Tell me the lab events for subject_id 10014354?")
print(docs)



[Document(metadata={'hadm_id': '28829452.0', 'subject_id': '10021312'}, page_content="Patient 10021312 had admission 28829452.0. Data: {'labevent_id': 250866.0, 'itemid': 51274.0, 'value': '14.2', 'valuenum': 14.2, 'valueuom': 'sec', 'flag': 'abnormal'}"), Document(metadata={'subject_id': '10016810', 'hadm_id': '20973395.0'}, page_content="Patient 10016810 had admission 20973395.0. Data: {'labevent_id': 205321.0, 'itemid': 51274.0, 'value': '14.9', 'valuenum': 14.9, 'valueuom': 'sec', 'flag': 'abnormal'}"), Document(metadata={'subject_id': '10021312', 'hadm_id': '28829452.0'}, page_content="Patient 10021312 had admission 28829452.0. Data: {'labevent_id': 250721.0, 'itemid': 51274.0, 'value': '13.4', 'valuenum': 13.4, 'valueuom': 'sec', 'flag': 'abnormal'}"), Document(metadata={'hadm_id': '22342963.0', 'subject_id': '10022017'}, page_content="Patient 10022017 had admission 22342963.0. Data: {'labevent_id': 264106.0, 'itemid': 51274.0, 'value': '14.0', 'valuenum': 14.0, 'valueuom': 'sec'

In [None]:
def query_rag(user_query, k_docs=5):
    match = re.search(r'\b\d{8}\b', user_query)
    subject_id = match.group(0) if match else None

    if subject_id:
        print(f"Detected subject_id: {subject_id}")
        docs = chroma_retreiver.vectorstore.similarity_search(
            user_query,
            k=k_docs,
            filter={"subject_id": str(subject_id)}
        )
        result = retrievalQA.combine_documents_chain.invoke({
            "input_documents": docs,
            "question": user_query
        })
        result_text = result
    else:
        result = retrievalQA({"query": user_query})
        result_text = result.get("result", "")
        docs = result.get("source_documents", [])

    print("\nAnswer:\n")
    print(result_text)

    if docs:
      data = []
      for doc in docs:
          meta = doc.metadata
          content = doc.page_content

          import ast
          lab_data_str = content.split("Data: ")[1]
          lab_data = ast.literal_eval(lab_data_str)

          data.append({
              "Subject ID": meta.get("subject_id", ""),
              "Admission ID": str(int(float(meta.get("hadm_id", "")))),
              "Lab Event ID": str(int(float(lab_data.get("labevent_id", "")))),
              "Value": lab_data.get("valuenum", lab_data.get("value", "")),
              "Units": lab_data.get("valueuom", ""),
              "Flag": lab_data.get("flag", "")
          })
      if data:
        print('\nLab Events:\n')
        df_rag_result = pd.DataFrame(data)
        print(df_rag_result.to_markdown(index=False))
        print("\n----------DataFrame----------\n")
        return df_rag_result


In [None]:
df_labs = pd.read_csv(van_feature_csv[1])
df_labs[df_labs['labevent_id'] == 85323]



Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
18172,85323,10006053,22942076.0,3380860,50868,,2111-11-14 06:04:00,2111-11-14 09:32:00,32,32.0,mEq/L,8.0,20.0,abnormal,STAT,


In [None]:
query_df = query_rag("Tell me the lab events for subject_id 10006053")
query_df

Detected subject_id: 10006053

Answer:

{'input_documents': [Document(metadata={'subject_id': '10006053', 'hadm_id': '22942076.0'}, page_content="Patient 10006053 had admission 22942076.0. Data: {'labevent_id': 85323.0, 'itemid': 50868.0, 'value': '32', 'valuenum': 32.0, 'valueuom': 'mEq/L', 'flag': 'abnormal'}"), Document(metadata={'subject_id': '10006053', 'hadm_id': '22942076.0'}, page_content="Patient 10006053 had admission 22942076.0. Data: {'labevent_id': 85410.0, 'itemid': 51275.0, 'value': '38.3', 'valuenum': 38.3, 'valueuom': 'sec', 'flag': 'abnormal'}"), Document(metadata={'subject_id': '10006053', 'hadm_id': '22942076.0'}, page_content="Patient 10006053 had admission 22942076.0. Data: {'labevent_id': 85291.0, 'itemid': 50804.0, 'value': '15', 'valuenum': 15.0, 'valueuom': 'mEq/L', 'flag': 'abnormal'}"), Document(metadata={'hadm_id': '22942076.0', 'subject_id': '10006053'}, page_content="Patient 10006053 had admission 22942076.0. Data: {'labevent_id': 85515.0, 'itemid': 51222

Unnamed: 0,Subject ID,Admission ID,Lab Event ID,Value,Units,Flag
0,10006053,22942076,85323,32.0,mEq/L,abnormal
1,10006053,22942076,85410,38.3,sec,abnormal
2,10006053,22942076,85291,15.0,mEq/L,abnormal
3,10006053,22942076,85515,8.1,g/dL,abnormal
4,10006053,22942076,85440,26.4,sec,abnormal


In [None]:
print(query_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Subject ID    5 non-null      object 
 1   Admission ID  5 non-null      object 
 2   Lab Event ID  5 non-null      float64
 3   Value         5 non-null      float64
 4   Units         5 non-null      object 
 5   Flag          5 non-null      object 
dtypes: float64(2), object(4)
memory usage: 372.0+ bytes
None
