Import necessary python packages

In [143]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import DataFrameLoader
from langchain_openai import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from typing import Optional
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
import functools
import operator
import warnings
warnings.filterwarnings("ignore")



Set OPENAI API Key (saved in a .env file)

In [2]:

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

Read 2024 VAERS Data downloaded from [Vaccine Adverse Event Reporting System](https://vaers.hhs.gov/data/datasets.html)

In [3]:
df = pd.read_csv('VAERS/2024VAERSDATA.csv',encoding='windows-1252')
df.head()

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,2728969,01/01/2024,AK,14.0,,,M,,fluzone qiv hd administered to a minor patient...,,...,,,,USSA2023SA396019,2,12/30/2023,,,,
1,2728982,01/01/2024,NJ,,,,U,,administering vaccines subcutaneously to patie...,,...,,,,USGSKUS2023AMR179885,2,01/01/2024,,,,
2,2728984,01/01/2024,,,,,U,,extreme joint pain; Unable to walk; This non-s...,,...,,,,USGSKUS2023AMR180978,2,01/01/2024,,,,
3,2728992,01/01/2024,MO,,,,F,,was due to have 2nd when everything shut down ...,,...,,,,USGSKUS2023AMR181463,2,01/01/2024,,,,
4,2728993,01/01/2024,,,,,U,,Injection site hot; Hypersentivity reaction; C...,,...,Autoimmune disorder (Autoimmune disorder diagn...,,,USGSKUS2023AMR181703,2,01/01/2024,,,,


Now we use langchain DataFrameLoader to load the SYMPTOM_TEXT column

In [4]:
loader = DataFrameLoader(df,page_content_column="SYMPTOM_TEXT")

data = loader.load()

Let's print the first five symptomp narratives

In [5]:
import textwrap

for page in data[0:5]:
    print(textwrap.fill(page.page_content))
    print('\n\n\n')

fluzone qiv hd administered to a minor patient with no reported
adverse event; patient was supposed to receive the Flumist Nasal Spray
but she grabbed the FLUZONE QIV HD and inadvertently gave it to
intranasally with no reported adverse event; patient was supposed to
receive the Flumist Nasal Spray but she grabbed the FLUZONE QIV HD and
inadvertently gave it to intranasally with no reported adverse event;
Initial information received from Regulatory Authority on 18-Dec-2023
regarding an unsolicited valid  non-serious case received from a
nurse.  This case involves a 14 years old male patient to whom
influenza quadrival A-B high dose HV vaccine [Fluzone High-Dose
Quadrivalent] was administered who was supposed to receive the
Influenza Vaccine Live Reassort 3v (Flumist) nasal Spary but she
grabbed the Fluzone QIV HD and inadvertently gave it to intranasally
with no reported adverse event.  The patient's past medical history,
medical treatment(s), vaccination(s) and family history were no

We define events and Information class using pydantic, later on this classes will be used to generate function schemas that can be used by LLM

In [153]:

class Event(BaseModel):
    """Information about a patient narrative."""
    VaccineName: str = Field(description = "Name of Administered Vaccine if the name of vaccine is not available the value should ba NA")
    HospitalAdmission: bool = Field(description = "This flag is True if patient was admitted into hospital due to Adverse Event otherwise false")
    Age: int = Field(description = "Age of the patient if available otherwise the value is NA")

Infromation class is a list of all events

In [154]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Event] = Field(description="List of info about patient narratives")

Now we create the chat model instance

In [146]:
model = ChatOpenAI(temperature=0)


convert_pydantic_to_openai_function will create the information extraction schema from the Information class. 

In [155]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Event': {'description': 'Information about a patient narrative.',
    'properties': {'VaccineName': {'description': 'Name of Administered Vaccine if the name of vaccine is not available the value should ba NA',
      'type': 'string'},
     'HospitalAdmission': {'description': 'This flag is True if patient was admitted into hospital due to Adverse Event otherwise false',
      'type': 'boolean'},
     'Age': {'description': 'Age of the patient if available otherwise the value is NA',
      'type': 'integer'}},
    'required': ['VaccineName', 'HospitalAdmission', 'Age'],
    'type': 'object'}},
  'properties': {'people': {'description': 'List of info about patient narratives',
    'items': {'description': 'Information about a patient narrative.',
     'properties': {'VaccineName': {'description': 'Name of Administered Vaccine if the name of vaccine is not available the value should ba NA',
    

Function calling capabilities in OpenAI allows the LLM to parse function arguments from provided text and send it back as response. We leverage this capability to extract the desired information as funciton arguments.

Let us create an extraction model by binding the extraction function with chat model

In [156]:

extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

We now create the prompt that would be supplied to the chat model

In [157]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess rather provide NA value for non boolean fields. Extract partial info"),
    ("human", "{input}")
])
print(prompt)

input_variables=['input'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Extract the relevant information, if not explicitly provided do not guess rather provide NA value for non boolean fields. Extract partial info')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))]


Now we create the final chain using Langchain Expression Language

In [96]:

extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

We do a batch submit for first 20 narratives 

In [97]:
input_batch = []
for page in data[0:20]:
    input_batch.append({"input":page.page_content})



In [98]:
vaers=extraction_chain.batch(input_batch)

In [99]:
vaers

[[{'VaccineName': 'Fluzone QIV HD', 'HodpitalAdmission': False, 'Age': 14}],
 [{'VaccineName': 'NA', 'HodpitalAdmission': False, 'Age': 45}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': 'NA'}],
 [{'VaccineName': 'Herpes zoster (Shingrix)',
   'HodpitalAdmission': False,
   'Age': 54}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': None}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': None},
  {'VaccineName': 'Influenza vaccine',
   'HodpitalAdmission': False,
   'Age': None},
  {'VaccineName': 'Tozinameran (Pfizer BioNTech COVID-19 vaccine)',
   'HodpitalAdmission': False,
   'Age': None}],
 [{'VaccineName': 'Infanrix', 'HodpitalAdmission': False, 'Age': 11}],
 [{'VaccineName': 'Flu Seasonal QIV Quebec (FluLaval Quadrivalent 2023-2024 season)',
   'HodpitalAdmission': False,
   'Age': 75}],
 [{'VaccineName': 'DTPa (Reduced antigen) (Boostrix)',
   'Ho

In [215]:

#this is reportedly the most efficient way to get one list out of list of lists by concat
vaers_list=functools.reduce(operator.iconcat, vaers, [])


In [216]:
vaers_df = pd.DataFrame.from_dict(vaers_list) 
print(vaers_df)

                                          VaccineName  HodpitalAdmission   Age
0                                      Fluzone QIV HD              False    14
1                                                  NA              False    45
2                        RSVPreF3 adjuvanted (Arexvy)              False    NA
3                            Herpes zoster (Shingrix)              False    54
4                        RSVPreF3 adjuvanted (Arexvy)              False  None
5                        RSVPreF3 adjuvanted (Arexvy)              False  None
6                                   Influenza vaccine              False  None
7      Tozinameran (Pfizer BioNTech COVID-19 vaccine)              False  None
8                                            Infanrix              False    11
9   Flu Seasonal QIV Quebec (FluLaval Quadrivalent...              False    75
10                  DTPa (Reduced antigen) (Boostrix)              False    NA
11                                             Arexv

## Merging extraction with retrieval 

The approach used above is useful when we want to extract information from all documents. However, this could be time consuming and expensive if document colllection is large. In addition, in most real world use cases often we are interested in understanding only specific scenarios and extract data from those narratives.

This is where RAG approach can help us narrowing down the number of documents we supply to the LLM.

To implement RAG we will use OpenAI Embedding capabilities with a local chromadb

## Parent Document Retriever Implementation

In [93]:
## use this code to delete an existing colleciton!
#collection = persistent_client.get_or_create_collection(name="VAERS_PC")
#persistent_client.delete_collection(name="VAERS_PC") 


9551

In [94]:
import chromadb
from langchain_openai import OpenAIEmbeddings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage import LocalFileStore
from langchain.retrievers import ParentDocumentRetriever
embeddings =OpenAIEmbeddingFunction(model_name="text-embedding-ada-002")
persistent_client = chromadb.PersistentClient(path="Chroma")
collection = persistent_client.get_or_create_collection(name="VAERS_PC",embedding_function = embeddings)
from langchain_community.vectorstores import Chroma
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="VAERS_PC",
    embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
)
# split it into chunks
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
fs = LocalFileStore("Chroma/VAERS_STORE")
store = create_kv_docstore(fs)
retriever= ParentDocumentRetriever(
    vectorstore=langchain_chroma,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    
)

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

for data_batch in batch(data,50):
    retriever.add_documents(data_batch, ids=None)


### Set maximum number of parent documents to return from search

In [95]:
retriever.search_kwargs={"k":10}

In [97]:
retriever.get_relevant_documents('Booster')

[Document(page_content='Treatment of COVID-19; Treatment of COVID-19; This is a spontaneous report received from a Consumer or other non HCP.  A 40-year-old male patient received BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FORMULA)), as dose 1, single (Batch/Lot number: unknown) at the age of 40 years for covid-19 immunisation. The patient\'s relevant medical history was not reported. There were no concomitant medications. Vaccination history included: Covid-19 vaccine (Dose 1; Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 2; Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 3 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 4 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 5 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 6 (booster); Manufacturer: Unknown), for COVID-19 immunisation. The following information was reported: VA

In [98]:
## get parent document
print(retriever.get_relevant_documents('Booster')[0].page_content)
print(retriever.get_relevant_documents('Booster')[0].metadata['VAERS_ID'])

Treatment of COVID-19; Treatment of COVID-19; This is a spontaneous report received from a Consumer or other non HCP.  A 40-year-old male patient received BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FORMULA)), as dose 1, single (Batch/Lot number: unknown) at the age of 40 years for covid-19 immunisation. The patient's relevant medical history was not reported. There were no concomitant medications. Vaccination history included: Covid-19 vaccine (Dose 1; Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 2; Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 3 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 4 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 5 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 6 (booster); Manufacturer: Unknown), for COVID-19 immunisation. The following information was reported: VACCINATION FAILURE (medica

In [103]:

print("There are", langchain_chroma._collection.count(), " chunks in the child document collection")

There are 44181  chunks in the child document collection


### We can also search the child document (smaller chunks)

In [104]:
langchain_chroma.similarity_search('Booster',k=10)

[Document(page_content='(booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 4 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 5 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 6 (booster); Manufacturer: Unknown), for COVID-19 immunisation. The following information was reported: VACCINATION FAILURE (medically', metadata={'AGE_YRS': 40.0, 'FORM_VERS': 2, 'ONSET_DATE': '01/01/2024', 'RECOVD': 'U', 'RECVDATE': '02/03/2024', 'SEX': 'M', 'SPLTTYPE': 'USPFIZER INC202400031411', 'TODAYS_DATE': '02/02/2024', 'VAERS_ID': 2743526, 'V_ADMINBY': 'UNK', 'doc_id': '90406d85-e9f5-4d1a-915d-e37a605dd7a8'}),
 Document(page_content='5 (BOOSTER); MANUFACTURER UNKNOWN), for COVID-19 immunization; Covid-19 vaccine (DOSE 4 (BOOSTER); MANUFACTURER UNKNOWN), for COVID-19 immunization; Covid-19 vaccine (DOSE 3 (BOOSTER); MANUFACTURER UNKNOWN), for COVID-19 immunization; Covid-19 vaccine (DOSE 2; MANUFACTU

In [105]:
#get chunk
print(langchain_chroma.similarity_search('Booster')[0].page_content)
print(langchain_chroma.similarity_search('Booster')[0].metadata['VAERS_ID'])

(booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 4 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 5 (booster); Manufacturer: Unknown), for COVID-19 immunisation; Covid-19 vaccine (Dose 6 (booster); Manufacturer: Unknown), for COVID-19 immunisation. The following information was reported: VACCINATION FAILURE (medically
2743526


We can always query the ChromaDB collection

In [126]:
print(collection)
query="Covid Booster shot"

query_vector=embeddings(query)
print(query)

name='VAERS_PC' id=UUID('71333657-c9ca-4844-83f8-04b6112d69f3') metadata=None tenant='default_tenant' database='default_database'
Covid Booster shot


In [135]:

res = collection.query(
            query_embeddings=query_vector,
            n_results=10,
            include=['distances','embeddings', 'documents', 'metadatas'],
            
        )

res

{'ids': [['c7399ec0-90dd-4ba8-897b-90a4cf109d2c',
   '6be72fbd-6c9f-49aa-b2c0-1def9088ad92',
   '51ec89ee-6b35-442f-8901-dfb0d165d65a',
   '0ee6636b-e98e-4fa7-bf9c-7b38748aa835',
   'defa4c0f-5a08-4469-972c-eb73737b19e1',
   '6d407763-62a0-4874-8aa7-7ffb63d356c5',
   '7692253b-08de-449a-b8be-89ed419f3919',
   'b6a307f4-e38e-4b39-9c97-c481487678f6',
   'f848a691-215f-4e73-bc84-3649622baea7',
   'defe888e-e6a0-4f83-b3f9-ef20c4205d27'],
  ['cf0e31c2-e060-43bc-a237-4bd1046778d0',
   'd06127a7-1298-4f47-8f04-b7ea176f3796',
   'ab9806fe-0a5b-4fca-8422-8f5436c4871a',
   '3bca30c4-2723-4243-89bd-a8ec1ff488a6',
   'e1a8f3e4-3713-40bb-a969-67a777b70d0b',
   'c7399ec0-90dd-4ba8-897b-90a4cf109d2c',
   '6be72fbd-6c9f-49aa-b2c0-1def9088ad92',
   '51ec89ee-6b35-442f-8901-dfb0d165d65a',
   'defe888e-e6a0-4f83-b3f9-ef20c4205d27',
   '0ee6636b-e98e-4fa7-bf9c-7b38748aa835'],
  ['236d3932-cf5f-4f6f-9b12-97069aeec2c7',
   '881a2177-077c-41a9-9e82-b0c1bd358415',
   '0e30551e-1219-45a9-ab11-970af04f86d6',
  

In [193]:
retriever.search_kwargs={"k":1000}
context_data = retriever.get_relevant_documents('Booster Shot')

vaers_ids=[]
for dc in context_data:
    vaers_ids.append(dc.metadata['VAERS_ID'])


In [207]:
vaers_subset = df[df['VAERS_ID'].isin( vaers_ids)]
vaers_subset.head()


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
169,2729253,01/02/2024,,37.0,37.0,,F,,Updated COVID-19 vaccine (2023-2024) indicated...,,...,,,,,2,01/02/2024,,,,
217,2729351,01/02/2024,FL,68.0,,,F,,COVID 19 Treatment; COVID 19 Treatment; This i...,,...,,Medical History/Concurrent Conditions: Blood p...,,USPFIZER INC202300455986,2,01/02/2024,,,,
221,2729355,01/02/2024,,40.0,,,F,,COVID 19; COVID 19; Dose 2/Dose 1:Pfizer;Dose ...,,...,,Medical History/Concurrent Conditions: Hypothy...,,USPFIZER INC202300456294,2,12/29/2023,,,,
222,2729356,01/02/2024,TX,28.0,,,M,,COVID 19; COVID 19; This is a spontaneous repo...,,...,,,,USPFIZER INC202300456559,2,12/29/2023,,,,
225,2729359,01/02/2024,NH,72.0,,,M,,I have had eight of the Pfizer COVID shots; I ...,,...,,Medical History/Concurrent Conditions: Blood p...,,USPFIZER INC202300456936,2,01/01/2024,,,,


In [208]:
vaers_subset.shape

(642, 35)

In [221]:
input_batch = []
for row in vaers_subset['SYMPTOM_TEXT']:
    input_batch.append({"input":row})


extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")
vaers_sub=extraction_chain.batch(input_batch[0:20])

In [225]:
vaers_sub_list=functools.reduce(operator.iconcat, vaers_sub, [])
vaers_sub_df = pd.DataFrame.from_dict(vaers_sub_list) 
print(vaers_sub_df)

                                          VaccineName  HospitalAdmission   Age
0                Updated COVID-19 vaccine (2023-2024)              False    37
1   BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FOR...              False    68
2                                            BNT162b2              False    42
3                                          elasomeran              False    40
4                                            BNT162b2              False    29
5   BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FOR...              False    72
6                                            BNT162b2              False    64
7   BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FOR...              False    62
8                                                  NA              False    NA
9   BNT162b2 omi xbb.1.5 (COMIRNATY (2023-2024 FOR...              False    74
10                   BNT162b2 omi xbb.1.5 (COMIRNATY)              False    49
11  BNT162b2, BNT162b2 omi ba.4-5 (BNT162B2, BNT16..

Same concept can be applied through a chain. However, we may run into isssues with maximum token limits.

In [226]:
from langchain_core.runnables import (RunnableParallel,RunnablePassthrough)
chunk_retriever = langchain_chroma.as_retriever(search_kwargs={"k" : 10})
retrival=RunnableParallel({"input": chunk_retriever, "query":RunnablePassthrough()})


In [228]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess rather provide NA value for non boolean fields. Extract partial info"),
    ("human", "{input}")
])
## retrival with chunk retrievar
retrival_extraction_chain = retrival | prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [229]:
retrieved_data = retrival_extraction_chain.invoke('Booster Shot')


In [230]:
print(retrieved_data)


[{'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': 74}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': None}, {'VaccineName': 'Bnt162b2', 'HospitalAdmission': False, 'Age': None}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': None}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': 65}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': 65}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': None}, {'VaccineName': 'Covid-19 vaccine', 'HospitalAdmission': False, 'Age': 65}]


In [231]:
retrieved_df = pd.DataFrame.from_dict(retrieved_data) 
print(retrieved_df)

        VaccineName  HospitalAdmission   Age
0  Covid-19 vaccine              False  74.0
1  Covid-19 vaccine              False   NaN
2          Bnt162b2              False   NaN
3  Covid-19 vaccine              False   NaN
4  Covid-19 vaccine              False  65.0
5  Covid-19 vaccine              False  65.0
6  Covid-19 vaccine              False   NaN
7  Covid-19 vaccine              False  65.0


Note:  We  limited the maximum number of returned documents (10) to avoid maximum token limit exceed error. 