Import necessary python packages

In [3]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import DataFrameLoader
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from typing import Optional
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
import functools
import operator


Set OPENAI API Key (saved in a .env file)

In [4]:

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

Read 2024 VAERS Data downloaded from [Vaccine Adverse Event Reporting System](https://vaers.hhs.gov/data/datasets.html)

In [5]:
df = pd.read_csv('VAERS/2024VAERSDATA.csv',encoding='windows-1252')
df.head()

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,2728969,01/01/2024,AK,14.0,,,M,,fluzone qiv hd administered to a minor patient...,,...,,,,USSA2023SA396019,2,12/30/2023,,,,
1,2728982,01/01/2024,NJ,,,,U,,administering vaccines subcutaneously to patie...,,...,,,,USGSKUS2023AMR179885,2,01/01/2024,,,,
2,2728984,01/01/2024,,,,,U,,extreme joint pain; Unable to walk; This non-s...,,...,,,,USGSKUS2023AMR180978,2,01/01/2024,,,,
3,2728992,01/01/2024,MO,,,,F,,was due to have 2nd when everything shut down ...,,...,,,,USGSKUS2023AMR181463,2,01/01/2024,,,,
4,2728993,01/01/2024,,,,,U,,Injection site hot; Hypersentivity reaction; C...,,...,Autoimmune disorder (Autoimmune disorder diagn...,,,USGSKUS2023AMR181703,2,01/01/2024,,,,


Now we use langchain DataFrameLoader to load the SYMPTOM_TEXT column

In [6]:
loader = DataFrameLoader(df,page_content_column="SYMPTOM_TEXT")

data = loader.load()

In [7]:
data[0].page_content

"fluzone qiv hd administered to a minor patient with no reported adverse event; patient was supposed to receive the Flumist Nasal Spray but she grabbed the FLUZONE QIV HD and inadvertently gave it to intranasally with no reported adverse event; patient was supposed to receive the Flumist Nasal Spray but she grabbed the FLUZONE QIV HD and inadvertently gave it to intranasally with no reported adverse event; Initial information received from Regulatory Authority on 18-Dec-2023 regarding an unsolicited valid  non-serious case received from a nurse.  This case involves a 14 years old male patient to whom influenza quadrival A-B high dose HV vaccine [Fluzone High-Dose Quadrivalent] was administered who was supposed to receive the Influenza Vaccine Live Reassort 3v (Flumist) nasal Spary but she grabbed the Fluzone QIV HD and inadvertently gave it to intranasally with no reported adverse event.  The patient's past medical history, medical treatment(s), vaccination(s) and family history were n

We define events and Information class using pydantic, later on this classes will be used to generate schemas that can be used by LLM

In [10]:

class Event(BaseModel):
    """Information about a patient narrative."""
    VaccineName: str = Field(description = "Name of Administered Vaccine if the name of vaccine is not available the value should ba NA")
    HodpitalAdmission: bool = Field(description = "This flag is True if patient was admitted into hospital due to Adverse Event otherwise false")
    Age: int = Field(description = "Age of the patien if available otherwise the value is NA")

Infromation class is a list of all events

In [11]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Event] = Field(description="List of info about patient narratives")

Now we create the chat model instance

In [12]:
model = ChatOpenAI(temperature=0)


convert_pydantic_to_openai_function will create the information extraction schema from the Information class. 

In [13]:
convert_pydantic_to_openai_function(Information)

  warn_deprecated(


{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Event': {'description': 'Information about a patient narrative.',
    'properties': {'VaccineName': {'description': 'Name of Administered Vaccine if the name of vaccine is not available the value should ba NA',
      'type': 'string'},
     'HodpitalAdmission': {'description': 'This flag is True if patient was admitted into hospital due to Adverse Event otherwise false',
      'type': 'boolean'},
     'Age': {'description': 'Age of the patien if available otherwise the value is NA',
      'type': 'integer'}},
    'required': ['VaccineName', 'HodpitalAdmission', 'Age'],
    'type': 'object'}},
  'properties': {'people': {'description': 'List of info about patient narratives',
    'items': {'description': 'Information about a patient narrative.',
     'properties': {'VaccineName': {'description': 'Name of Administered Vaccine if the name of vaccine is not available the value should ba NA',
     

Let us create an extraction model by binding the extraction function with chat model

In [14]:

extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

We now create the prompt that would be supplied to the chat model

In [16]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess rather provide NA value for non boolean fields. Extract partial info"),
    ("human", "{input}")
])
print(prompt)

input_variables=['input'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Extract the relevant information, if not explicitly provided do not guess rather provide NA value for non boolean fields. Extract partial info')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))]


Now we create the final chain using Langchain Expression Language

In [17]:

extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

We do a batch submit for first 20 narratives 

In [34]:
input_batch = []
for page in data[0:20]:
    input_batch.append({"input":page.page_content})



In [35]:
vaers=extraction_chain.batch(input_batch)

In [36]:
vaers

[[{'VaccineName': 'Fluzone QIV HD', 'HodpitalAdmission': False, 'Age': 14}],
 [{'VaccineName': 'NA', 'HodpitalAdmission': False, 'Age': 45}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': 'NA'}],
 [{'VaccineName': 'Herpes zoster (Shingrix)',
   'HodpitalAdmission': False,
   'Age': 54}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': 25}],
 [{'VaccineName': 'RSVPreF3 adjuvanted (Arexvy)',
   'HodpitalAdmission': False,
   'Age': None},
  {'VaccineName': 'Influenza vaccine',
   'HodpitalAdmission': False,
   'Age': None},
  {'VaccineName': 'Tozinameran (Pfizer BioNTech COVID-19 vaccine)',
   'HodpitalAdmission': False,
   'Age': None}],
 [{'VaccineName': 'Infanrix', 'HodpitalAdmission': False, 'Age': 11}],
 [{'VaccineName': 'Flu Seasonal QIV Quebec (FluLaval Quadrivalent 2023-2024 season)',
   'HodpitalAdmission': False,
   'Age': 75}],
 [{'VaccineName': 'DTPa (Reduced antigen) (Boostrix)',
   'Hodp

In [16]:

#this is reportedly the most efficient way to get one list out of list of lists by concat
vaers_list=functools.reduce(operator.iconcat, vaers, [])


In [18]:
vaers_df = pd.DataFrame.from_dict(vaers_list) 