# Load the Data

In [20]:
# import the necessary libraries
import pandas as pd
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings(action='ignore')

In [21]:
df = pd.read_csv(r'C:\Users\spoor\Downloads\cleaned_vaers.csv')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328255 entries, 0 to 328254
Data columns (total 35 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   VAERS_ID         328255 non-null  int64  
 1   STATE            328255 non-null  object 
 2   AGE_YRS          328255 non-null  float64
 3   SEX              328255 non-null  int64  
 4   SYMPTOM_TEXT     328255 non-null  object 
 5   DIED             328255 non-null  int64  
 6   L_THREAT         328255 non-null  int64  
 7   HOSPITAL         328255 non-null  int64  
 8   HOSPDAYS         328255 non-null  float64
 9   DISABLE          328255 non-null  int64  
 10  RECOVD           328255 non-null  int64  
 11  VAX_DATE         328255 non-null  object 
 12  ONSET_DATE       328255 non-null  object 
 13  NUMDAYS          328255 non-null  int64  
 14  V_ADMINBY        328255 non-null  object 
 15  OTHER_MEDS       213171 non-null  object 
 16  CUR_ILL          141840 non-null  obje

## Defining target variable - SERIOUS 

In [24]:
# Define the function to check serious criteria
def serious_criteria(row):

    if any(row[col] == 1 for col in ['DIED', 'L_THREAT', 'HOSPITAL', 'DISABLE', 'BIRTH_DEFECT']):
        return 1
    else:
        return 0

## Check with the medDRA important medical event terms list

In [36]:
# List of fields/columns interested
fields = ['MedDRA Code', 'PT Name', 'SOC Name']

file_path = r"C:\Users\spoor\Downloads\MedDRA_27.1.xlsx"
# Read the Excel file starting from row 12
medDRA = pd.read_excel(file_path, usecols=fields, skiprows=11)
medDRA

Unnamed: 0,MedDRA Code,PT Name,SOC Name
0,10091289,Arrhythmia induced cardiomyopathy,Cardiac disorders
1,10090743,Ewart's sign,Cardiac disorders
2,10090970,Hypothyroid cardiomyopathy,Cardiac disorders
3,10090793,Pericardial perforation,Cardiac disorders
4,10091259,Agnathia,"Congenital, familial and genetic disorders"
...,...,...,...
7615,10047193,Vena cava embolism,Vascular disorders
7616,10047195,Vena cava thrombosis,Vascular disorders
7617,10067030,Venous thrombosis in pregnancy,Vascular disorders
7618,10064602,Venous thrombosis neonatal,Vascular disorders


In [38]:
list_of_names = medDRA['PT Name'].to_list()
list_of_names = [i.lower() for i in list_of_names]
print('List of Names: ', list_of_names)
print('Type of listOfNames: ', type(list_of_names))

Type of listOfNames:  <class 'list'>


# PPV

In [40]:
ppv_df = df[df['VAX_TYPE'].str.contains('PPV')]
ppv_df = ppv_df.drop('VAX_TYPE', axis=1)

In [42]:
# Apply the function to create the 'serious' column
ppv_df['SERIOUS'] = ppv_df.apply(serious_criteria, axis=1)

In [44]:
ppv_df['SERIOUS'].value_counts()

SERIOUS
0    16339
Name: count, dtype: int64

In [46]:
%%time 
# Create a function that returns 1 if any symptom matches with MedDRA terms, 0 otherwise
def has_meddra_match(row):
    for col in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']:
        if pd.notna(row[col]) and row[col].lower() in list_of_names:
            return 1
    return 0

# Create new column 'HAS_MEDDRA_MATCH'
ppv_df['HAS_MEDDRA_MATCH'] = ppv_df.apply(has_meddra_match, axis=1)

# You can verify the results
print(f"Number of records with MedDRA matches: {ppv_df['HAS_MEDDRA_MATCH'].sum()}")
print(f"Number of records without MedDRA matches: {(ppv_df['HAS_MEDDRA_MATCH'] == 0).sum()}")

Number of records with MedDRA matches: 731
Number of records without MedDRA matches: 15608
CPU times: total: 15.8 s
Wall time: 16.5 s


In [47]:
ppv_df['SERIOUS'] = ppv_df['SERIOUS'] + ppv_df['HAS_MEDDRA_MATCH']
ppv_df['SERIOUS'] = ppv_df['SERIOUS'].apply(lambda x: 0 if x == 0 else 1)
#ppv_df = ppv_df.drop(columns=['HAS_MEDDRA_MATCH'])

In [48]:
ppv_df.head(2)

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,L_THREAT,HOSPITAL,HOSPDAYS,DISABLE,...,VAX_NAME,ORDER_y,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,ORDER,SERIOUS,HAS_MEDDRA_MATCH
41,573481,WI,42.0,1,THIS SPONTANEOUS REPORT WAS RECEIVED FROM A PH...,0,0,0,0.0,0,...,PNEUMO (PNEUMOVAX),1,AUTOPSY,DEATH,MULTI-ORGAN FAILURE,POLYMERASE CHAIN REACTION,SEPSIS,1,1,1
42,573481,WI,42.0,1,THIS SPONTANEOUS REPORT WAS RECEIVED FROM A PH...,0,0,0,0.0,0,...,PNEUMO (PNEUMOVAX),1,VARICELLA POST VACCINE,VARICELLA VIRUS TEST POSITIVE,,,,1,0,0


In [54]:
ppv_df.to_csv(r'C:\Users\spoor\Downloads\ppv.csv', index=False)

# VARZOS

In [56]:
varzos_df = df[df['VAX_TYPE'].str.contains('VARZOS')]
varzos_df = varzos_df.drop('VAX_TYPE', axis=1)

In [58]:
# Apply the function to create the 'serious' column
varzos_df['SERIOUS'] = varzos_df.apply(serious_criteria, axis=1)
varzos_df['SERIOUS'].value_counts()

SERIOUS
0    53120
Name: count, dtype: int64

In [60]:
%%time 
# Create a function that returns 1 if any symptom matches with MedDRA terms, 0 otherwise
def has_meddra_match(row):
    for col in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']:
        if pd.notna(row[col]) and row[col].lower() in list_of_names:
            return 1
    return 0
    
# Create new column 'HAS_MEDDRA_MATCH'
varzos_df['HAS_MEDDRA_MATCH'] = varzos_df.apply(has_meddra_match, axis=1)

# You can verify the results
print(f"Number of records with MedDRA matches: {varzos_df['HAS_MEDDRA_MATCH'].sum()}")
print(f"Number of records without MedDRA matches: {(varzos_df['HAS_MEDDRA_MATCH'] == 0).sum()}")

Number of records with MedDRA matches: 3257
Number of records without MedDRA matches: 49863
CPU times: total: 54.5 s
Wall time: 56.7 s


In [61]:
varzos_df['SERIOUS'] = varzos_df['SERIOUS'] + varzos_df['HAS_MEDDRA_MATCH']
varzos_df['SERIOUS'] = varzos_df['SERIOUS'].apply(lambda x: 0 if x == 0 else 1)
varzos_df = varzos_df.drop(columns=['HAS_MEDDRA_MATCH'])

In [62]:
varzos_df.head(2)

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,L_THREAT,HOSPITAL,HOSPDAYS,DISABLE,...,VAX_SITE,VAX_NAME,ORDER_y,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,ORDER,SERIOUS
39,571772,MA,75.0,0,REDNESS AND ITCHING,0,0,0,0.0,0,...,LA,ZOSTER LIVE (ZOSTAVAX),1,INJECTION SITE ERYTHEMA,INJECTION SITE SWELLING,RHINORRHOEA,THROAT IRRITATION,,1,0
40,571772,MA,75.0,0,REDNESS AND ITCHING,0,0,0,0.0,0,...,LA,ZOSTER LIVE (ZOSTAVAX),1,ERYTHEMA,PRURITUS,,,,2,0


In [63]:
varzos_df.to_csv(r'C:\Users\spoor\Downloads\varzos.csv', index=False)

# FLU

In [68]:
flu_df = df[df['VAX_TYPE'].str.contains('FLU')]
flu_df = flu_df.drop('VAX_TYPE', axis=1)

In [70]:
# Apply the function to create the 'serious' column
flu_df['SERIOUS'] = flu_df.apply(serious_criteria, axis=1)
flu_df['SERIOUS'].value_counts()

SERIOUS
0    49562
Name: count, dtype: int64

In [72]:
%%time
# Create a function that returns 1 if any symptom matches with MedDRA terms, 0 otherwise
def has_meddra_match(row):
    for col in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']:
        if pd.notna(row[col]) and row[col].lower() in list_of_names:
            return 1
    return 0

# Create new column 'HAS_MEDDRA_MATCH'
flu_df['HAS_MEDDRA_MATCH'] = flu_df.apply(has_meddra_match, axis=1)

# You can verify the results
print(f"Number of records with MedDRA matches: {flu_df['HAS_MEDDRA_MATCH'].sum()}")
print(f"Number of records without MedDRA matches: {(flu_df['HAS_MEDDRA_MATCH'] == 0).sum()}")

Number of records with MedDRA matches: 5558
Number of records without MedDRA matches: 44004
CPU times: total: 48.7 s
Wall time: 50.9 s


In [73]:
flu_df['SERIOUS'] = flu_df['SERIOUS'] + flu_df['HAS_MEDDRA_MATCH']
flu_df['SERIOUS'] = flu_df['SERIOUS'].apply(lambda x: 0 if x == 0 else 1)
flu_df = flu_df.drop(columns=['HAS_MEDDRA_MATCH'])

In [74]:
flu_df.head(2)

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,L_THREAT,HOSPITAL,HOSPDAYS,DISABLE,...,VAX_SITE,VAX_NAME,ORDER_y,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,ORDER,SERIOUS
133,622060,WA,39.0,0,AFTER THE VACCINE MY SHOULDER HURT BUT I THOUG...,0,0,0,0.0,0,...,AR,INFLUENZA (SEASONAL) (FLUVIRIN),1,VACCINATION SITE PAIN,,,,,1,0
134,622060,WA,39.0,0,AFTER THE VACCINE MY SHOULDER HURT BUT I THOUG...,0,0,0,0.0,0,...,AR,INFLUENZA (SEASONAL) (FLUVIRIN),1,INJECTION SITE PAIN,,,,,2,0


In [75]:
flu_df.to_csv(r'C:\Users\spoor\Downloads\flu.csv', index=False)

# COVID

In [80]:
covid_df = df[df['VAX_TYPE'].str.contains('COVID')]
covid_df = covid_df.drop('VAX_TYPE', axis=1)

In [82]:
# Apply the function to create the 'serious' column
covid_df['SERIOUS'] = covid_df.apply(serious_criteria, axis=1)
covid_df['SERIOUS'].value_counts()

SERIOUS
0    118274
Name: count, dtype: int64

In [83]:
%%time 
# Create a function that returns 1 if any symptom matches with MedDRA terms, 0 otherwise
def has_meddra_match(row):
    for col in ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']:
        if pd.notna(row[col]) and row[col].lower() in list_of_names:
            return 1
    return 0

# Create new column 'HAS_MEDDRA_MATCH'
covid_df['HAS_MEDDRA_MATCH'] = covid_df.apply(has_meddra_match, axis=1)

# You can verify the results
print(f"Number of records with MedDRA matches: {covid_df['HAS_MEDDRA_MATCH'].sum()}")
print(f"Number of records without MedDRA matches: {(covid_df['HAS_MEDDRA_MATCH'] == 0).sum()}")

Number of records with MedDRA matches: 9987
Number of records without MedDRA matches: 108287
CPU times: total: 2min 2s
Wall time: 2min 6s


In [85]:
covid_df['SERIOUS'] = covid_df['SERIOUS'] + covid_df['HAS_MEDDRA_MATCH']
covid_df['SERIOUS'] = covid_df['SERIOUS'].apply(lambda x: 0 if x == 0 else 1)
covid_df = covid_df.drop(columns=['HAS_MEDDRA_MATCH'])

In [86]:
covid_df.head(2)

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,L_THREAT,HOSPITAL,HOSPDAYS,DISABLE,...,VAX_SITE,VAX_NAME,ORDER_y,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,ORDER,SERIOUS
203470,902418,NJ,56.0,0,PATIENT EXPERIENCED MILD NUMBNESS TRAVELING FR...,0,0,0,0.0,0,...,LA,COVID19 (COVID19 (PFIZER-BIONTECH)),1,HYPOAESTHESIA,INJECTION SITE HYPOAESTHESIA,,,,1,0
203476,902440,AZ,35.0,0,C O HEADACHE,0,0,0,0.0,0,...,LA,COVID19 (COVID19 (PFIZER-BIONTECH)),1,HEADACHE,,,,,1,0


In [89]:
covid_df.to_csv(r'C:\Users\spoor\Downloads\covid.csv', index=False)