In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
# apply the FiveThirtyEight style sheet to every figure drawn below
plt.style.use('fivethirtyeight')
# render figures inline in the notebook output
%matplotlib inline
from IPython.display import set_matplotlib_formats
# request high-resolution ("retina") inline figures
# NOTE(review): set_matplotlib_formats is reported as deprecated in newer IPython releases — confirm the installed version still provides it
set_matplotlib_formats('retina')
import warnings
# NOTE: this silences every warning for the rest of the session, deprecation notices included
warnings.filterwarnings("ignore")
import seaborn as sns
#sns.set_style('darkgrid')


<div style="width:100%;text-align: center;"> <img align=middle src="https://vaers.hhs.gov/images/vaers-logo.png" alt="Vaers" style="height:100px;margin-top:3rem;"> </div>

<div class="alert alert-block alert-info"> <h2>Introduction</h2></div>
VAERS is a passive reporting system, meaning it relies on individuals to send in reports of their experiences to CDC and FDA. VAERS is not designed to determine if a vaccine caused a health problem, but is especially useful for detecting unusual or unexpected patterns of adverse event reporting that might indicate a possible safety problem with a vaccine. This way, VAERS can provide CDC and FDA with valuable information that additional work and evaluation is necessary to further assess a possible safety concern.

In this notebook I show several visualizations of this dataset.


<div class="alert alert-block alert-info"> <h3>Summary:</h3></div>

<div class="alert alert-block alert-info"> <h4>Load and Filter data</h4></div>

In [None]:
# Load the VAERS extract, then keep only the reports that
#   * name a known vaccine manufacturer,
#   * were filed on form version 2, and
#   * concern dose 1 or dose 2 (VAX_DOSE_SERIES holds the dose as a string).
data = pd.read_csv('/kaggle/input/covid19-vaccine-adverse-reactions-vaers-dataset/Covid1-19_VAERS_Jan_June_2021.csv')
data = data[
    (data['VAX_MANU'] != 'UNKNOWN MANUFACTURER')
    & (data['FORM_VERS'] == 2)
    & data['VAX_DOSE_SERIES'].isin(['1', '2'])
]

In [None]:
# columns in data
data.columns

In [None]:
# sample data
data.head()

In [None]:
# describe pandas for numerical values
data.describe()

<div class="alert alert-block alert-info"> <h4>Feature Descriptions</h4></div>
<b>VAERS_ID:</b> VAERS Identification Number<br>
<b>SYMPTOM1:</b> Adverse Event MedDRA Term 1<br>
<b>SYMPTOMVERSION1:</b> MedDRA dictionary version number 1<br>
<b>SYMPTOM2:</b> Adverse Event MedDRA Term 2<br>
<b>SYMPTOMVERSION2:</b> MedDRA dictionary version number 2<br>
<b>SYMPTOM3:</b> Adverse Event MedDRA Term 3<br>
<b>SYMPTOMVERSION3:</b> MedDRA dictionary version number 3<br>
<b>SYMPTOM4:</b> Adverse Event MedDRA Term 4<br>
<b>SYMPTOMVERSION4:</b> MedDRA dictionary version number 4<br>
<b>SYMPTOM5:</b> Adverse Event MedDRA Term 5<br>
<b>SYMPTOMVERSION5:</b> MedDRA dictionary version number 5<br>
<b>VAX_TYPE:</b> Administered Vaccine Type<br>
<b>VAX_MANU:</b> Vaccine Manufacturer<br>
<b>VAX_LOT:</b> Manufacturer's Vaccine Lot<br>
<b>VAX_DOSE_SERIES:</b> Number of doses administered<br>
<b>VAX_ROUTE:</b> Vaccination Route<br>
<b>VAX_SITE:</b> Vaccination Site<br>
<b>VAX_NAME:</b> Vaccination Name<br>
<b>RECVDATE:</b> Date report was received<br>
<b>STATE:</b> State<br>
<b>AGE_YRS:</b> Age in Years<br>
<b>CAGE_YR:</b> Calculated age of patient in years<br>
<b>CAGE_MO:</b> Calculated age of patient in months<br>
<b>SEX:</b> Sex<br>
<b>RPT_DATE:</b> Date Form Completed<br>
<b>SYMPTOM_TEXT:</b> Reported symptom text<br>
<b>DIED:</b> Died<br>
<b>DATEDIED:</b> Date of Death<br>
<b>L_THREAT:</b> Life-Threatening Illness<br>
<b>ER_VISIT:</b> Emergency Room or Doctor Visit<br>
<b>HOSPITAL:</b> Hospitalized<br>
<b>HOSPDAYS:</b> Number of days Hospitalized<br>
<b>X_STAY:</b> Prolongation of Existing Hospitalization<br>
<b>DISABLE:</b> Disability<br>
<b>RECOVD:</b> Recovered<br>
<b>VAX_DATE:</b> Vaccination Date<br>
<b>ONSET_DATE:</b> Adverse Event Onset Date<br>
<b>NUMDAYS:</b> Number of days (Onset date - Vax. Date)<br>
<b>LAB_DATA:</b> Diagnostic laboratory data<br>
<b>V_ADMINBY:</b> Type of facility where vaccine was administered<br>
<b>V_FUNDBY:</b> Type of funds used to purchase vaccines<br>
<b>OTHER_MEDS:</b> Other Medications<br>
<b>CUR_ILL:</b> Illnesses at time of vaccination<br>
<b>HISTORY:</b> Chronic or long-standing health conditions<br>
<b>PRIOR_VAX:</b> Prior Vaccination Event information<br>
<b>SPLTTYPE:</b> Manufacturer/Immunization Project Report Number<br>
<b>FORM_VERS:</b> VAERS form version 1 or 2<br>
<b>TODAYS_DATE:</b> Form Completed<br>
<b>BIRTH_DEFECT:</b> Congenital anomaly or birth defect<br>
<b>OFC_VISIT:</b> Doctor or other healthcare provider office/clinic visit<br>
<b>ER_ED_VISIT:</b> Emergency room/department or urgent care<br>
<b>ALLERGIES:</b> Allergies to medications,food, or other products<br>

In [None]:
data.info()

In [None]:
# Column names grouped by storage dtype (float64 / int64 / object).
float_features, int_features, object_features = (
    data.select_dtypes(include=dtype_name).columns
    for dtype_name in ('float64', 'int64', 'object')
)
#numerical_features=data.select_dtypes(exclude='object').columns

- <b>float64(10)</b>: SYMPTOMVERSION1, SYMPTOMVERSION2, SYMPTOMVERSION3, SYMPTOMVERSION4, SYMPTOMVERSION5, AGE_YRS, CAGE_YR, CAGE_MO, HOSPDAYS, NUMDAYS
- <b>int64(2)</b>:  VAERS_ID, FORM_VERS
- <b>object(40)</b>:  SYMPTOM1, SYMPTOM2, SYMPTOM3, SYMPTOM4, SYMPTOM5, VAX_TYPE, VAX_MANU, VAX_LOT, VAX_DOSE_SERIES, VAX_ROUTE, VAX_SITE, VAX_NAME, RECVDATE, STATE, SEX, RPT_DATE, SYMPTOM_TEXT, DIED, DATEDIED, L_THREAT, ER_VISIT, HOSPITAL, X_STAY,
 DISABLE, RECOVD, VAX_DATE, ONSET_DATE, LAB_DATA, V_ADMINBY,
 V_FUNDBY, OTHER_MEDS, CUR_ILL, HISTORY, PRIOR_VAX, SPLTTYPE,
 TODAYS_DATE, BIRTH_DEFECT, OFC_VISIT, ER_ED_VISIT, ALLERGIES 

<div class="alert alert-block alert-info"> <h4>Univariate</h4></div>

- <b>Numerical Features:</b> eg.  HOSPDAYS, NUMDAYS, AGE_YRS, etc... <br>
- <b>Categorical Features:</b> eg . SYMPTOM1,DIED, HOSPITAL, VAX_SITE, SEX, etc...

<div class="alert alert-block alert-info"> <h4>Numerical Features</h4></div>

In [None]:
from scipy.stats import probplot,skew

def plot_basic_stat(data, feats):
    """Show a histogram, a box plot and a probability plot for each feature.

    One 1x3 figure is drawn and shown per entry of ``feats``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the columns to plot.
    feats : iterable of str
        Names of numeric columns in ``data``.

    Notes
    -----
    Missing values are dropped before plotting. ``Series.skew`` already
    skips them, so the reported skew is unchanged, but ``probplot`` would
    otherwise carry the NaNs into its fitted reference line.
    """
    for feat in feats:
        # work on the observed values only (see Notes)
        values = data[feat].dropna()
        skew_val = round(values.skew(), 1)

        fig, axes = plt.subplots(1, 3, figsize=(20, 2))
        # histplot is the maintained replacement for the deprecated distplot
        sns.histplot(values, kde=False, ax=axes[0])
        # pass the series by keyword so the box is drawn along the x axis
        # regardless of how the seaborn version treats positional arguments
        sns.boxplot(x=values, ax=axes[1])
        probplot(values, plot=axes[2])

        # the box plot has a single box, so its y axis carries no information
        axes[1].set_yticklabels([])
        axes[1].set_yticks([])

        axes[0].set_title(feat + " | Distplot")
        axes[1].set_title(feat + " | Boxplot")
        axes[2].set_title(feat + " | Probability Plot - Skew: " + str(skew_val))
        plt.show()

In [None]:
plot_basic_stat(data,['AGE_YRS','CAGE_YR'])

<div class="alert alert-block alert-info"> <h4>Categorical Features</h4></div>

In [None]:
# number of reports per vaccine manufacturer
sns.countplot(data=data, x='VAX_MANU')
# show the figure explicitly so the cell does not echo the Axes repr
plt.show()

In [None]:
sns.countplot(data=data, x='VAX_ROUTE')
plt.show()

<div class="alert alert-block alert-info"> <h3>Basic Analysis</h3></div>

In [None]:
# SEX and VAX_MANU selection
sns.countplot(x="SEX", hue="VAX_MANU", data=data)
plt.show()

In [None]:
# Average age by Gender in vaccine reaction response
data[["SEX","AGE_YRS"]].groupby(["SEX"], as_index = False).mean().sort_values(by = "AGE_YRS",ascending = False)

<div class="alert alert-block alert-info"> <h3>Outlier Analysis</h3></div>

In data analysis, anomaly detection (also called outlier detection) is the identification of rare items, events or observations which raise suspicions by differing significantly from the majority of the data. Reference: https://en.wikipedia.org/wiki/Anomaly_detection

The interquartile range is often used to find outliers in data. Outliers here are defined as observations that fall below Q1 − 1.5 IQR or above Q3 + 1.5 IQR. In a boxplot, the highest and lowest occurring value within this limit are indicated by whiskers of the box (frequently with an additional bar at the end of the whisker) and any outliers as individual points.

Q1 -> First quartile
Q3 -> Third quartile

IQR -> Q3 - Q1 (interquartile range)

lower bound -> Q1 – 1.5(IQR)
upper bound -> Q3 + 1.5(IQR)

outlier -> Values outside the lower and upper bounds


In [None]:
from collections import Counter

def iqr_outliers(df, feats, threshold=2, whisker=1.5):
    """Return index labels of rows that are IQR outliers in several features.

    For every column in ``feats`` a value is an outlier when it lies below
    ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR``. A row is reported
    when it is an outlier in MORE than ``threshold`` of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; its index labels are what gets returned.
    feats : iterable of str
        Names of the numeric columns to test.
    threshold : int, default 2
        A row must be flagged in strictly more than this many columns.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier fences.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
        Missing values never count as outliers because comparisons with
        NaN evaluate to False.
    """
    hits = Counter()

    for feat in feats:
        column = df[feat]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        margin = whisker * (q3 - q1)
        is_outlier = (column < (q1 - margin)) | (column > (q3 + margin))
        # tally one hit per flagged row label for this feature
        hits.update(column.index[is_outlier.to_numpy()])

    return [label for label, count in hits.items() if count > threshold]

In [None]:
# find the rows that are outliers in several numeric columns at once
out_cols = ['AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'HOSPDAYS', 'NUMDAYS']
# compute the labels once; the original cell ran the scan twice and discarded the first result
outlier_rows = iqr_outliers(data, out_cols)

data.loc[outlier_rows]

In [None]:
# remove outlier rows
# We extract what we find as outlier from our data
data = data.drop(iqr_outliers(data,out_cols),axis = 0).reset_index(drop=True)

<div class="alert alert-block alert-info"> <h3>Missing Data</h3></div>

In [None]:
# columns with missing data in our dataset
data.columns[data.isnull().any()]

In [None]:
# missings with seaborn heatmap
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(data.isna().transpose(),
            cmap="YlGnBu",
            cbar_kws={'label': 'Missing Data'})
plt.show()

<div class="alert alert-block alert-info"> <h3>Fill Missing Value</h3></div>

Example AGE_YRS

In [None]:
# number of missings
data['AGE_YRS'].isnull().sum()

In [None]:
# median (not mean) age per sex, sorted from oldest to youngest group;
# used in the next cell to fill the missing AGE_YRS values
gender_age=data[["SEX","AGE_YRS"]].groupby(["SEX"], as_index = False).median().sort_values(by = "AGE_YRS",ascending = False)
gender_age

In [None]:
age_dict=gender_age.set_index('SEX')['AGE_YRS'].to_dict()
data['AGE_YRS']=data.AGE_YRS.fillna(data.SEX.map(age_dict))
data['AGE_YRS'].isnull().sum()

In [None]:
# DIED is reported as missing for almost every row, yet it is an important feature;
# count the missing entries before filling them
data.DIED.isnull().sum()

In [None]:
data['DIED']=data.DIED.fillna('N')
data.DIED.value_counts()

In [None]:
data['HOSPITAL']=data.HOSPITAL.fillna('N')

<div class="alert alert-block alert-info"> <h2>Various Plots with Seaborn</h2></div>


<div class="alert alert-block alert-info"> <h3>Bar Plot</h3></div>
A bar plot represents an estimate of central tendency for a numeric variable with the height of each rectangle and provides some indication of the uncertainty around that estimate using error bars. Bar plots include 0 in the quantitative axis range, and they are a good choice when 0 is a meaningful value for the quantitative variable, and you want to make comparisons against it.

For datasets where 0 is not a meaningful value, a point plot will allow you to focus on differences between levels of one or more categorical variables.

In [None]:
plt.title("VAX MANU Bar Plot",color = 'blue',fontsize=15)
sns.barplot(x="VAX_MANU", y="HOSPDAYS", data=data)
plt.show()

<div class="alert alert-block alert-info"> <h3>Count Plot</h3></div>

A count plot can be thought of as a histogram across a categorical, instead of quantitative, variable. The basic API and options are identical to those for barplot(), so you can compare counts across nested variables.

In [None]:
plt.title("VAX Route Count Plot",color = 'blue',fontsize=15)
sns.countplot(x="VAX_ROUTE", data=data)
plt.show()

In [None]:
plt.title("VAX SITE Count Plot",color = 'blue',fontsize=15)
sns.countplot(x="VAX_SITE", data=data)
plt.show()

<div class="alert alert-block alert-info"> <h3>Pair Plot</h3></div>

Plot pairwise relationships in a dataset.

By default, this function will create a grid of Axes such that each numeric variable in data will be shared across the y-axes across a single row and the x-axes across a single column. The diagonal plots are treated differently: a univariate distribution plot is drawn to show the marginal distribution of the data in each column.

In [None]:
# pairplot takes a long time to calculate on the full frame, so only a small
# random subsample is plotted here for demonstration
sample_cols = ['AGE_YRS', 'SEX','CAGE_YR','DIED']
# sample WITHOUT replacement so no report appears twice in the plot;
# cap n at the frame length so the call cannot fail on a small frame
df_sample=data.sample(n=min(1000, len(data)), random_state=1)
df_sample=df_sample[sample_cols]
sns.pairplot(df_sample, hue="DIED")
plt.show()

<div class="alert alert-block alert-info"> <h3>Box Plot</h3></div>

A box plot (or box-and-whisker plot) shows the distribution of quantitative data in a way that facilitates comparisons between variables or across levels of a categorical variable. The box shows the quartiles of the dataset while the whiskers extend to show the rest of the distribution, except for points that are determined to be “outliers” using a method that is a function of the inter-quartile range.<br>
Reference: https://www.kdnuggets.com/2019/11/understanding-boxplots.html
<div style="width:100%;text-align: center;"> <img align=left src="https://miro.medium.com/max/9000/1*2c21SkzJMf3frPXPAR_gZA.png" alt="Anatomy of a box plot" style="height:180px;margin-top:3rem;"> </div>



In [None]:
plt.title("Box Plot of AGE",color = 'blue',fontsize=15)
# pass the series by keyword (as the violin plot cell does) so the box is drawn
# along the x axis regardless of how seaborn treats the first positional argument
sns.boxplot(x=data.AGE_YRS)
plt.show()

In [None]:
plt.figure(figsize = (9,9))
sns.boxplot(x="VAX_MANU", y="AGE_YRS", hue="SEX", linewidth=2.2, fliersize= 3.2 ,data=data, saturation=1, palette="YlOrRd")
plt.show()

<div class="alert alert-block alert-info"> <h3>Heatmap</h3></div>

A heat map is a data visualization technique that shows magnitude of a phenomenon as color in two dimensions. The variation in color may be by hue or intensity, giving obvious visual cues to the reader about how the phenomenon is clustered or varies over space. <br>
Reference: https://en.wikipedia.org/wiki/Heat_map

In [None]:
plt.figure(figsize = (9,9))
plt.title("Correlation Heatmap",color = 'blue',fontsize=15)
# correlate the numeric columns only: the frame also holds object columns, and
# DataFrame.corr() no longer drops those silently in recent pandas releases
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)
plt.show()

<div class="alert alert-block alert-info"> <h3>Violin Plot</h3></div>
Draw a combination of boxplot and kernel density estimate.

A violin plot plays a similar role as a box and whisker plot. It shows the distribution of quantitative data across several levels of one (or more) categorical variables such that those distributions can be compared. Unlike a box plot, in which all of the plot components correspond to actual datapoints, the violin plot features a kernel density estimation of the underlying distribution.

This can be an effective and attractive way to show multiple distributions of data at once, but keep in mind that the estimation procedure is influenced by the sample size, and violins for relatively small samples might look misleadingly smooth.

In [None]:
plt.title("Violin Plot of Age",color = 'blue',fontsize=15)
ax = sns.violinplot(x=data["AGE_YRS"])

In [None]:
plt.figure(figsize = (9,9))
sns.violinplot(x="SEX", y="AGE_YRS", data=data, hue="VAX_MANU",orient='v')

<div class="alert alert-block alert-info"> <h3>Point Plot</h3></div>
Show point estimates and confidence intervals using scatter plot glyphs.

A point plot represents an estimate of central tendency for a numeric variable by the position of scatter plot points and provides some indication of the uncertainty around that estimate using error bars.

Point plots can be more useful than bar plots for focusing comparisons between different levels of one or more categorical variables. They are particularly adept at showing interactions: how the relationship between levels of one categorical variable changes across levels of a second categorical variable. The lines that join each point from the same hue level allow interactions to be judged by differences in slope, which is easier for the eyes than comparing the heights of several groups of points or bars.

In [None]:
plt.figure(figsize = (9,5))
plt.title("Point Plot Example",color = 'blue',fontsize=15)
sns.pointplot(x="SEX", y="AGE_YRS", hue="VAX_MANU", data=data, markers=["o","x","s"])

<div class="alert alert-block alert-info"> <h3>Joint Plot</h3></div>
Draw a plot of two variables with bivariate and univariate graphs.

This function provides a convenient interface to the JointGrid class, with several canned plot kinds. This is intended to be a fairly lightweight wrapper; if you need more flexibility, you should use JointGrid directly.

In [None]:
#CAGE_YR, CAGE_MO
sns.jointplot(data=data, x="CAGE_YR", y="AGE_YRS", hue="DIED")

<div class="alert alert-block alert-info"> <h3>Word Cloud</h3></div>

In [None]:
from wordcloud import WordCloud, STOPWORDS

def show_cloud(data,title=None):
    """Render a word cloud built from the values in ``data``.

    Parameters
    ----------
    data : iterable
        Scalar values (typically a column of strings). Each one is
        converted with ``str`` and the results are joined with spaces.
        Missing values (None / NaN) are skipped.
    title : str, optional
        Figure title; nothing is drawn when it is falsy.
    """
    # skip missing entries so the literal token "nan" never becomes a word
    words_str = ' '.join(str(elem) for elem in data if not pd.isna(elem))
    wordcloud = WordCloud(
        width = 3000,
        height = 2000,
        background_color = 'black',
        stopwords = STOPWORDS).generate(words_str)
    fig = plt.figure(figsize = (40, 30), facecolor = 'k', edgecolor = 'k')
    plt.imshow(wordcloud)
    if title:
        # the figure background is black, so the title needs a light colour to be readable
        fig.suptitle(title, fontsize=20, color='white')
        fig.subplots_adjust(top=2.3)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
show_cloud(data.SYMPTOM1)

## **If you like this notebook, please give an Upvote!**