In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from scipy.interpolate import griddata

# Preprocessing Data (Filter out unwanted columns and combining symptons into a list in merged_symptons)
CSV_Data = '../data/2021VAERSDATA.csv'
CSV_Symptoms = '../data/2021VAERSSYMPTOMS.csv'
CSV_Vax = "../data/2021VAERSVAX.csv" 

df_d = pd.read_csv(CSV_Data, names = ['VAERS_ID','RECVDATE','STATE','AGE_YRS','CAGE_YR','CAGE_MO','SEX','RPT_DATE','SYMPTOM_TEXT','DIED','DATEDIED','L_THREAT','ER_VISIT','HOSPITAL','HOSPDAYS','X_STAY','DISABLE','RECOVD','VAX_DATE','ONSET_DATE','NUMDAYS','LAB_DATA','V_ADMINBY','V_FUNDBY','OTHER_MEDS','CUR_ILL','HISTORY','PRIOR_VAX','SPLTTYPE','FORM_VERS','TODAYS_DATE','BIRTH_DEFECT','OFC_VISIT','ER_ED_VISIT','ALLERGIES'],error_bad_lines=False, index_col=False, dtype='unicode')
df = df_d.drop(['CAGE_YR','STATE', 'CAGE_MO', 'DIED','RPT_DATE','SYMPTOM_TEXT','DATEDIED','L_THREAT','ER_VISIT','HOSPITAL','HOSPDAYS','X_STAY','DISABLE','RECOVD','VAX_DATE','ONSET_DATE','NUMDAYS','LAB_DATA','V_ADMINBY','V_FUNDBY','OTHER_MEDS','SPLTTYPE','FORM_VERS','TODAYS_DATE','BIRTH_DEFECT','OFC_VISIT','ER_ED_VISIT'], axis=1)
df_s = pd.read_csv(CSV_Symptoms, names = ['VAERS_ID','SYMPTOM1','SYMPTOMVERSION1','SYMPTOM2','SYMPTOMVERSION2','SYMPTOM3','SYMPTOMVERSION3','SYMPTOM4','SYMPTOMVERSION4','SYMPTOM5','SYMPTOMVERSION5'],error_bad_lines=False, index_col=False, dtype='unicode')
df_s = df_s.drop(['SYMPTOMVERSION1','SYMPTOMVERSION2','SYMPTOMVERSION3','SYMPTOMVERSION4','SYMPTOMVERSION5'], axis = 1)
df_s['MERGED_SYMPTONS'] = df_s[df_s.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df_s = df_s.drop(['VAERS_ID','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5'], axis = 1)
df = pd.concat([df,df_s], axis=1, join="inner")
df_v = pd.read_csv (CSV_Vax, names = ['VAERS_ID','VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_ROUTE','VAX_SITE','VAX_NAME'],error_bad_lines=False, index_col=False, dtype='unicode')
df_v = df_v.drop(['VAERS_ID','VAX_TYPE','VAX_LOT','VAX_ROUTE','VAX_SITE','VAX_NAME'], axis = 1)
df = pd.concat([df,df_v], axis=1, join="inner")

df = df.drop (df.index[0])
# Get symptons from age, sex, vac manu, vax dose series
df_as = df.drop(['RECVDATE','CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES'], axis = 1)
df_as

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,MERGED_SYMPTONS,VAX_MANU,VAX_DOSE_SERIES
1,0916600,33.0,F,"Dysphagia,Epiglottitis",MODERNA,1
2,0916601,73.0,F,"Anxiety,Dyspnoea",MODERNA,1
3,0916602,23.0,F,"Chest discomfort,Dysphagia,Pain in extremity,V...",PFIZER\BIONTECH,1
4,0916603,58.0,F,"Dizziness,Fatigue,Mobility decreased",MODERNA,UNK
5,0916604,47.0,F,"Injection site erythema,Injection site pruritu...",MODERNA,1
...,...,...,...,...,...,...
217712,1316365,27.0,F,"Chills,Dizziness,Fatigue,Hyperhidrosis,Nausea",JANSSEN,UNK
217713,1316366,24.0,M,"Pain,Pyrexia",JANSSEN,UNK
217714,1316367,,U,"Injection site rash,Rash,Rash papular,Skin warm",JANSSEN,UNK
217715,1316368,55.0,M,"Dyspnoea,Pharyngeal swelling",JANSSEN,UNK


In [3]:
df_as.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217716 entries, 1 to 217716
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         217716 non-null  object
 1   AGE_YRS          200353 non-null  object
 2   SEX              217716 non-null  object
 3   MERGED_SYMPTONS  217716 non-null  object
 4   VAX_MANU         217716 non-null  object
 5   VAX_DOSE_SERIES  216525 non-null  object
dtypes: object(6)
memory usage: 11.6+ MB


In [26]:
# Get symptoms occurrences 

SYMPTOM_Dict ={}
num = 0
for x in df_as['MERGED_SYMPTONS']:
    string = ""
    for char in x:
        if(char == ","):
            if string in SYMPTOM_Dict :
                SYMPTOM_Dict[string] += 1
            else :
                SYMPTOM_Dict[string] = 1
            
            string = ""
        else:
            string += char
    if string in SYMPTOM_Dict :
        SYMPTOM_Dict[string] += 1
    else :
        SYMPTOM_Dict[string] = 1
    
sorted_dict = {}
sorted_keys = sorted(SYMPTOM_Dict, key=SYMPTOM_Dict.get, reverse = True)

# Get symptoms with occurences over 1000
for x in sorted_keys:
    sorted_dict[x] = SYMPTOM_Dict[x]

stop_index = 0
for i, x in enumerate(sorted_dict):
    if sorted_dict[x] > 1000:
        pass
    else:
        stop_index = i
        break

symptoms_to_use = list(sorted_dict.keys())[:stop_index]


symptoms_to_use.remove("SARS-CoV-2 test negative")
symptoms_to_use.remove("SARS-CoV-2 test positive")
symptoms_to_use.remove("SARS-CoV-2 test")
symptoms_to_use.remove("Poor quality product administered")
symptoms_to_use.remove("Death")
symptoms_to_use.remove("COVID-19")
symptoms_to_use.remove("Body temperature")
symptoms_to_use.remove("Product storage error")
symptoms_to_use.remove("Blood test")
symptoms_to_use.remove("Chest X-ray")



    
    
print("Symptoms with occurences over 1000: ",symptoms_to_use)

Symptoms with occurences over 1000:  ['Headache', 'Chills', 'Pyrexia', 'Fatigue', 'Pain', 'Nausea', 'Dizziness', 'Pain in extremity', 'Injection site pain', 'Injection site erythema', 'Myalgia', 'Arthralgia', 'Pruritus', 'Injection site swelling', 'Injection site pruritus', 'Rash', 'Erythema', 'Dyspnoea', 'Injection site warmth', 'Vomiting', 'Asthenia', 'Urticaria', 'Diarrhoea', 'Paraesthesia', 'Lymphadenopathy', 'Injection site rash', 'Hyperhidrosis', 'Malaise', 'Hypoaesthesia', 'Feeling abnormal', 'Peripheral swelling', 'Skin warm', 'Cough', 'Rash erythematous', 'Chest discomfort', 'Tremor', 'Heart rate increased', 'Swelling', 'Palpitations', 'Body temperature increased', 'Feeling hot', 'Flushing', 'Back pain', 'Chest pain', 'Decreased appetite', 'Injection site induration', 'Neck pain', 'Paraesthesia oral', 'Rash pruritic', 'Oropharyngeal pain', 'Mobility decreased', 'Syncope', 'Influenza like illness', 'Throat tightness', 'Sleep disorder', 'Electrocardiogram', 'Feeling cold', 'Swel

In [8]:
# add symptoms column to dataframe
symtoms = symptoms_to_use

for symtom in symtoms:
    l = []
    num = 0
    for x in df_as['MERGED_SYMPTONS']:
        string = ""
        check_contain = False
        for char in x:
            if(char == ","):
                if string == symtom:
                    check_contain = True
                string = ""
            else:
                string += char
        if string == symtom:
            check_contain = True
        num += 1
        if (check_contain):
            l.append(1)
        else:
            l.append(0)
    df_as.insert(len(df_as.columns),symtom,l,True)

df_asn = df_as.drop(["MERGED_SYMPTONS"],axis = 1)

In [9]:
from sklearn.preprocessing import LabelEncoder
encoder_SEX = LabelEncoder()
encoder_VAX_MANU = LabelEncoder()


# Remove rows where VAX_DOSE_SERIES is not 1 or 2
df_asn.drop(df_asn[(df_asn['VAX_DOSE_SERIES']!="1") & (df_asn['VAX_DOSE_SERIES']!="2")].index, inplace = True)
# Remove rows where VZX_MANU is not Moderna, Pfizer\Biotech, or Janssen
df_asn.drop(df_asn[(df_asn['VAX_MANU']!="MODERNA") & (df_asn['VAX_MANU']!="PFIZER\BIONTECH") &(df_asn['VAX_MANU']!="JANSSEN")].index, inplace = True)
# ID_encoded = le.fit_transform(np.ravel(df_as["VAERS_ID"]))
df_asn["VAERS_ID"] = df_asn["VAERS_ID"].astype(str).astype(int)
# Remove rows where Age is missing
df_asn.drop(df_asn[df_asn['AGE_YRS'].isna()].index, inplace = True)
df_asn["AGE_YRS"] = df_asn["AGE_YRS"].astype(str).astype(float)
Sex_encoded = encoder_SEX.fit_transform(np.ravel(df_asn["SEX"]))
# 0 = Female, 1 = Male
df_asn["SEX"] = Sex_encoded
# 0 = JANSSEN, 1 = MODERNA, 2 = PFIZER\BIONTECH
VAX_MANU_encoded = encoder_VAX_MANU.fit_transform(np.ravel(df_asn["VAX_MANU"]))
df_asn["VAX_MANU"] = VAX_MANU_encoded
df_asn["VAX_DOSE_SERIES"] = df_asn["VAX_DOSE_SERIES"].astype(str).astype(int)

df_asn

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_MANU,VAX_DOSE_SERIES,Headache,Chills,Pyrexia,Fatigue,Pain,...,Nasal congestion,Pallor,Computerised tomogram,Confusional state,Limb discomfort,Inappropriate schedule of product administration,Chest X-ray,Electrocardiogram normal,Hot flush,Cold sweat
1,916600,33.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,916601,73.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,916602,23.0,0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,916604,47.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,916606,44.0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217656,1316162,54.0,0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
217657,1316163,17.0,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
217659,1316165,29.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
217661,1316167,22.0,0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df_asn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162256 entries, 1 to 217662
Columns: 122 entries, VAERS_ID to Cold sweat
dtypes: float64(1), int64(121)
memory usage: 152.3 MB


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import math 

df = shuffle(df_asn)

# print("70% of data set used:", math.floor(df.shape[0]*0.7))
log = LogisticRegression()
X = df[['AGE_YRS','SEX','VAX_MANU','VAX_DOSE_SERIES']]


Y_list = symptoms_to_use

# initialize list of models
model_list = list(map(lambda _ : LogisticRegression(), Y_list))

symptoms_dict ={}
for i, y in enumerate(Y_list):   
    Y= df[y]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size =0.3)
    model_list[i].fit(x_train, y_train)
    symptoms_dict[y] = model_list[i].predict_proba(np.array([50,0,1, 1]).reshape(1, -1))[0][1]
    print("\nProbability of", y, "happening: ",model_list[i].predict_proba(np.array([50,0,1, 1]).reshape(1, -1))[0][1])
    print("Accuracy:", model_list[i].score(x_test, y_test))

# correct predictions / total number of data points
# print("Accuracy of this prediction: ",score/num)


Probability of Headache happening:  0.16819073483129393
Accuracy: 0.832857406988927

Probability of Chills happening:  0.13600177290668575
Accuracy: 0.8627688641452842

Probability of Pyrexia happening:  0.13435429467256885
Accuracy: 0.8619060336503893

Probability of Fatigue happening:  0.13008476368127284
Accuracy: 0.8679458471146537

Probability of Pain happening:  0.12572017835503577
Accuracy: 0.8741294656614007

Probability of Nausea happening:  0.09547399036634686
Accuracy: 0.9076976806294554

Probability of Dizziness happening:  0.08276117518559149
Accuracy: 0.9186679540645479

Probability of Pain in extremity happening:  0.07523740452338114
Accuracy: 0.9246461367791771

Probability of Injection site pain happening:  0.07213318195731219
Accuracy: 0.9303367093288412

Probability of Injection site erythema happening:  0.0666346625619717
Accuracy: 0.933192267395279

Probability of Myalgia happening:  0.05310865344079014
Accuracy: 0.9434024282515356

Probability of Arthralgia happe


Probability of Tinnitus happening:  0.005643948283759012
Accuracy: 0.9941861659510652

Probability of Abdominal discomfort happening:  0.006698925809333344
Accuracy: 0.9938780122028884

Probability of Lymph node pain happening:  0.006234335567885009
Accuracy: 0.9939807301189474

Probability of Rash macular happening:  0.006420901478640823
Accuracy: 0.9941861659510652

Probability of Body temperature happening:  0.006405468474034727
Accuracy: 0.9940423608685827

Probability of Product administered to patient of inappropriate age happening:  0.005979904341550617
Accuracy: 0.9939807301189474

Probability of Gait disturbance happening:  0.006255420195849379
Accuracy: 0.9938985557861002

Probability of Rhinorrhoea happening:  0.005816726995244138
Accuracy: 0.9936931199539824

Probability of Laboratory test happening:  0.00619786003637362
Accuracy: 0.9937547507036177

Probability of No adverse event happening:  0.005572556767015035
Accuracy: 0.994103991618218

Probability of Product storage

In [23]:
# Top 10 most likely symptoms
sorted_dict = {}
sorted_keys = sorted(symptoms_dict, key=symptoms_dict.get, reverse = True)

# Get symptoms with occurences over 1000
for x in sorted_keys:
    sorted_dict[x] = symptoms_dict[x]

num = 0
print("These are the 10 mostly likely syptoms you would get")
for x in sorted_dict:
    if num == 10:
        break
    print("Probability of",x , "happening: ", sorted_dict[x])
    num += 1

These are the 10 mostly likely syptoms you would get
Probability of Headache happening:  0.16819073483129393
Probability of Chills happening:  0.13600177290668575
Probability of Pyrexia happening:  0.13435429467256885
Probability of Fatigue happening:  0.13008476368127284
Probability of Pain happening:  0.12572017835503577
Probability of Nausea happening:  0.09547399036634686
Probability of Dizziness happening:  0.08276117518559149
Probability of Pain in extremity happening:  0.07523740452338114
Probability of Injection site pain happening:  0.07213318195731219
Probability of Injection site erythema happening:  0.0666346625619717


In [None]:
import joblib
# save models
for i, model in enumerate(model_list):
    model_name_raw = Y_list[i]
    model_name = model_name_raw.replace(" ", "_").upper()
    model_path = f"../../src/models/symptoms-sklearn/symptoms_{model_name}.pkl"
    joblib.dump(model, model_path)

joblib.dump(encoder_SEX, "../../src/models/symptoms-sklearn/encoder_SEX.pkl")
joblib.dump(encoder_VAX_MANU, "../../src/models/symptoms-sklearn/encoder_VAX_MANU.pkl")

In [11]:
# load model from filesystem
model_ = joblib.load("../../src/models/symptoms-sklearn/symptoms_HEADACHE.pkl")
model_

LogisticRegression()

In [12]:
sample_ = x_train.iloc[0]
print(sample_)
sample_array_ = np.array(sample_)
sample_array_

AGE_YRS            31.0
SEX                 2.0
VAX_MANU            1.0
VAX_DOSE_SERIES     1.0
Name: 27764, dtype: float64


array([31.,  2.,  1.,  1.])

In [13]:
model_.predict_proba([sample_array_])

array([[0.8376703, 0.1623297]])