In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import random

In [3]:
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils import resample

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

from imblearn.pipeline import make_pipeline
import statsmodels.api as sm

## Loading the data

In [5]:
obs_data = pd.read_csv("observation_features.csv")
treat_data = pd.read_csv("treatment_features.csv")
action_data = pd.read_csv("treatment_actions.csv")
outcome_data = pd.read_csv("treatment_outcomes.csv")

In [6]:
cols = (['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death',
        'Age', 'Gender', 'Income'] +
         [f'Gene_{i+1:03}' for i in range(128)] +
         ['Asthma', 'Obesity', 'Smoking', 'Diabetes', 'Heart disease', 'Hypertension',
         'Vacc_1', 'Vacc_2', 'Vacc_3'])

obs_data.columns = cols
treat_data.columns = cols
outcome_data.columns = cols[:10]
action_data.columns = ['Treatment_1', 'Treatment_2']

## Slicing the data

In [7]:
symptoms = obs_data.iloc[:,0:10]
age = obs_data.iloc[:,10]
gender = obs_data.iloc[:,11]
income = obs_data.iloc[:,12]
genome = obs_data.iloc[:,13:141]
comorbidities = obs_data.iloc[:,141:147]
vaccination_status = np.array(obs_data.iloc[:,147:])

vacced = obs_data[np.sum(obs_data.iloc[:,-3:], axis=1) == 1]
vacced_neg = vacced[vacced.iloc[:,1]==0]
vacced_pos = vacced[vacced.iloc[:,1]==1]

un_vacced = obs_data[np.sum(obs_data.iloc[:,-3:], axis=1) == 0]
un_vacced_neg = un_vacced[un_vacced.iloc[:,1]==0]
un_vacced_pos = un_vacced[un_vacced.iloc[:,1]==1]

symptom_names = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death']
prior_probs= [np.sum(obs_data.iloc[:,i]) / len(obs_data) for i, key in enumerate(symptom_names)]

In [8]:
class Pipeline_observational():
    def __init__(self,X,y,clf,obs_data,random_state=None):
        self.obs_data = obs_data
        self.X = X
        self.y = y
        self.clf = clf
        self.threshold = threshold = 0.8
        self.random_state = random_state
        self.parameter_grid = parameter_grid = [{'kernel': ['poly', 'rbf'],
                                                'C': [0.01, 0.1,1, 10, 100,],
                                                'gamma': [.1, .01, 1e-3]}, ]
        
        self.symptom_names = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death']

    def run_select_features(self):
        """This function finds the selected features, then runs BIC test in order 
        to see whether the model with selected features are better than the full model"""
        
        #finding the best features:
        self.best_features = best_features = self.select_features(self.X,self.y,self.threshold)
        self.important_genes = [col for col in self.X.iloc[:,best_features].columns]
        
        ###tuning the parameters for the given clf
        #print("Classification using best features")
        #self.tune_parameters(X.iloc[:,best_features],y,clf,parameter_grid)

        #print("Classification using all features")
        #self.tune_parameters(X,y,clf,parameter_grid)
        
        
        #evalutiong th models by BIC
        #print("BIC-test:")
        #BIC_selected = self.model_evaluation(self.X.iloc[:,best_features],self.y)
        #BIC_all = self.model_evaluation(self.X,self.y)
        
        """print(f'BIC_selected: {BIC_selected}')
        print(f'BIC_all: {BIC_all}')
        
        if BIC_selected <= BIC_all:
            print("BIC: selected model is better")
        else:
            print("BIC: full model is better")"""


    def select_features(self, X, Y, threshold):
        """ Select the most important features of a data set, where X (2D)
        contains the feature data, and Y (1D) contains the target
        """
        X, Y = np.array(X), np.array(Y)

        n_features = X.shape[1]
        n_data =  X.shape[0]
        alpha_b = np.ones([n_features, 2 ])
        beta_b = np.ones([n_features, 2])
        log_p = np.zeros(n_features)

        log_null = 0
        alpha = 1
        beta = 1
        for t in range(n_data):
            p_null = alpha / (alpha + beta)
            log_null += np.log(p_null)*Y[t] + np.log(1-p_null)*(1 - Y[t])
            alpha += Y[t]
            beta += (1 - Y[t])
            for i in range(n_features):
                x_ti = int(X[t,i])
                p = alpha_b[i, x_ti] / (alpha_b[i, x_ti] + beta_b[i, x_ti])
                log_p[i] += np.log(p)*Y[t] + np.log(1-p)*(1 - Y[t])
                alpha_b[i, x_ti] += Y[t]
                beta_b[i, x_ti] += (1 - Y[t])
        log_max=np.mean(log_p)
        log_max2=np.mean(log_null)
        log_p=log_p-log_max
        log_null=log_null-log_max2
        #p = np.exp(log_p) / (np.exp(log_p) + np.exp(log_null))
        p = 1 / (np.exp(log_null - log_p) + 1)
        #print(f"{(log_p)=}\n{(log_null)=}\n{(log_p) + (log_null)=}\n {p=}")
        #print(f"{np.exp(log_p)=}\n{np.exp(log_null)=}\n{np.exp(log_p) + np.exp(log_null)=}")

        features = [i for i in range(n_features) if p[i] > threshold]

        return features

    def tune_parameters(self, X, y, clf, parameter_grid, scoring=None, cv=None):
        """ Given X, y, a classifier and a parameter grid,
        find the best parameters for the classifier and data using GridSearch
        with cross validation.
        """
        # The code below is from
        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=self.random_state)

        print(f"# Tuning hyper-parameters for {scoring=}")
        print()

        clf = GridSearchCV(    clf,
                                parameter_grid,
                                scoring=scoring,
                                n_jobs=-1,
                                cv=cv
                            ).fit(X_train, y_train)

        #piped_clf
        print("Best parameters set found on development set:")
        print()
        print(f"{clf.best_params_}, score: {clf.best_score_:.4f}")
        print()
        """print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()"""

        print("Classification report:")
        print()

        print(classification_report(y_test, clf.predict(X_test)))
        print()

    def model_evaluation(self,X,y):
        """Input: an X"""
        """Output: Bayesian information criterion - BIC(X)"""
        
        #fitting a regression in order to compute BIC
        model = sm.OLS(y,X).fit()
        return model.bic

    def find_alpha(self, beta,p):
        """ Given beta and a mean probability p, compute and return the alpha of a beta distribution. """
        return beta*p/(1-p)
    def find_efficacy(self, group_pos: pd.DataFrame, group_neg: pd.DataFrame, symptom, prior_probs):
        if isinstance(symptom, int):
            symptom_index = symptom
            symptom_name = group_pos.keys()[symptom]
        else:
            symptom_name = symptom
            symptom_index = list(group_pos.keys()).index(symptom)

        group_pos_count = np.sum(group_pos[symptom_name] * group_pos.iloc[:,1])
        group_neg_count = np.sum(group_neg[symptom_name] * group_neg.iloc[:,1])

        v = group_pos_count/len(group_pos)
        n_v = group_neg_count/len(group_neg)

        if n_v == 0:
            print(f'{v=}, {n_v=}: Division by zero')
            return

        IRR = v/n_v

        #print(v, n_v)
        efficacy = 100*(1- IRR)

        N = 100_000
        beta = 1
        p = prior_probs[symptom_index]
        alpha = self.find_alpha(beta,p)

        samples_group_pos = stats.beta.rvs(alpha + group_pos_count, beta + len(group_pos) - group_pos_count, size=N)
        samples_group_neg = stats.beta.rvs(alpha + group_neg_count, beta + len(group_neg) - group_neg_count, size=N)

        samples_ve = 100 * (1 - samples_group_pos/samples_group_neg)
        lower = np.percentile(samples_ve, 2.5)
        upper = np.percentile(samples_ve, 97.5)
        
        if efficacy >= lower and efficacy <= upper:
            status = 'not rejected'
        else:
            status = 'rejected'
            
        print(f'{symptom_name:15s}: {efficacy:3.3f} - ({lower:3.3f}, {upper:3.3f}) - {status}')
    
    def run_efficacy(self, vacced, un_vacced,prior_probs):
        for i, s in enumerate(self.symptom_names):
            self.find_efficacy(vacced,un_vacced,i,prior_probs)
        print("")
        
        """
        vacc_type1 = self.obs_data[obs_data.Vacc_1 == 1]
        vacc_type2 = self.obs_data[obs_data.Vacc_2 == 1]
        vacc_type3 = self.obs_data[obs_data.Vacc_3 == 1]
        vaccination_types = [vacc_type1,vacc_type2,vacc_type3]
        vaccination_names = ['type 1', 'type 2', 'type 3']
        
        for name in vaccination_names:
            print(name)
            index = vaccination_names.index(name)
            for i, s in enumerate(self.symptom_names):
                self.find_efficacy(vaccination_types[index],un_vacced,i,prior_probs)
            print("")
        """
        
        
    def side_effects(self, vacced_neg, un_vacced_neg, start, end):
        df = pd.DataFrame(index=vacced_neg.keys()[start:end],
                          columns = ("p1 (%)", "p2 (%)", "Diff (%)", "Credible Interval (%)", "Null Hypothesis", ),
                         )

        for i in range(start, end):
            symptom = vacced_neg.keys()[i]
            p1 = vacced_neg.sum()[i] / len(self.y) / (len(vacced_neg) / len(self.y))
            p2 = un_vacced_neg.sum()[i] / len(self.y) / (len(un_vacced_neg) / len(self.y))


            lower = (p1-p2 - 1.64 * np.sqrt((p1*(1-p1) / len(vacced_neg)) + (p2 * (1-p2) / len(un_vacced_neg))))
            higher = (p1-p2 + 1.64 * np.sqrt((p1*(1-p1) / len(vacced_neg)) + (p2 * (1-p2) / len(un_vacced_neg))))

            p1, p2, lower, higher = p1 * 100, p2 * 100, lower * 100, higher * 100

            df.loc[symptom] = np.array([round(p1, 4), round(p2, 4), round(p1 - p2, 4), (round(lower, 4), round(higher, 4)),
                               "rejected" if lower>0 else "not rejected", ],dtype=object)


        return df
    
        

In [9]:
class Pipeline_treatment():
    def __init__(self,treat_data,action_data,outcome_data):
        import warnings
        warnings.filterwarnings('ignore')

        new_treat_data = treat_data[((np.sum(treat_data.iloc[:,2:10],axis=1) > 0.0) | np.sum(outcome_data.iloc[:,2:10],axis=1) > 0.0)]
        group_first = new_treat_data[((action_data.iloc[:,0] == 1) & (action_data.iloc[:,1] == 0))]
        group_second = new_treat_data[((action_data.iloc[:,0] == 0) & (action_data.iloc[:,1] == 1))]
        group_both = new_treat_data[((action_data.iloc[:,0] == 1) & (action_data.iloc[:,1] == 1))]
        group_none = new_treat_data[((action_data.iloc[:,0] == 0) & (action_data.iloc[:,1] == 0))]

        new_outcome_data = outcome_data[((np.sum(treat_data.iloc[:,2:10],axis=1) > 0.0) | np.sum(outcome_data.iloc[:,2:10],axis=1) > 0.0)]
        outcome_first = new_outcome_data[((action_data.iloc[:,0] == 1) & (action_data.iloc[:,1] == 0))]
        outcome_second = new_outcome_data[((action_data.iloc[:,0] == 0) & (action_data.iloc[:,1] == 1))]
        outcome_both = new_outcome_data[((action_data.iloc[:,0] == 1) & (action_data.iloc[:,1] == 1))]
        outcome_none = new_outcome_data[((action_data.iloc[:,0] == 0) & (action_data.iloc[:,1] == 0))]
        prior_probs= [(np.sum(new_treat_data[sym]) + np.sum(new_outcome_data[sym]) / (len(new_treat_data) * 2) for sym in symptom_names][2:]

        for outcome_treated, pre_treated, treatment in zip([outcome_first, outcome_second, outcome_both],[group_first, group_second, group_both],['treatment 1', 'treatment 2', 'both treatments']):
            print(f"{treatment} efficacy:")
            for i, key in enumerate(outcome_data.keys()[2:]):
                #print(key)
                self.treatment_efficacy(outcome_treated, pre_treated, outcome_none, group_none, prior_probs[i], key)
            print()


    def find_alpha(self, beta,p):
        """ Given beta and a mean probability p, compute and return the alpha of a beta distribution. """
        return beta*p/(1-p)

    def treatment_efficacy(self, outcome_treated, precondition_treated, outcome_untreated, precondition_untreated, p, symptom_name, log=True):
        group_pos_count = np.sum(outcome_treated[symptom_name])
        group_neg_count = np.sum(outcome_untreated[symptom_name])

        group_pos_total = np.sum(precondition_treated[symptom_name])
        group_neg_total = np.sum(precondition_untreated[symptom_name])

        if any(v == 0 for v in (group_pos_total, group_neg_total, group_neg_count)):
            print(f'{symptom_name:15s}: Division by zero - not enough data to compute efficacy' )
            return

        v = group_pos_count / group_pos_total
        n_v = group_neg_count / group_neg_total
        IRR = v/n_v

        efficacy = 100 * (1- IRR)

        N = 100_000
        beta = 1
        alpha = self.find_alpha(beta,p)

        #symptom_name = symptom_names[symptom_index]
        samples_group_pos = stats.beta.rvs(alpha + group_pos_count, beta + len(outcome_treated) - group_pos_count, size=N)
        samples_group_neg = stats.beta.rvs(alpha + group_neg_count, beta + len(outcome_untreated) - group_neg_count, size=N)

        samples_ve = 100 * (1 - samples_group_pos/samples_group_neg)
        lower = np.percentile(samples_ve, 2.5)
        upper = np.percentile(samples_ve, 97.5)
        if log is True:
            print(f'{symptom_name:15s}: {efficacy:7.3f} - 95% CI: ({lower:3.3f}, {upper:3.3f})')

        return efficacy, (lower, upper)

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (1638845493.py, line 17)

### The experiment setup

In this section we will set up the experiment by running the pipeline with different generated data and see whether it works.

In [10]:
prior_probs= [np.sum(obs_data.iloc[:,i]) / len(obs_data) for i, key in enumerate(symptom_names)]
pipe = Pipeline_observational(genome,symptoms.iloc[:,1],SVC(),obs_data)
pipe.run_select_features()
pipe.run_efficacy(vacced, un_vacced,prior_probs)
pipe.side_effects(vacced_neg, un_vacced_neg, 2, 10)

Covid-Recovered: 28.965 - (20.082, 36.912) - not rejected
Covid-Positive : 21.464 - (19.610, 23.258) - not rejected
No-Taste/Smell : 47.521 - (41.629, 52.877) - not rejected
Fever          : 51.575 - (44.944, 57.391) - not rejected
Headache       : 45.473 - (28.899, 58.300) - not rejected
Pneumonia      : 57.810 - (51.551, 63.282) - not rejected
Stomach        : 50.542 - (25.763, 67.464) - not rejected
Myocarditis    : 44.531 - (28.477, 57.154) - not rejected
Blood-Clots    : 57.190 - (50.899, 62.763) - not rejected
Death          : 93.394 - (90.533, 95.668) - not rejected



Unnamed: 0,p1 (%),p2 (%),Diff (%),Credible Interval (%),Null Hypothesis
No-Taste/Smell,0.0812,0.0568,0.0243,"(-0.0067, 0.0554)",not rejected
Fever,9.9028,0.5081,9.3947,"(9.1613, 9.6281)",rejected
Headache,5.5788,1.053,4.5258,"(4.3287, 4.7229)",rejected
Pneumonia,0.1332,0.1437,-0.0106,"(-0.0557, 0.0345)",not rejected
Stomach,0.231,0.2574,-0.0264,"(-0.0864, 0.0336)",not rejected
Myocarditis,0.2143,0.0468,0.1675,"(0.1273, 0.2077)",rejected
Blood-Clots,0.2435,0.1103,0.1331,"(0.0847, 0.1816)",rejected
Death,0.0354,0.0,0.0354,"(0.0213, 0.0494)",rejected


In [11]:
Pipeline_treatment(treat_data,action_data,outcome_data)

NameError: name 'Pipeline_treatment' is not defined

## Generating synthetic data

In [41]:
def generate_binary_data(num_features, N, correlation=[0.9, 0.5]):
    data = np.random.choice(2, size=(N, num_features))
    df = pd.DataFrame(data)
    df["Target"] = np.zeros(N).astype(int)
    for i, cor in enumerate(correlation):
        if i >= num_features:
            break

        df["Target"] |= df.iloc[:, i] * np.random.choice(2, size=N, p=[(1-cor), cor])

    return df.iloc[:, :num_features], df["Target"]

In [54]:
def generate_genomes_symptoms(random_indecies):
    cor = [0.001 for _ in range(128)]
    for r in random_indecies:
        cor[r] = 0.6
    X,y = generate_binary_data(128,100_000, correlation=cor)
    X.columns = [f'Gene_{i+1:03}' for i in range(128)]
    y.columns = 'Covid-Positive'
    return X,y   

In [55]:
random_indecies = random.sample(range(128), 20)
genomes,symptom = generate_genomes_symptoms(random_indecies)
pipe = Pipeline_observational(genomes,symptom,SVC(),obs_data)
pipe.run_select_features()

In [56]:
pipe.best_features

[7,
 32,
 34,
 39,
 40,
 43,
 46,
 49,
 56,
 64,
 77,
 79,
 92,
 97,
 99,
 105,
 107,
 110,
 112,
 124,
 127]

In [57]:
for p in pipe.best_features: 
    if p not in random_indecies:
        print(p)

7
43
79


In [253]:
for i in range(10):
    print(f'Run nr: {i}')
    random_indecies = random.sample(range(128), 20)
    genomes,symptom = generate_genomes_symptoms(random_indecies)
    pipe = Pipeline_observational(genomes,symptom,SVC(),obs_data)
    pipe.run_select_features()
    
    for i in pipe.best_features:
        if i not in random_indecies:
            print(f'{i}: fail')


    #kjøre efficacy
    #kjøre sideeffects

Run nr: 0
Run nr: 1
Run nr: 2
Run nr: 3
Run nr: 4
Run nr: 5
Run nr: 6
Run nr: 7
Run nr: 8
Run nr: 9


## Testing the generate_vaccine

In [448]:
def generate_vaccine_data(random_indecies):
    cor = [0 for _ in range(10)]
    for r in random_indecies:
        cor[r] = 0.0
    symp_generated,vac_generated = generate_binary_data(10,100_000, correlation=cor)
    
    
    vaccines_generated = np.zeros([len(vac_generated),3])
    for i,y in enumerate(vac_generated):
        random_index = random.randint(0,2)
        if vac_generated[i] == 1:
            vaccines_generated[i][random_index] = 1
    
    #symp_generated.columns = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death']
    #vaccines_generated = pd.DataFrame(vaccines_generated,columns = ['Vacc_1', 'Vacc_2', 'Vacc_3'])
    
    return symp_generated,vaccines_generated


In [450]:
#denne funksjonen finner gjennomsnittet av antall 1-ere per per rad
def find_mean_of_1s(symp_generated):
    sum_rows = 0
    for i in range(len(symp_generated.iloc[:,])):
        sum_rows += np.sum(symp_generated.iloc[i,:])

    return sum_rows/len(symp_generated.iloc[:,])
    
print(f'generated: {find_mean_of_1s(symp_generated)}')
print(f'real: {find_mean_of_1s(symptoms)}')

generated: 4.99158
real: 0.4029940299402994


In [506]:
sym_g = np.zeros([len(vaccines_generated),10])
vac_g = np.zeros([len(vac_generated),3])

for i,s in enumerate(sym_g):    
    prob = random.uniform(0, 1)    
    if prob > 0.6:
        rand_ind = random.randint(0,9)
        s[rand_ind] = 1
    else:
        rand_ind = random.randint(0,2)
        vac_g[i][rand_ind] = 1

vac_g = pd.DataFrame(vac_g,columns = ['Vacc_1', 'Vacc_2', 'Vacc_3'])
sym_g = pd.DataFrame(sym_g,columns = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death'])
find_mean_of_1s(sym_g)

0.40119

In [531]:
symp_generated,vac_generated = generate_binary_data(1,100_000)


vac_g = np.zeros([100_000,3])
sym_g = np.zeros([100_000,10])
for i,y in enumerate(vac_generated):
    random_index = random.randint(0,2)
    if vac_generated[i] == 1:
        vac_g[i][random_index] = 1


for i,row in symp_generated.iterrows():
    random_index = random.randint(0,9)
    if symp_generated.iloc[i,0] == 1:
        sym_g[i][random_index] = 1
        
vac_g = pd.DataFrame(vac_g,columns = ['Vacc_1', 'Vacc_2', 'Vacc_3'])
sym_g = pd.DataFrame(sym_g,columns = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death'])

In [532]:
vacced = sym_g[np.sum(vac_g.iloc[:,-3:], axis=1) == 1]
un_vacced = sym_g[np.sum(vac_g.iloc[:,-3:], axis=1) == 0]
prior_probs_generated = [np.sum(sym_g.iloc[:,i]) / len(sym_g) for i, key in enumerate(sym_g.columns)]

In [534]:
pipe = Pipeline_observational(X,y,SVC(),obs_data)
pipe.run_efficacy(vacced, un_vacced,prior_probs_generated)

v=0.0, n_v=0.0: Division by zero
Covid-Positive : -1003.533 - (-1109.173, -910.154) - not rejected
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero
v=0.0, n_v=0.0: Division by zero



In [537]:
symptoms = obs_data.iloc[:,0:10]
age = obs_data.iloc[:,10]
gender = obs_data.iloc[:,11]
income = obs_data.iloc[:,12]
genome = obs_data.iloc[:,13:141]
comorbidities = obs_data.iloc[:,141:147]
vaccination_status = np.array(obs_data.iloc[:,147:])

vacced = obs_data[np.sum(obs_data.iloc[:,-3:], axis=1) == 1]
vacced_neg = vacced[vacced.iloc[:,1]==0]
vacced_pos = vacced[vacced.iloc[:,1]==1]

un_vacced = obs_data[np.sum(obs_data.iloc[:,-3:], axis=1) == 0]
un_vacced_neg = un_vacced[un_vacced.iloc[:,1]==0]
un_vacced_pos = un_vacced[un_vacced.iloc[:,1]==1]

symptom_names = ['Covid-Recovered', 'Covid-Positive', 'No-Taste/Smell', 'Fever', 'Headache', 'Pneumonia', 'Stomach', 'Myocarditis', 'Blood-Clots', 'Death']
prior_probs= [np.sum(obs_data.iloc[:,i]) / len(obs_data) for i, key in enumerate(symptom_names)]