# Bayesian Data Integration

In [1]:
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from pgmpy.estimators import MaximumLikelihoodEstimator

import pandas as pd
import numpy as np

from helper import *
from parameter_estimation import *

In [2]:
# Force a fresh import of the project-local modules so that edits made to them
# on disk are picked up without restarting the kernel.
import sys

# Use pop(..., None) rather than `del`: if an earlier import attempt failed,
# the module is missing from sys.modules and `del` would raise KeyError,
# hiding the real import error when this cell is re-run.
sys.modules.pop('helper', None)
sys.modules.pop('parameter_estimation', None)
from helper import *
from parameter_estimation import *

## Working with a simplified version of the Cancer Model

<img style="float: left" src="BN.png" width="300">

In [3]:
# Ground-truth network: Cancer is the single parent of both Xray and Dyspnoea.
cancer_model = BayesianModel([
    ('Cancer', 'Xray'),
    ('Cancer', 'Dyspnoea'),
])

# Prior over Cancer; one row per state (Cancer_0, Cancer_1).
cpd_cancer = TabularCPD(
    variable='Cancer',
    variable_card=2,
    values=[[0.53], [0.47]],
)

# Conditional tables: rows are the child's states, columns the states of Cancer.
cpd_xray = TabularCPD(
    variable='Xray',
    variable_card=2,
    evidence=['Cancer'],
    evidence_card=[2],
    values=[[0.9, 0.2], [0.1, 0.8]],
)
cpd_dysp = TabularCPD(
    variable='Dyspnoea',
    variable_card=2,
    evidence=['Cancer'],
    evidence_card=[2],
    values=[[0.65, 0.3], [0.35, 0.7]],
)

# Attach all three CPDs to the network.
cancer_model.add_cpds(cpd_cancer, cpd_xray, cpd_dysp)

In [4]:
# Show the three ground-truth CPDs, one table after another.
print(cpd_cancer, cpd_xray, cpd_dysp, sep='\n')

╒══════════╤══════╕
│ Cancer_0 │ 0.53 │
├──────────┼──────┤
│ Cancer_1 │ 0.47 │
╘══════════╧══════╛
╒════════╤══════════╤══════════╕
│ Cancer │ Cancer_0 │ Cancer_1 │
├────────┼──────────┼──────────┤
│ Xray_0 │ 0.9      │ 0.2      │
├────────┼──────────┼──────────┤
│ Xray_1 │ 0.1      │ 0.8      │
╘════════╧══════════╧══════════╛
╒════════════╤══════════╤══════════╕
│ Cancer     │ Cancer_0 │ Cancer_1 │
├────────────┼──────────┼──────────┤
│ Dyspnoea_0 │ 0.65     │ 0.3      │
├────────────┼──────────┼──────────┤
│ Dyspnoea_1 │ 0.35     │ 0.7      │
╘════════════╧══════════╧══════════╛


In [5]:
# Inference engine over the ground-truth model; the queries in the next cell
# are answered with pgmpy's VariableElimination.
cancer_infer = VariableElimination(cancer_model)

In [6]:
# Queries against the ground-truth model. Each query result is indexed by the
# queried variable's name before printing.

# Marginal distribution of Dyspnoea (no evidence).
q1 = cancer_infer.query(variables=['Dyspnoea'])
print(q1['Dyspnoea'])

# Marginal distribution of Xray (no evidence).
q2 = cancer_infer.query(variables=['Xray'])
print(q2['Xray'])

# Distribution of Xray given Cancer = 1.
q3 = cancer_infer.query(variables=['Xray'], evidence={'Cancer': 1})
print(q3['Xray'])

# Distribution of Xray given Dyspnoea = 1.
q4 = cancer_infer.query(variables=['Xray'], evidence={'Dyspnoea': 1})
print(q4['Xray'])

╒════════════╤═════════════════╕
│ Dyspnoea   │   phi(Dyspnoea) │
╞════════════╪═════════════════╡
│ Dyspnoea_0 │          0.4855 │
├────────────┼─────────────────┤
│ Dyspnoea_1 │          0.5145 │
╘════════════╧═════════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.5710 │
├────────┼─────────────┤
│ Xray_1 │      0.4290 │
╘════════╧═════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.2000 │
├────────┼─────────────┤
│ Xray_1 │      0.8000 │
╘════════╧═════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.4524 │
├────────┼─────────────┤
│ Xray_1 │      0.5476 │
╘════════╧═════════════╛


#### Generate some synthetic data using Forward Sampling

In [7]:
# Sampler built on the ground-truth model; used in the next cell to draw
# forward samples.
inference_cm = BayesianModelSampling(cancer_model)

In [8]:
# Seed NumPy's global RNG so that Restart & Run All reproduces the same
# synthetic data set.
# NOTE(review): this assumes pgmpy's forward sampler draws from NumPy's global
# RNG -- confirm for the installed pgmpy version.
np.random.seed(42)

# Draw 1,000,000 joint samples from the ground-truth model as a DataFrame.
df_cancer = inference_cm.forward_sample(size=1000000, return_type='dataframe')
df_cancer.head()

Unnamed: 0,Cancer,Xray,Dyspnoea
0,1,1,0
1,1,0,0
2,0,0,0
3,0,0,0
4,1,1,0


In [9]:
# Split the synthetic data into two tables that share only the Cancer column:
# Table A carries Xray, Table B carries Dyspnoea.
df_cancer_A = df_cancer.loc[:, ['Cancer', 'Xray']].copy()
df_cancer_B = df_cancer.loc[:, ['Cancer', 'Dyspnoea']].copy()

In [10]:
# Empirical P(Cancer=1) in the full synthetic data: the mean of the Cancer column.
df_cancer_A['Cancer'].mean()

0.46996

## Working with biased data

<img style="float: left" src="biased_BN.png" width="300">

#### Generate biased subsample (Table A)
- p(Cancer=1) is much higher in this dataset
- maybe a slightly older population, and therefore more likely to get cancer

In [11]:
# Number of rows requested for each subsampled table (used for both Table A
# and Table B below).
n_samples = 500000

In [12]:
# Generate biased sample with p(Cancer = 0) = 0.3 (previously 0.53)
# NOTE(review): biased_subsample is star-imported (presumably from helper) and
# its exact contract is not visible here. The printed output is consistent
# with it returning n_samples rows in which Cancer == 0 has frequency 0.30.
df_sampled_A = biased_subsample(df_cancer_A, n_samples, 'Cancer', 0.30)
# Empirical marginals of the biased table: count of 1s divided by n_samples.
print('p(Xray=1) =',df_sampled_A['Xray'].sum()/n_samples)
print('p(Cancer=1) =',df_sampled_A['Cancer'].sum()/n_samples)

p(Xray=1) = 0.589168
p(Cancer=1) = 0.7


**Estimate parameters for Table A**

In [13]:
# Two-node network covering only the columns present in Table A.
test_model = BayesianModel([('Cancer', 'Xray')])
# Maximum-likelihood estimator for that network, bound to the biased Table A sample.
estimator = MaximumLikelihoodEstimator(test_model, df_sampled_A)

In [14]:
# Maximum-likelihood CPD estimates for both nodes of the Table A network.
cpd_est_xray = estimator.estimate_cpd('Xray')
cpd_est_cancer = estimator.estimate_cpd('Cancer')

# Parent table first, then the conditional table for Xray.
print(cpd_est_cancer, cpd_est_xray, sep='\n')

╒═══════════╤═════╕
│ Cancer(0) │ 0.3 │
├───────────┼─────┤
│ Cancer(1) │ 0.7 │
╘═══════════╧═════╛
╒═════════╤═════════════════════╤═══════════╕
│ Cancer  │ Cancer(0)           │ Cancer(1) │
├─────────┼─────────────────────┼───────────┤
│ Xray(0) │ 0.9020266666666666  │ 0.20032   │
├─────────┼─────────────────────┼───────────┤
│ Xray(1) │ 0.09797333333333333 │ 0.79968   │
╘═════════╧═════════════════════╧═══════════╛




#### Generate unbiased subsample (Table B)

- Table B keeps the original (unbiased) distribution of the synthetic data

In [15]:
# Draw a simple random subsample of Table B. A fixed random_state makes the
# draw repeatable for a given df_cancer_B instead of changing on every run.
df_sampled_B = df_cancer_B.sample(n=n_samples, random_state=42)
# Empirical marginals of the subsample: count of 1s divided by n_samples.
print('p(Dyspnoea=1) =',df_sampled_B['Dyspnoea'].sum()/n_samples)
print('p(Cancer=1) =',df_sampled_B['Cancer'].sum()/n_samples)

p(Dyspnoea=1) = 0.513048
p(Cancer=1) = 0.47017


### Bayesian Data Integration w/ Importance Sampling Correction 
-----

In [16]:
# Structure of the corrected model: same edges as the ground-truth network.
# NOTE(review): In [19] rebuilds this object before any CPDs are attached, so
# this instance is not used in between.
cancer_model_cor = BayesianModel([
    ('Cancer', 'Xray'),
    ('Cancer', 'Dyspnoea'),
])

In [17]:
# Estimate the model parameters from the two tables (Table B is passed first,
# the biased Table A second). parameter_estimation is star-imported and its
# internals are not visible here. Going by how the results are used when the
# corrected CPDs are built:
#   parent        -> P(Cancer=1)
#   b_child_y1x0  -> P(Xray=1 | Cancer=0),      b_child_y1x1 -> P(Xray=1 | Cancer=1)
#   child_y1x0    -> P(Dyspnoea=1 | Cancer=0),  child_y1x1   -> P(Dyspnoea=1 | Cancer=1)
parent, b_child_y1x0, b_child_y1x1, child_y1x0, child_y1x1 = parameter_estimation(df_sampled_B, df_sampled_A)

In [18]:
# Show the five estimated parameters, in the order they were returned.
print(parent, b_child_y1x0, b_child_y1x1, child_y1x0, child_y1x1)

0.4702 0.098 0.7997 0.349 0.6979


In [19]:
# Corrected model: same structure as the ground-truth network, with CPDs
# filled in from the estimated parameters.
cancer_model_cor = BayesianModel([
    ('Cancer', 'Xray'),
    ('Cancer', 'Dyspnoea'),
])

# Prior over Cancer: `parent` is used as the probability of state 1.
cpd_cancer_cor = TabularCPD(
    variable='Cancer',
    variable_card=2,
    values=[[1-parent], [parent]],
)

# Conditional tables: the second row holds the estimated probability of the
# child being 1 for Cancer = 0 / Cancer = 1; the first row is its complement.
cpd_xray_cor = TabularCPD(
    variable='Xray',
    variable_card=2,
    evidence=['Cancer'],
    evidence_card=[2],
    values=[[1-b_child_y1x0, 1-b_child_y1x1], [b_child_y1x0, b_child_y1x1]],
)
cpd_dysp_cor = TabularCPD(
    variable='Dyspnoea',
    variable_card=2,
    evidence=['Cancer'],
    evidence_card=[2],
    values=[[1-child_y1x0, 1-child_y1x1], [child_y1x0, child_y1x1]],
)

# Attach all three CPDs to the corrected network.
cancer_model_cor.add_cpds(cpd_cancer_cor, cpd_xray_cor, cpd_dysp_cor)

In [20]:
# Show the three CPDs of the corrected model, one table after another.
print(cpd_cancer_cor, cpd_xray_cor, cpd_dysp_cor, sep='\n')

╒══════════╤════════╕
│ Cancer_0 │ 0.5298 │
├──────────┼────────┤
│ Cancer_1 │ 0.4702 │
╘══════════╧════════╛
╒════════╤══════════╤═════════════════════╕
│ Cancer │ Cancer_0 │ Cancer_1            │
├────────┼──────────┼─────────────────────┤
│ Xray_0 │ 0.902    │ 0.20030000000000003 │
├────────┼──────────┼─────────────────────┤
│ Xray_1 │ 0.098    │ 0.7997              │
╘════════╧══════════╧═════════════════════╛
╒════════════╤══════════╤═════════════════════╕
│ Cancer     │ Cancer_0 │ Cancer_1            │
├────────────┼──────────┼─────────────────────┤
│ Dyspnoea_0 │ 0.651    │ 0.30210000000000004 │
├────────────┼──────────┼─────────────────────┤
│ Dyspnoea_1 │ 0.349    │ 0.6979              │
╘════════════╧══════════╧═════════════════════╛


In [21]:
# Inference engine over the corrected model, so the same queries as for the
# ground-truth model can be repeated.
cancer_infer_cor = VariableElimination(cancer_model_cor)

In [22]:
# Same four queries as for the ground-truth model, now against the corrected
# model. Note that q1..q4 from the earlier query cell are overwritten here.

# Marginal distribution of Dyspnoea (no evidence).
q1 = cancer_infer_cor.query(variables=['Dyspnoea'])
print(q1['Dyspnoea'])

# Marginal distribution of Xray (no evidence).
q2 = cancer_infer_cor.query(variables=['Xray'])
print(q2['Xray'])

# Distribution of Xray given Cancer = 1.
q3 = cancer_infer_cor.query(variables=['Xray'], evidence={'Cancer': 1})
print(q3['Xray'])

# Distribution of Xray given Dyspnoea = 1.
q4 = cancer_infer_cor.query(variables=['Xray'], evidence={'Dyspnoea': 1})
print(q4['Xray'])

╒════════════╤═════════════════╕
│ Dyspnoea   │   phi(Dyspnoea) │
╞════════════╪═════════════════╡
│ Dyspnoea_0 │          0.4869 │
├────────────┼─────────────────┤
│ Dyspnoea_1 │          0.5131 │
╘════════════╧═════════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.5721 │
├────────┼─────────────┤
│ Xray_1 │      0.4279 │
╘════════╧═════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.2003 │
├────────┼─────────────┤
│ Xray_1 │      0.7997 │
╘════════╧═════════════╛
╒════════╤═════════════╕
│ Xray   │   phi(Xray) │
╞════════╪═════════════╡
│ Xray_0 │      0.4532 │
├────────┼─────────────┤
│ Xray_1 │      0.5468 │
╘════════╧═════════════╛


### The results closely match those of the original model above (the small differences are due to sampling noise).