In [1]:
from pgmpy.models import BayesianModel

cancer_model = BayesianModel([('Pollution', 'Cancer'), ('Smoker', 'Cancer'),
                              ('Cancer', 'Xray'), ('Cancer', 'Dyspnoea')])

In [2]:
print(cancer_model)




In [3]:
cancer_model.nodes()

NodeView(('Pollution', 'Cancer', 'Smoker', 'Xray', 'Dyspnoea'))

In [4]:
cancer_model.edges()

OutEdgeView([('Pollution', 'Cancer'), ('Cancer', 'Xray'), ('Cancer', 'Dyspnoea'), ('Smoker', 'Cancer')])

In [5]:
cancer_model.get_cpds()

[]

In [6]:
from pgmpy.factors.discrete import TabularCPD

cpd_poll = TabularCPD(variable='Pollution',
                      variable_card=2,
                      values=[[0.9], [0.1]])
cpd_smoke = TabularCPD(variable='Smoker',
                       variable_card=2,
                       values=[[0.3], [0.7]])
cpd_cancer = TabularCPD(variable='Cancer',
                        variable_card=2,
                        values=[[0.03, 0.05, 0.001, 0.02],
                                [0.97, 0.95, 0.999, 0.98]],
                        evidence=['Smoker', 'Pollution'],
                        evidence_card=[2, 2])
cpd_xray = TabularCPD(variable='Xray',
                      variable_card=2,
                      values=[[0.9, 0.2], [0.1, 0.8]],
                      evidence=['Cancer'],
                      evidence_card=[2])
cpd_dysp = TabularCPD(variable='Dyspnoea',
                      variable_card=2,
                      values=[[0.65, 0.3], [0.35, 0.7]],
                      evidence=['Cancer'],
                      evidence_card=[2])

In [7]:
# Associating the parameters with the model structure.
cancer_model.add_cpds(cpd_poll, cpd_smoke, cpd_cancer, cpd_xray, cpd_dysp)
# Checking if the cpds are valid for the model.
cancer_model.check_model()

True

In [8]:
cancer_model.get_cpds()

[<TabularCPD representing P(Pollution:2) at 0x7f0ebb00c6a0>,
 <TabularCPD representing P(Smoker:2) at 0x7f0ebb00c668>,
 <TabularCPD representing P(Cancer:2 | Smoker:2, Pollution:2) at 0x7f0ebb00c630>,
 <TabularCPD representing P(Xray:2 | Cancer:2) at 0x7f0ebb00c748>,
 <TabularCPD representing P(Dyspnoea:2 | Cancer:2) at 0x7f0ebb00c780>]

In [9]:
print(cancer_model.get_cpds('Pollution'))
print(cancer_model.get_cpds('Smoker'))
print(cancer_model.get_cpds('Xray'))
print(cancer_model.get_cpds('Dyspnoea'))
print(cancer_model.get_cpds('Cancer'))

+-------------+-----+
| Pollution_0 | 0.9 |
+-------------+-----+
| Pollution_1 | 0.1 |
+-------------+-----+
+----------+-----+
| Smoker_0 | 0.3 |
+----------+-----+
| Smoker_1 | 0.7 |
+----------+-----+
+--------+----------+----------+
| Cancer | Cancer_0 | Cancer_1 |
+--------+----------+----------+
| Xray_0 | 0.9      | 0.2      |
+--------+----------+----------+
| Xray_1 | 0.1      | 0.8      |
+--------+----------+----------+
+------------+----------+----------+
| Cancer     | Cancer_0 | Cancer_1 |
+------------+----------+----------+
| Dyspnoea_0 | 0.65     | 0.3      |
+------------+----------+----------+
| Dyspnoea_1 | 0.35     | 0.7      |
+------------+----------+----------+
+-----------+-------------+-------------+-------------+-------------+
| Smoker    | Smoker_0    | Smoker_0    | Smoker_1    | Smoker_1    |
+-----------+-------------+-------------+-------------+-------------+
| Pollution | Pollution_0 | Pollution_1 | Pollution_0 | Pollution_1 |
+-----------+------------

In [10]:
cancer_model.local_independencies('Xray')
cancer_model.local_independencies('Pollution')
cancer_model.local_independencies('Smoker')
cancer_model.local_independencies('Dyspnoea')
cancer_model.local_independencies('Cancer')

(Cancer _|_ Xray, Dyspnoea | Smoker, Pollution)

In [11]:
cancer_model.get_independencies()

(Pollution _|_ Smoker)
(Pollution _|_ Xray, Dyspnoea | Cancer)
(Pollution _|_ Xray, Dyspnoea | Cancer, Smoker)
(Pollution _|_ Dyspnoea | Cancer, Xray)
(Pollution _|_ Xray | Cancer, Dyspnoea)
(Pollution _|_ Dyspnoea | Cancer, Smoker, Xray)
(Pollution _|_ Xray | Cancer, Smoker, Dyspnoea)
(Smoker _|_ Pollution)
(Smoker _|_ Xray, Dyspnoea | Cancer)
(Smoker _|_ Xray, Dyspnoea | Cancer, Pollution)
(Smoker _|_ Dyspnoea | Cancer, Xray)
(Smoker _|_ Xray | Cancer, Dyspnoea)
(Smoker _|_ Dyspnoea | Cancer, Xray, Pollution)
(Smoker _|_ Xray | Cancer, Pollution, Dyspnoea)
(Xray _|_ Smoker, Pollution, Dyspnoea | Cancer)
(Xray _|_ Pollution, Dyspnoea | Cancer, Smoker)
(Xray _|_ Smoker, Dyspnoea | Cancer, Pollution)
(Xray _|_ Smoker, Pollution | Cancer, Dyspnoea)
(Xray _|_ Dyspnoea | Cancer, Smoker, Pollution)
(Xray _|_ Pollution | Cancer, Smoker, Dyspnoea)
(Xray _|_ Smoker | Cancer, Pollution, Dyspnoea)
(Dyspnoea _|_ Smoker, Pollution, Xray | Cancer)
(Dyspnoea _|_ Pollution, Xray | Cancer, Smoker)
(Dy

In [12]:
# Doing exact inference using Variable Elimination
from pgmpy.inference import VariableElimination

cancer_infer = VariableElimination(cancer_model)

In [13]:
# Computing the probability of bronc given smoke.
q = cancer_infer.query(variables=['Cancer'], evidence={'Smoker': 1})
print(q['Cancer'])

+----------+---------------+
| Cancer   |   phi(Cancer) |
| Cancer_0 |        0.0029 |
+----------+---------------+
| Cancer_1 |        0.9971 |
+----------+---------------+


  phi1.values = phi1.values[slice_]


In [14]:
# Computing the probability of bronc given smoke.
q = cancer_infer.query(variables=['Cancer'],
                       evidence={
                           'Smoker': 1,
                           'Pollution': 1
                       })
print(q['Cancer'])

+----------+---------------+
| Cancer   |   phi(Cancer) |
| Cancer_0 |        0.0200 |
+----------+---------------+
| Cancer_1 |        0.9800 |
+----------+---------------+


In [15]:
import sys
import urllib
from urllib.request import urlopen

import matplotlib.pyplot as plt  # Visuals
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skl

In [16]:
Cleveland_data_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
np.set_printoptions(
    threshold=sys.maxsize)  #see a whole array when we output it
names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal', 'heartdisease'
]
heartDisease = pd.read_csv(urlopen(Cleveland_data_URL),
                           names=names)  #gets Cleveland data

In [17]:
del heartDisease['ca']
del heartDisease['slope']
del heartDisease['thal']
del heartDisease['oldpeak']

heartDisease = heartDisease.replace('?', np.nan)

In [18]:
from pgmpy.estimators import BayesianEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel

model = BayesianModel([('age', 'trestbps'), ('age', 'fbs'),
                       ('sex', 'trestbps'), ('sex', 'trestbps'),
                       ('exang', 'trestbps'), ('trestbps', 'heartdisease'),
                       ('fbs', 'heartdisease'), ('heartdisease', 'restecg'),
                       ('heartdisease', 'thalach'), ('heartdisease', 'chol')])

# Learing CPDs using Maximum Likelihood Estimators
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)
#for cpd in model.get_cpds():
#   print("CPD of {variable}:".format(variable=cpd.variable))
#  print(cpd)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  states = sorted(list(self.data.ix[:, variable].dropna().unique()))
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  state_count_data = data.ix[:, variable].value_counts()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  state_counts.ix[:, (state_counts == 0).all()] = 1


In [19]:
print(model.get_cpds('age'))
print(model.get_cpds('chol'))
print(model.get_cpds('sex'))
model.get_independencies()

+---------+------------+
| age(28) | 0.00383142 |
+---------+------------+
| age(29) | 0.00383142 |
+---------+------------+
| age(30) | 0.00383142 |
+---------+------------+
| age(31) | 0.00766284 |
+---------+------------+
| age(32) | 0.0153257  |
+---------+------------+
| age(33) | 0.00766284 |
+---------+------------+
| age(34) | 0.0153257  |
+---------+------------+
| age(35) | 0.0191571  |
+---------+------------+
| age(36) | 0.0191571  |
+---------+------------+
| age(37) | 0.0306513  |
+---------+------------+
| age(38) | 0.0191571  |
+---------+------------+
| age(39) | 0.0344828  |
+---------+------------+
| age(40) | 0.0191571  |
+---------+------------+
| age(41) | 0.0383142  |
+---------+------------+
| age(42) | 0.0268199  |
+---------+------------+
| age(43) | 0.0421456  |
+---------+------------+
| age(44) | 0.0268199  |
+---------+------------+
| age(45) | 0.0229885  |
+---------+------------+
| age(46) | 0.045977   |
+---------+------------+
| age(47) | 0.0344828  |


(age _|_ sex, exang)
(age _|_ sex, exang | fbs)
(age _|_ sex | exang)
(age _|_ exang | sex)
(age _|_ restecg, chol, thalach | heartdisease)
(age _|_ sex | exang, fbs)
(age _|_ exang | sex, fbs)
(age _|_ restecg, chol, heartdisease, thalach | trestbps, fbs)
(age _|_ restecg, chol, thalach | heartdisease, fbs)
(age _|_ restecg, chol | heartdisease, thalach)
(age _|_ chol, thalach | restecg, heartdisease)
(age _|_ restecg, chol, thalach | exang, heartdisease)
(age _|_ restecg, thalach | chol, heartdisease)
(age _|_ restecg, chol, thalach | sex, heartdisease)
(age _|_ restecg, chol, thalach | trestbps, heartdisease)
(age _|_ restecg, chol, heartdisease | trestbps, fbs, thalach)
(age _|_ restecg, chol | heartdisease, fbs, thalach)
(age _|_ chol, heartdisease, thalach | restecg, trestbps, fbs)
(age _|_ chol, thalach | restecg, heartdisease, fbs)
(age _|_ restecg, chol, heartdisease, thalach | trestbps, exang, fbs)
(age _|_ restecg, chol, thalach | heartdisease, exang, fbs)
(age _|_ restecg, 

In [20]:
from pgmpy.inference import VariableElimination

HeartDisease_infer = VariableElimination(model)

In [21]:
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'age': 28})
print(q['heartdisease'])

+----------------+---------------------+
| heartdisease   |   phi(heartdisease) |
| heartdisease_0 |              0.6333 |
+----------------+---------------------+
| heartdisease_1 |              0.3667 |
+----------------+---------------------+


  phi1.values = phi1.values[slice_]
  phi.values = phi.values[slice_]


In [22]:
q = HeartDisease_infer.query(variables=['heartdisease'],
                             evidence={'chol': 100})
print(q['heartdisease'])

+----------------+---------------------+
| heartdisease   |   phi(heartdisease) |
| heartdisease_0 |              1.0000 |
+----------------+---------------------+
| heartdisease_1 |              0.0000 |
+----------------+---------------------+
