In [3]:
import pandas as pd
import wrapt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.factors.discrete import TabularCPD
import warnings
warnings.filterwarnings('ignore')

# Base models

# Meter Leak

In [2]:
# Data 
df_ml = pd.read_csv('meter_leak_pred_data.csv')
df_ml.head()

Unnamed: 0,Prs_ds_disc,Prs_us_disc,MinTemp_disc,MaxTemp_disc,Avg_Temp_disc,Temp_range_disc,Rain_disc,meterleak_count
0,2,1,0,0,0,2,0,0
1,2,1,1,2,1,2,0,0
2,2,1,0,0,0,2,0,0
3,2,1,0,0,0,2,0,0
4,2,2,0,0,0,2,1,1


In [3]:
# Separating independent variables(predictors) and the target (dependant) variable
X_ml = df_ml[['Prs_ds_disc','Prs_us_disc','Temp_range_disc', 'Avg_Temp_disc','Rain_disc']]
y_ml = df_ml.meterleak_count

In [4]:
#Spliting the train and test sets on a 70:30 basis
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_ml, y_ml, test_size = .3)

In [5]:
# Joining the train set for model development
X_train_ml['meterleak_count'] = y_train_ml

In [6]:
#frame = df_ml[['Prs_ds_disc', 'Temp_range_disc', 'Rain_disc', 'meterleak_count']]

In [7]:
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_meterleak = BayesianModel([('Prs_ds_disc', 'meterleak_count'), ('Temp_range_disc', 'meterleak_count'), ('Rain_disc', 'meterleak_count') ])
model_meterleak.fit(X_train_ml)
#model1.get_cpds()

In [8]:
# Alligning test set with data model variables
X_test_ml = X_test_ml[['Prs_ds_disc','Temp_range_disc','Rain_disc']]

In [9]:
# Using fitted model to obtain predictions based on test set
y_pred_meterleak = model_meterleak.predict(X_test_ml)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [10]:
# Measuring the accuracy of the models based on the test set
print(accuracy_score(y_pred_meterleak, y_test_ml))
#print(accuracy_score(y_pred_meterleak2, y_test_ml))

0.8


In [11]:
# Printing a CPD with it's state names defined.
print(model_meterleak.get_cpds('Temp_range_disc'))

+--------------------+----------+
| Temp_range_disc(0) | 0.317647 |
+--------------------+----------+
| Temp_range_disc(1) | 0.352941 |
+--------------------+----------+
| Temp_range_disc(2) | 0.329412 |
+--------------------+----------+


# Pipe leak Models

In [61]:
# Data 
df_pl = pd.read_csv('pipe_leak_pred_data_lagged.csv')
df_pl.head()

Unnamed: 0,Prs_ds_disc,Prs_ds_disc_l1,Prs_us_disc,Prs_us_disc_l1,MinTemp_disc,MinTemp_disc_l1,MaxTemp_disc,MaxTemp_disc_l1,Avg_Temp_disc,Avg_Temp_disc_l1,Temp_range_disc,Temp_range_disc_l1,Rain_disc,Rain_disc_l1,pipeleak_count,pipeleak_count_l1
0,2,2,1,1,0,1,0,2,0,1,2,2,0,0,0,0
1,2,2,1,1,1,0,2,0,1,0,2,2,0,0,0,0
2,2,2,1,1,0,0,0,0,0,0,2,2,0,0,0,0
3,2,2,1,2,0,0,0,0,0,0,2,2,0,1,0,0
4,2,2,2,2,0,0,0,0,0,0,2,1,1,1,0,0


## Transition models for meter

In [62]:
# defining target for transition model
X_pl_trans = df_pl[['Prs_ds_disc','Prs_ds_disc_l1']]
y_pl_trans = df_pl.Prs_ds_disc_l1

In [63]:
#Spliting the train and test sets on a 70:30 basis for transtion model
X_train_pl_trans, X_test_pl_trans, y_train_pl_trans, y_test_pl_trans = train_test_split(X_pl, y_pl_trans, test_size = .3)

In [64]:
# Joining the train set for model development
X_train_pl_trans['Prs_ds_disc_l1'] = y_train_pl_trans

In [65]:
# transtion model
trans_model_pipeleak =  BayesianModel([('Prs_ds_disc', 'Prs_ds_disc_l1')])
trans_model_pipeleak.fit(X_train_pl_trans)

In [66]:
# Alligning test set with data model variables transition model previous pressure data
X_test_pl_trans = X_test_pl_trans[['Prs_ds_disc']]

In [67]:
# Using fitted model to obtain predictions based on test set for transtion model
y_pred_pipeleak_trans = trans_model_pipeleak.predict(X_test_pl_trans)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [68]:
# Measuring the accuracy of the models based on the test set
# transtion model
print(accuracy_score(y_pred_pipeleak_trans, y_test_pl_trans))

0.8545454545454545


## Base Model

In [69]:
# Separating independent variables(predictors) and the target (dependant) variable
X_pl = df_pl[['Prs_ds_disc','pipeleak_count']]
y_pl = df_pl.pipeleak_count

In [70]:
#Spliting the train and test sets on a 70:30 basis
X_train_pl, X_test_pl, y_train_pl, y_test_pl = train_test_split(X_pl, y_pl, test_size = .3)

In [71]:
# Joining the train set for model development
X_train_pl['pipeleak_count'] = y_train_pl

In [72]:
# Base model
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_pipeleak = BayesianModel([('Prs_ds_disc', 'pipeleak_count')])
model_pipeleak.fit(X_train_pl)
#model1.get_cpds()

In [73]:
# Alligning test set with data model variables base model 
X_test_pl = X_test_pl[['Prs_ds_disc']]

In [74]:
# Using fitted model to obtain predictions based on test set
y_pred_pipeleak = model_pipeleak.predict(X_test_pl)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [75]:
# Measuring the accuracy of the models based on the test set
#Base model
print(accuracy_score(y_pred_pipeleak, y_test_pl))


0.8818181818181818


## Combining models

In [None]:
# combine for modelling
future_pred = model_pipeleak.predict(trans_model_pipeleak.predict(X_test_pl_trans))

In [58]:
trans_pred = trans_model_pipeleak.predict(X_test_pl_trans)
trans_pred.rename(columns={'Prs_ds_disc_l1': 'Prs_ds_disc'}, inplace=True)


In [59]:
trans_pred.head()

Unnamed: 0,Prs_ds_disc
125,1
66,1
347,0
252,0
203,1


In [60]:
future_pred = model_pipeleak.predict(trans_pred)

In [None]:
print(accuracy_score(future_pred, y_test_pl))

In [31]:
# Printing a CPD with it's state names defined.
print(model_pipeleak.get_cpds('Prs_ds_disc_l1'))

+-------------------+----------------------+---------------------+----------------------+
| Prs_ds_disc       | Prs_ds_disc(0)       | Prs_ds_disc(1)      | Prs_ds_disc(2)       |
+-------------------+----------------------+---------------------+----------------------+
| Prs_ds_disc_l1(0) | 0.9069767441860465   | 0.10975609756097561 | 0.011627906976744186 |
+-------------------+----------------------+---------------------+----------------------+
| Prs_ds_disc_l1(1) | 0.08139534883720931  | 0.7439024390243902  | 0.19767441860465115  |
+-------------------+----------------------+---------------------+----------------------+
| Prs_ds_disc_l1(2) | 0.011627906976744186 | 0.14634146341463414 | 0.7906976744186046   |
+-------------------+----------------------+---------------------+----------------------+


# Total daily leaks

In [52]:
# Data 
df_tl = pd.read_csv('total_leak_pred_data.csv')
df_tl.head()

Unnamed: 0,Prs_ds_disc,Prs_us_disc,MinTemp_disc,MaxTemp_disc,Avg_Temp_disc,Temp_range_disc,Rain_disc,total_leak_count
0,2,1,0,0,0,2,0,0
1,2,1,1,2,1,2,0,0
2,2,1,0,0,0,2,0,0
3,2,1,0,0,0,2,0,0
4,2,2,0,0,0,2,1,1


In [53]:
# Separating independent variables(predictors) and the target (dependant) variable
X_tl = df_tl[['Prs_ds_disc','Prs_us_disc','Temp_range_disc', 'Avg_Temp_disc','Rain_disc']]
y_tl = df_tl.total_leak_count

In [54]:
#Spliting the train and test sets on a 70:30 basis
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size = .3)

In [55]:
# Joining the train set for model development
X_train_tl['total_leak_count'] = y_train_tl

In [56]:
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_totalleak = BayesianModel([('Prs_ds_disc', 'total_leak_count'), ('Temp_range_disc', 'total_leak_count'), ('Rain_disc', 'total_leak_count') ])
model_totalleak.fit(X_train_tl)
#model1.get_cpds()

In [57]:
# Alligning test set with data model variables
X_test_tl = X_test_tl[['Prs_ds_disc','Temp_range_disc','Rain_disc']]

In [58]:
# Using fitted model to obtain predictions based on test set
y_pred_totalleak = model_totalleak.predict(X_test_tl)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [59]:
# Measuring the accuracy of the models based on the test set
print(accuracy_score(y_pred_totalleak, y_test_tl))


0.6


In [60]:
# Printing a CPD with it's state names defined.
print(model_totalleak.get_cpds('total_leak_count'))

+---------------------+---------------------+---------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+---------------------+---------------------+---------------------+--------------------+--------------------+---------------------+--------------------+--------------------+---------------------+---------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
| Prs_ds_disc         | Prs_ds_disc(0)      | Prs_ds_disc(0)      | Prs_ds_disc(0)      | Prs_ds_disc(0)     | Prs_ds_disc(0)     | Prs_ds_disc(0)     | Prs_ds_disc(0)     | Prs_ds_disc(0)     | Prs_ds_disc(0)      | Prs_ds_disc(1)     | Prs_ds_disc(1)      | Prs_ds_disc(1)      | Prs_ds_disc(1)      | Prs_ds_disc(1)     | Prs_ds_disc(1)     | Prs_ds_disc(1)      | Prs_ds_disc(1)     | Prs_ds_di

In [61]:
cpd = model_totalleak.get_cpds('total_leak_count')

In [62]:
type(cpd)

pgmpy.factors.discrete.CPD.TabularCPD

# With transition

In [16]:
from itertools import combinations
from collections import defaultdict
import numpy as np
import networkx as nx
from pgmpy.factors.discrete import TabularCPD
from pgmpy.base import DirectedGraph
from pgmpy.models import DynamicBayesianNetwork as DBN
from pgmpy.factors.discrete import TabularCPD

In [17]:
dbn = DBN()

In [None]:
cpd_pr = model_pipeleak.get_cpds('Prs_ds_disc')
cpd_prl = model_pipeleak.get_cpds('Prs_ds_disc_l1')
cpd_plc = model_pipeleak.get_cpds('pipeleak_count')


In [51]:
dbn.add_edges_from([(('Prs_ds_disc', 0), ('pipe_leak_count', 0)), (('Temp_range_disc', 0), ('total_leak_count', 0)),
                    (('Rain_disc', 0), ('total_leak_count', 0)), (('Prs_ds_disc', 0), ('Prs_ds_disc', 1)), 
                    (('Temp_range_disc', 0), ('Temp_range_disc', 1)), (('Rain_disc', 0), ('Rain_disc', 1))])

In [21]:
dbn.add_edges_from([(('D', 0),('G', 0)),(('I', 0),('G', 0)),(('D', 0),('D', 1)),(('I', 0),('I', 1))])
grade_cpd = TabularCPD(('G', 0), 3, [[0.3, 0.05, 0.9, 0.5],[0.4, 0.25, 0.8, 0.03],[0.3, 0.7, 0.02, 0.2]],
                                evidence=[('I', 0),('D', 0)],
                                evidence_card=[2, 2])
d_i_cpd = TabularCPD(('D',1), 2, [[0.6, 0.3],[0.4, 0.7]],evidence=[('D',0)],evidence_card=[2])
diff_cpd = TabularCPD(('D', 0), 2, [[0.6, 0.4]])
intel_cpd = TabularCPD(('I', 0), 2, [[0.7, 0.3]])
i_i_cpd = TabularCPD(('I', 1), 2, [[0.5, 0.4],[0.5, 0.6]], evidence=[('I', 0)], evidence_card=[2])
dbn.add_cpds(grade_cpd, d_i_cpd, diff_cpd, intel_cpd, i_i_cpd)
dbn.get_cpds()

[]