In [1]:
import pandas as pd
import wrapt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.factors.discrete import TabularCPD
import warnings
warnings.filterwarnings('ignore')

# Base models

# Meter Leak

In [2]:
# Data 
df_ml = pd.read_csv('meter_leak_pred_data.csv')
df_ml.head()

Unnamed: 0,Prs_ds_disc,Prs_us_disc,MinTemp_disc,MaxTemp_disc,Avg_Temp_disc,Temp_range_disc,Rain_disc,meterleak_count
0,2,1,0,0,0,2,0,0
1,2,1,1,2,1,2,0,0
2,2,1,0,0,0,2,0,0
3,2,1,0,0,0,2,0,0
4,2,2,0,0,0,2,1,1


In [3]:
# Separating independent variables(predictors) and the target (dependant) variable
X_ml = df_ml[['Prs_ds_disc','Prs_us_disc','Temp_range_disc', 'Avg_Temp_disc','Rain_disc']]
y_ml = df_ml.meterleak_count

In [4]:
#Spliting the train and test sets on a 70:30 basis
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_ml, y_ml, test_size = .3)

In [5]:
# Joining the train set for model development
X_train_ml['meterleak_count'] = y_train_ml

In [8]:
#frame = df_ml[['Prs_ds_disc', 'Temp_range_disc', 'Rain_disc', 'meterleak_count']]

In [6]:
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_meterleak = BayesianModel([('Prs_ds_disc', 'meterleak_count'), ('Temp_range_disc', 'meterleak_count'), ('Rain_disc', 'meterleak_count') ])
model_meterleak.fit(X_train_ml)
#model1.get_cpds()

In [9]:
# Alligning test set with data model variables
X_test_ml = X_test_ml[['Prs_ds_disc','Temp_range_disc','Rain_disc']]

In [10]:
# Using fitted model to obtain predictions based on test set
y_pred_meterleak = model_meterleak.predict(X_test_ml)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [12]:
# Measuring the accuracy of the models based on the test set
print(accuracy_score(y_pred_meterleak, y_test_ml))
#print(accuracy_score(y_pred_meterleak2, y_test_ml))

0.8363636363636363


In [13]:
# Printing a CPD with it's state names defined.
print(model_meterleak.get_cpds('Temp_range_disc'))

+--------------------+----------+
| Temp_range_disc(0) | 0.34902  |
+--------------------+----------+
| Temp_range_disc(1) | 0.305882 |
+--------------------+----------+
| Temp_range_disc(2) | 0.345098 |
+--------------------+----------+


# Pipe leak

In [16]:
# Data 
df_pl = pd.read_csv('pipe_leak_pred_data.csv')
df_pl.head()

Unnamed: 0,Prs_ds_disc,Prs_us_disc,MinTemp_disc,MaxTemp_disc,Avg_Temp_disc,Temp_range_disc,Rain_disc,pipeleak_count
0,2,1,0,0,0,2,0,0
1,2,1,1,2,1,2,0,0
2,2,1,0,0,0,2,0,0
3,2,1,0,0,0,2,0,0
4,2,2,0,0,0,2,1,0


In [18]:
# Separating independent variables(predictors) and the target (dependant) variable
X_pl = df_pl[['Prs_ds_disc','Prs_us_disc','Temp_range_disc', 'Avg_Temp_disc','Rain_disc']]
y_pl = df_pl.pipeleak_count

In [19]:
#Spliting the train and test sets on a 70:30 basis
X_train_pl, X_test_pl, y_train_pl, y_test_pl = train_test_split(X_pl, y_pl, test_size = .3)

In [20]:
# Joining the train set for model development
X_train_pl['meterleak_count'] = y_train_pl

In [21]:
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_pipeleak = BayesianModel([('Prs_ds_disc', 'meterleak_count'), ('Temp_range_disc', 'meterleak_count'), ('Rain_disc', 'meterleak_count') ])
model_pipeleak.fit(X_train_pl)
#model1.get_cpds()

In [22]:
# Alligning test set with data model variables
X_test_pl = X_test_pl[['Prs_ds_disc','Temp_range_disc','Rain_disc']]

In [23]:
# Using fitted model to obtain predictions based on test set
y_pred_pipeleak = model_pipeleak.predict(X_test_pl)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [24]:
# Measuring the accuracy of the models based on the test set
print(accuracy_score(y_pred_pipeleak, y_test_pl))


0.8727272727272727


In [25]:
# Printing a CPD with it's state names defined.
print(model_pipeleak.get_cpds('Temp_range_disc'))

+--------------------+----------+
| Temp_range_disc(0) | 0.321569 |
+--------------------+----------+
| Temp_range_disc(1) | 0.34902  |
+--------------------+----------+
| Temp_range_disc(2) | 0.329412 |
+--------------------+----------+


# Total daily leaks

In [36]:
# Data 
df_tl = pd.read_csv('total_leak_pred_data.csv')
df_tl.head()

Unnamed: 0,Prs_ds_disc,Prs_us_disc,MinTemp_disc,MaxTemp_disc,Avg_Temp_disc,Temp_range_disc,Rain_disc,total_leak_count
0,2,1,0,0,0,2,0,0
1,2,1,1,2,1,2,0,0
2,2,1,0,0,0,2,0,0
3,2,1,0,0,0,2,0,0
4,2,2,0,0,0,2,1,1


In [37]:
# Separating independent variables(predictors) and the target (dependant) variable
X_tl = df_tl[['Prs_ds_disc','Prs_us_disc','Temp_range_disc', 'Avg_Temp_disc','Rain_disc']]
y_tl = df_tl.total_leak_count

In [38]:
#Spliting the train and test sets on a 70:30 basis
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size = .3)

In [39]:
# Joining the train set for model development
X_train_tl['total_leak_count'] = y_train_tl

In [40]:
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
model_totalleak = BayesianModel([('Prs_ds_disc', 'total_leak_count'), ('Temp_range_disc', 'total_leak_count'), ('Rain_disc', 'total_leak_count') ])
model_totalleak.fit(X_train_tl)
#model1.get_cpds()

In [41]:
# Alligning test set with data model variables
X_test_tl = X_test_tl[['Prs_ds_disc','Temp_range_disc','Rain_disc']]

In [42]:
# Using fitted model to obtain predictions based on test set
y_pred_totalleak = model_totalleak.predict(X_test_tl)
#y_pred_meterleak2 = model_meterleak2.predict(X_test_ml)

In [43]:
# Measuring the accuracy of the models based on the test set
print(accuracy_score(y_pred_totalleak, y_test_tl))


0.5909090909090909


In [35]:
# Printing a CPD with it's state names defined.
print(model_totalleak.get_cpds('Temp_range_disc'))

+--------------------+----------+
| Temp_range_disc(0) | 0.345098 |
+--------------------+----------+
| Temp_range_disc(1) | 0.329412 |
+--------------------+----------+
| Temp_range_disc(2) | 0.32549  |
+--------------------+----------+


# Transtion models

In [None]:
# lagged
df_tl['Prs_ds_disc_1'] = df_tl['Prs_ds_disc'].shift(1)
df_tl['Prs_ds_disc_1'] = df_tl['Prs_ds_disc'].shift(1)
df_tl['Prs_ds_disc_1'] = df_tl['Prs_ds_disc'].shift(1)