In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from hmmlearn import hmm
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
import functions_data_processing as fsd
from sklearn import svm
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from itertools import combinations
from sklearn.preprocessing import LabelBinarizer

In [9]:
# Load df.csv and recode the in-memory sensor readings so that the absent state is 0
# (instead of 0.1) and the present state is 1 (instead of 0.9).
# A fourth column holds the Microwave and Default values combined into a two-character
# code: first character = Microwave, second character = Default.

df_imported = pd.read_csv('df.csv', index_col=0)
df_imported.index = pd.to_datetime(df_imported.index)

# Copy only the columns used in this notebook; the combined column is inserted below.
df = df_imported[['Microwave', 'Default', 'Ground Truth', 'Garage door']].copy()

df = df.mask(df == 0.9, 1)
df = df.mask(df == 0.1, 0)

microwave_on = df['Microwave'] == 1.0
microwave_off = df['Microwave'] == 0.0
default_on = df['Default'] == 1.0
default_off = df['Default'] == 0.0

mask1 = microwave_on & default_on
mask2 = microwave_off & default_on
mask3 = microwave_off & default_off
mask4 = microwave_on & default_off

# Rows matching none of the four masks keep the empty string.
combined = pd.Series('', index=df.index)
for rows, code in ((mask1, '11'), (mask2, '01'), (mask3, '00'), (mask4, '10')):
    combined = combined.mask(rows, code)

# Position 3 places the combined column between 'Ground Truth' and 'Garage door'.
df.insert(3, 'Microwave - Default (combined)', combined)
df.iloc[:1025]

Unnamed: 0,Microwave,Default,Ground Truth,Microwave - Default (combined),Garage door
2016-01-01 05:00:00,0.0,1.0,1.0,01,0.0
2016-01-01 05:01:00,0.0,1.0,1.0,01,0.0
2016-01-01 05:02:00,0.0,1.0,1.0,01,0.0
2016-01-01 05:03:00,0.0,1.0,1.0,01,0.0
2016-01-01 05:04:00,0.0,1.0,1.0,01,0.0
...,...,...,...,...,...
2016-01-01 22:00:00,0.0,1.0,1.0,01,0.0
2016-01-01 22:01:00,0.0,1.0,1.0,01,0.0
2016-01-01 22:02:00,0.0,1.0,1.0,01,0.0
2016-01-01 22:03:00,0.0,1.0,1.0,01,0.0


In [10]:
# A function that selects a section of the data frame based on a starting date and interval

def interval_model(data_frame, starting_date, interval):
    '''Return the rows of a datetime-indexed frame/series inside a time window.

    Parameters
    ----------
    data_frame : pandas.DataFrame or pandas.Series
        Object that is sliced with ``.loc`` using timestamp labels.
    starting_date : str or datetime-like
        Start of the window, e.g. ``'2016-02-01 05:00:00'``. Fractional
        seconds (``'2016-02-01 05:00:00.000000'``) are accepted but optional.
    interval : datetime.timedelta
        Length of the window, added to the parsed start.

    Returns
    -------
    Same type as ``data_frame``
        The rows from ``starting_date`` to ``starting_date + interval``.
        Label slicing with ``.loc`` includes both end points.
    '''
    try:
        s_date = pd.to_datetime(starting_date, format='%Y-%m-%d %H:%M:%S.%f')
    except ValueError:
        # The explicit format demands fractional seconds; recent pandas enforces
        # it strictly, so strings such as '2016-02-01 05:00:00' are rejected.
        # Fall back to pandas' general parser for those.
        s_date = pd.to_datetime(starting_date)
    return data_frame.loc[s_date : s_date + interval]

In [11]:
# Example use of interval_model: one week of the combined Microwave/Default codes,
# starting on 2016-02-01 at 05:00.
df_model = interval_model(
    data_frame=df['Microwave - Default (combined)'],
    starting_date='2016-02-01 05:00:00',
    interval=datetime.timedelta(days=7),
)

df_model.iloc[:]

2016-02-01 05:00:00    01
2016-02-01 05:01:00    01
2016-02-01 05:02:00    01
2016-02-01 05:03:00    01
2016-02-01 05:04:00    01
                       ..
2016-02-08 04:56:00    01
2016-02-08 04:57:00    01
2016-02-08 04:58:00    01
2016-02-08 04:59:00    01
2016-02-08 05:00:00    01
Name: Microwave - Default (combined), Length: 10081, dtype: object

In [12]:
# The following cells are used just for testing and getting a feel for how the HMM model is deployed and can be ignored 



def sequence2counts(status, ostates2id):
    '''Count how often each observable state occurs in a sequence.

    Parameters
    ----------
    status : list
        Sequence of observed states; it must provide a ``count`` method.
    ostates2id : dict
        Mapping whose keys are the observable states to count. Only the keys
        are used; the result follows their iteration order.

    Returns
    -------
    list of int
        A flat list with one count per key of ``ostates2id``.
    '''
    return [status.count(state) for state in ostates2id]


# Hidden states of the smart-home occupancy model.
h_states = ['absent', 'present']  # not at home, at home
id2hstates = dict(enumerate(h_states))  # index -> hidden-state name

# Initial distribution over the hidden states (uniform).
start_probs = np.full(len(h_states), 0.5)

# Observable states: the combined Microwave/Default codes.
# Character 1: Microwave on/off
# Character 2: Default on/off
o_states = ['00', '01', '10', '11']
id2ostates = {code: position for position, code in enumerate(o_states)}  # code -> index

# Emission probabilities: one row per hidden state, one column per observable state.
emission_probs = np.array([
    [0.25, 0.1, 0.4, 0.25],
    [0.2, 0.5, 0.1, 0.2],
])

# Transition matrix between hidden states (uniform).
trans_mat = np.full((2, 2), 0.5)

In [13]:

# Split the week of minute-level observations into 7*24 hourly chunks of 60 samples.
# The series is converted to a list once, and every chunk (including the last one)
# is produced by the same slice expression.
_minute_values = df_model.tolist()
lst = [_minute_values[hour * 60:(hour + 1) * 60] for hour in range(7 * 24)]

print(len(lst[167]))

60


In [14]:
observations = lst

# Turn every hourly chunk into a row of counts, one column per observable state.
X = [sequence2counts(chunk, id2ostates) for chunk in observations]
data = np.array(X, dtype=int)

# Number of draws per row, taken from the length of the first chunk.
n_trials = len(observations[0])

# Build the model. init_params='' asks hmmlearn not to initialise any parameter
# itself, so the matrices assigned below are the starting point for fit().
model = hmm.MultinomialHMM(
    n_components=len(h_states),
    n_trials=n_trials,
    init_params='',
)
model.n_features = len(o_states)
model.startprob_ = start_probs
model.transmat_ = trans_mat
model.emissionprob_ = emission_probs

model.fit(data)

# Most likely hidden-state sequence for the same data, with its log probability.
logprob, state_ests = model.decode(data)

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


In [15]:
# Report the decoded hidden states and the parameters learned by fit().
print(
    "Estimated states",
    [id2hstates[state_id] for state_id in state_ests],
    '------------------------------------',
    "Learned emission probs:",
    model.emissionprob_,
    '------------------------------------',
    "Learned transition matrix:",
    model.transmat_,
    sep='\n',
)

Estimated states
['present', 'present', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'absent', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'present', 'pr

In [16]:
# Ground-truth occupancy for one week at one-minute resolution.
# NOTE(review): this window starts on 2016-01-01, whereas df_model above was sliced
# starting 2016-02-01 — confirm that the labels and the observations are meant to
# come from different weeks.
y_true_original = interval_model(data_frame= df['Ground Truth'], starting_date='2016-01-01 05:00:00', interval=datetime.timedelta(days=7))

# NOTE(review): no random_state is passed, so the splits differ from run to run.
ss = ShuffleSplit(n_splits=25, test_size=0.3)

# Split the minute-level labels into 7*24 hourly chunks of 60 samples.
# The series is converted to a list once and every chunk uses the same slice expression.
_truth_values = y_true_original.tolist()
list_true = [_truth_values[hour * 60:(hour + 1) * 60] for hour in range(7 * 24)]

# Per hour: [number of 0 labels, number of 1 labels].
l = [sequence2counts(status, {0: 0, 1: 1}) for status in list_true]
_hourly_counts = np.array(l, dtype=int)

# Majority vote per hour: label 1 only when the 1s strictly outnumber the 0s
# (ties resolve to 0).
y_true = np.where(_hourly_counts[:, 0] >= _hourly_counts[:, 1], 0, 1)

y_pred_proba_all = np.zeros(len(y_true))