# Neuer Datensatz
Data from https://www.kaggle.com/datasets/jeanmidev/smart-meters-in-london

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import mchmm as mc

## Load files

In [2]:
# Paths of the three meter blocks used in this analysis (relative to the notebook).
filename1 = "data/london_hourly/block_10.csv"
filename2 = "data/london_hourly/block_11.csv"
filename3 = "data/london_hourly/block_12.csv"
names = [filename1, filename2, filename3]

# Explicit column dtypes. A concrete np.float64 is used for the readings:
# np.floating is an abstract scalar type and passing it as a dtype is deprecated.
frame_types = {'LCLid': str, 'tstp': object, 'energy(kWh/hh)': np.float64}
kwargs = {
    'dtype': frame_types,
    'na_values': {'Null'},  # the literal string 'Null' is parsed as NaN
}
dataframes = (pd.read_csv(name, **kwargs) for name in names)
data = pd.concat(dataframes, ignore_index=True)

# Treat missing readings as zero consumption. Only the energy column is
# filled: the id and timestamp columns must stay strings because later cells
# slice the timestamp text.
data['energy(kWh/hh)'] = data['energy(kWh/hh)'].fillna(0)

# One list of readings / timestamps per meter, indexed by LCLid.
# Both Series come from the same groupby key, so their order matches.
values = data.groupby('LCLid')['energy(kWh/hh)'].apply(list)
timepoints = data.groupby('LCLid')['tstp'].apply(list)

In [3]:
# Trim every meter so that its series starts at its first 12:00 reading.
# 'tstp' strings look like 'YYYY-MM-DD HH:MM:SS.0000000', so [11:16] is 'HH:MM'.
start_time = '12:00'
# Exactly one entry per meter. A meter without any 12:00 reading gets
# len(stamps), i.e. an empty tail, so it is dropped by the length filter below
# instead of silently shifting the start indices of all following meters.
start_index = [
    next((i for i, stamp in enumerate(stamps) if stamp[11:16] == start_time), len(stamps))
    for stamps in timepoints
]
# Iterate the Series directly instead of using integer keys on an index of
# meter ids (integer keys would rely on the deprecated positional fallback).
new_values = [readings[start:] for readings, start in zip(values, start_index)]
new_timepoints = [stamps[start:] for stamps, start in zip(timepoints, start_index)]

# Keep only meters with more than min_length entries after trimming.
min_length = 25000
new_values_with_certain_length = [v for v in new_values if len(v) > min_length]
new_timepoints_with_certain_length = [t for t in new_timepoints if len(t) > min_length]

# Convert the readings to plain Python floats.
new_values_float = [list(map(float, v)) for v in new_values_with_certain_length]
if not new_values_float:
    raise ValueError(f"no meter has more than {min_length} readings after trimming")

# Sum over all meters position by position for the first min_length samples.
# NOTE(review): this aligns meters by list position, not by timestamp. Meters
# start on different dates and the raw data can have gaps, so position k is
# not guaranteed to be the same point in time for every meter - confirm this
# is acceptable for the model.
summed_values_float = np.array([v[:min_length] for v in new_values_float]).sum(axis=0)

# Scale the summed load to the range [0, max_v].
max_v = 20
max_summed_values = summed_values_float.max()
if max_summed_values <= 0:
    raise ValueError("summed consumption has no positive value; cannot scale")
summed_values_scaled = max_v * summed_values_float / max_summed_values

# Discretise into integer states. astype(int) truncates toward zero, so the
# top state max_v is only reached at the maximum of the series.
summed_values_scaled_int = summed_values_scaled.astype(int)

## Initialize Markov chain

In [4]:
# Create an empty mchmm chain, then build it from the integer state sequence.
chain = mc.MarkovChain()
chain = chain.from_data(summed_values_scaled_int)

In [43]:
# Simulate 1000 steps from the chain (start=5, returning states) and plot
# the resulting sequence on a single large axes.
simulated_states = chain.simulate(n=1000, start=5, ret="states")
fig, ax = plt.subplots(figsize=(20, 10), dpi=80)
ax.plot(simulated_states)

[<matplotlib.lines.Line2D at 0x7fb84cffaa00>]

In [14]:
# Number of meters that passed the minimum-length filter.
len(new_values_float)

141

In [12]:
# Peek at the summed series. The slice starts at index 1, so element 0 is
# not shown and 49 values are displayed.
summed_values_float[1:50]

array([45.2060002, 36.7889998, 37.4870001, 39.4699999, 38.7099999,
       42.2040002, 41.2960001, 43.454    , 42.9719998, 46.273    ,
       49.7339998, 52.694    , 57.3090001, 58.7709999, 56.1169999,
       55.1519999, 54.43     , 52.199    , 49.0160001, 48.6340002,
       47.3590001, 41.8180001, 34.8369999, 30.2139999, 33.5370001,
       32.322    , 28.0590001, 25.927    , 24.3719998, 23.801    ,
       23.306    , 23.0140001, 22.4889999, 22.186    , 23.603    ,
       27.1949999, 33.9200001, 35.3579999, 33.862    , 37.0520001,
       41.6369999, 39.0110001, 38.8659999, 37.5360001, 34.4219999,
       33.7409999, 34.1909999, 35.0680002, 34.0009999])

In [6]:
# Display the concatenated raw frame (the rendered output elides the middle rows).
data

Unnamed: 0,LCLid,tstp,energy(kWh/hh)
0,MAC000322,2012-03-06 12:30:00.0000000,0.000
1,MAC000322,2012-03-06 13:30:00.0000000,0.067
2,MAC000322,2012-03-06 14:00:00.0000000,0.060
3,MAC000322,2012-03-06 14:30:00.0000000,0.082
4,MAC000322,2012-03-06 15:00:00.0000000,0.052
...,...,...,...
4676602,MAC005399,2014-02-27 22:00:00.0000000,0.121
4676603,MAC005399,2014-02-27 22:30:00.0000000,0.033
4676604,MAC005399,2014-02-27 23:00:00.0000000,0.066
4676605,MAC005399,2014-02-27 23:30:00.0000000,0.082
