In [2]:
import logging
import time
import pandas as pd
import numpy as np
import datahandling as dh
from report import sensor_stats

In [3]:
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [4]:
input_datafiles = ['/Users/sam/Downloads/BuildAX/Floor3.csv']

In [5]:
pd.set_option('chained_assignment', None)  # Hush up, SettingWithCopyWarning

start_time = time.time()
# Use a generator to concatenate datafiles into a list 
# Single threaded: 60.73 seconds 
df = pd.concat( (dh.readfile(infile) for infile in input_datafiles) )

# Multithreaded:  19.43 seconds. Winner!
#p = multiprocessing.Pool()
#df = pd.concat(p.map(dh.readfile, input_datafiles))

log.info("Running final sort on merge...")
df.sort_index(inplace=True) # Sort again on merge

# Lots of subprocesses hanging around: clean 'em up:
#p.close()
#p.join()

log.info("+ Data read in {0:.2f}s".format(time.time() - start_time))

# Extract sensor IDs / names and split into dict by sensor ID
t_start, t_end = (df.index.min(), df.index.max())
# names = dh.unique_sensors(df)
dfs = dh.split_by_id(df)

assert(len(dfs['422EA95A']) == 59847)

INFO:datahandling:Reading data from /Users/sam/Downloads/BuildAX/Floor3.csv
INFO:datahandling:Detected MIME: text/csv
INFO:__main__:Running final sort on merge...
INFO:__main__:+ Data read in 2.16s


In [6]:
check_length = len(dfs[list(dfs.keys())[0]])
check_length

147165

In [7]:
# Apply fixes to the data and diff the PIR movement
#dfs = dh.clean_data(dfs)

# Limit range
for i in dfs:
    dfs[i].loc[:, 'Temp'] = dfs[i].loc[:, 'Temp']\
        .apply(lambda d: d if (d > -500) and (d < 1000) else np.NaN)

    dfs[i].loc[:, 'Humidity'] = dfs[i].loc[:, 'Humidity']\
        .apply(lambda d: d if (d > 0.0) and (d < 101.0) else np.NaN)


dfs = dh.fix_light(dfs)

dfs = dh.fix_humidity(dfs)

dfs = dh.fix_temp(dfs)

assert(len(dfs[list(dfs.keys())[0]]) == check_length)

In [8]:
dfs_bak = dfs

In [71]:
dfs = dfs_bak.copy()

In [72]:
#dfs = dh.diff_pir(dfs)

ಠ_ಠ = 1e9  # scale factor to use
σ = 5      # detect trigger above 5σ standard deviations

for i in dfs:
    d = dfs[i].loc[:, ['PIREnergy']]

    # Time deltas
    df_time = pd.DataFrame(d.index, index=d.index) \
        .diff().fillna(0)                          \
        .div(np.timedelta64(1, 's'))               \
        .astype('int64')

    # Differentiate & fix wrapping at 2^16,
    # then normalize to 0 and apply scale factor
    df_diff = d['PIREnergy'].diff()               \
        .apply(lambda x: x if x > 0 else x+65535) \
        .astype('float')                          \
        .div(df_time['DateTime'].astype('float'), axis='index') \
        .diff() \
        * ಠ_ಠ

    # Calculate std. deviation
    df_std = df_diff.rolling(window=250, center=False).std() * σ

    # Event triggers
    df_event = (df_diff > df_std).to_frame(name='Event')

    # Store views into original DataFrame
    dfs[i].loc[:, 'Event'] = df_event[df_event['Event'] == True]
    dfs[i].loc[:, 'PIRDiff'] = df_diff
    # dfs[i].loc[:,'PIRStd'] = df_std

assert(len(dfs[list(dfs.keys())[0]]) == check_length)

In [73]:
len(dfs[list(dfs.keys())[0]])

147165

In [74]:
print(dfs[i].PIRDiff[:10])
print(len(dfs[i]))

DateTime
2016-02-02 17:58:31             NaN
2016-02-04 07:35:28             NaN
2016-02-08 13:55:25   -1.225261e+08
2016-02-08 14:00:16    1.387769e+10
2016-02-09 07:33:33   -1.399888e+10
2016-02-09 08:31:46    9.651322e+09
2016-02-09 09:15:27    3.928234e+09
2016-02-09 13:22:58   -1.177617e+10
2016-02-10 15:28:20   -1.275398e+09
2016-03-23 17:52:49   -5.419100e+08
Name: PIRDiff, dtype: float64
932


PIR Diff result on Pandas 0.19.0 (with error)
```
DateTime
2016-02-02 17:58:31             NaN
2016-02-04 07:35:28             NaN
2016-02-08 13:55:25   -1.225261e+08
2016-02-09 07:33:33   -1.399888e+10
2016-02-09 13:22:58   -1.177617e+10
2016-02-10 15:28:20   -1.275398e+09
2016-03-23 17:52:49   -5.419100e+08
2016-03-23 19:04:56   -1.390800e+09
2016-03-23 19:35:09   -8.563218e+08
2016-03-23 19:36:07   -3.448276e+08
Name: PIRDiff, dtype: float64

```
Expected result (`0.18.1`):
```
DateTime
2016-02-02 17:58:31          NaN
2016-02-04 07:35:28          NaN
2016-02-08 13:55:25    -0.122526
2016-02-08 14:00:16    13.877688
2016-02-09 07:33:33   -13.998882
2016-02-09 08:31:46     9.651322
2016-02-09 09:15:27     3.928234
2016-02-09 13:22:58   -11.776167
2016-02-10 15:28:20    -1.275398
2016-03-23 17:52:49    -0.541910
Name: PIRDiff, dtype: float64
```

In [75]:
# Scrub erroneous values:
pir_threshold = 1500 # arbitrary but big... 


In [76]:
out_of_threshold = dfs[i][ (dfs[i].PIRDiff > pir_threshold) | (dfs[i].PIRDiff < -pir_threshold) ]
zeroed_values = pd.DataFrame(0, index=out_of_threshold.index, columns=['PIRDiff'])
dfs[i].update(zeroed_values)
zeroed_values[:10]

Unnamed: 0_level_0,PIRDiff
DateTime,Unnamed: 1_level_1
2016-02-08 13:55:25,0
2016-02-08 14:00:16,0
2016-02-09 07:33:33,0
2016-02-09 08:31:46,0
2016-02-09 09:15:27,0
2016-02-09 13:22:58,0
2016-02-10 15:28:20,0
2016-03-23 17:52:49,0
2016-03-23 17:53:18,0
2016-03-23 17:55:15,0


In [77]:
for i in dfs:
    out_of_threshold = dfs[i][ (dfs[i].PIRDiff > pir_threshold) | (dfs[i].PIRDiff < -pir_threshold) ]
    zeroed_values = pd.DataFrame(0, index=out_of_threshold.index, columns=['PIRDiff'])
    dfs[i].update(zeroed_values)
    

In [78]:
dfs[list(dfs.keys())[0]][:10]

Unnamed: 0_level_0,Name,RSSI,Type,SequenceNo,TransmitPower,Battery,Humidity,Temp,Light,PIRCount,PIREnergy,Switch,Event,PIRDiff
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-05-15 14:51:56,42CAE0CE,-34,2,40,20,3228,30.51,22.6,4,35,31744,1,,
2015-05-15 14:52:51,42CAE0CE,-29,2,41,20,3228,30.51,22.6,4,36,32720,1,,
2015-05-15 14:53:25,42CAE0CE,-28,1,42,20,3232,29.47,22.7,14,36,33474,1,,0.0
2015-05-15 14:55:08,42CAE0CE,-30,2,43,20,3232,29.47,22.7,14,37,34442,1,,0.0
2015-05-15 14:58:24,42CAE0CE,-29,1,44,20,3230,29.17,22.7,3,37,36552,1,,0.0
2015-05-15 15:03:22,42CAE0CE,-34,1,45,20,3227,29.17,22.6,1,37,39398,1,,0.0
2015-05-15 15:08:21,42CAE0CE,-41,1,46,20,3226,29.08,22.6,0,37,42143,1,,0.0
2015-05-15 15:13:20,42CAE0CE,-39,1,47,20,3226,29.08,22.5,0,37,44972,1,,0.0
2015-05-15 15:13:27,42CAE0CE,-34,2,48,20,3226,29.08,22.5,0,38,45078,1,,0.0
2015-05-15 15:18:19,42CAE0CE,-38,1,49,20,3225,29.08,22.5,1,38,48199,1,,0.0


In [79]:
dfs = sensor_stats(dfs, 10)

INFO:report: ID      | Packets 
INFO:report:42CAE0CE | 147165
INFO:report:42A91C18 | 158564
INFO:report:42CA8185 | 57646
INFO:report:4259DD00 | 38260
INFO:report:42D8421B | 44686
INFO:report:426D6758 | 50762
INFO:report:426BBB67 | 107222
INFO:report:429D6BDE | 4720
INFO:report:422EA95A | 59847
INFO:report:42F519EF | 2177
INFO:report:425AE41F | 932


In [80]:
len(dfs['42CAE0CE'])

147165

```
# Drop indices of dfs where threshold value not in tolerance
dfs = {
    i: dfs[i].drop(
        dfs[i][dfs[i].PIRDiff > pir_threshold].index
    ) 
    for i in dfs
}

assert(len(dfs[list(dfs.keys())[0]]) == check_length)

dfs = {
    i: dfs[i].drop(dfs[i][dfs[i].PIRDiff < -pir_threshold].index) 
    for i in dfs
}

assert(len(dfs[list(dfs.keys())[0]]) == check_length)
```
