In [2]:
# -*- coding: utf-8 -*-
from __future__ import print_function
import datetime

%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

In [12]:
file = '../data/raw/e-Obs_GPRS_Himalayan_Griffon,_Sherub,_Bhutan-Thang_Kaar_Dorje_(4014)-annotated-bursts.csv'

In [10]:
!head -2 ../data/raw/e-Obs_GPRS_Himalayan_Griffon,_Sherub,_Bhutan-Thang_Kaar_Dorje_\(4014\)-annotated-bursts.csv

type,tag-serial-number,burst-start-timestamp,annotation,eobs_acceleration_axes,eobs_acceleration_sampling_frequency_per_axis,eobs_accelerations_raw
acc,0,2014-12-11 06:30:07.000,"Rest","XYZ",18.74,"2016 1604 1928 2026 1618 1922 2028 1614 1934 2025 1600 1933 2028 1600 1953 2069 1600 1928 2030 1608 1934 2029 1610 1933 2024 1614 1936 2017 1600 1937 2040 1607 1929 2025 1602 1944"


In [13]:
dfraw = pd.read_csv(file, parse_dates=['burst-start-timestamp'])

In [14]:
dfraw.type.describe()

count     28399
unique        1
top         acc
freq      28399
Name: type, dtype: object

"type" column can be safely discarded

In [15]:
dfraw['tag-serial-number'].describe()

count    28399.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: tag-serial-number, dtype: float64

"tag-serial-number" column carries no obvious information

In [16]:
dfraw['burst-start-timestamp'].describe()

count                   28399
unique                  28399
top       2015-02-07 12:50:00
freq                        1
first     2014-12-11 06:30:07
last      2015-06-28 02:10:00
Name: burst-start-timestamp, dtype: object

In [17]:
dfraw.annotation.value_counts()

Rest            19819
Fly              4370
Restless         3267
Others            441
WHR               334
Flap_flight        92
Feed               49
Flap_land          15
Flap_takeoff       12
Name: annotation, dtype: int64

In [18]:
dfraw.eobs_acceleration_axes.describe()

count     28399
unique        1
top         XYZ
freq      28399
Name: eobs_acceleration_axes, dtype: object

Axes always have the same order -- good

In [19]:
dfraw.eobs_acceleration_sampling_frequency_per_axis.value_counts()

18.74    28399
Name: eobs_acceleration_sampling_frequency_per_axis, dtype: int64

We'll discard the 10.54 Hz measurements as there are so few of them

The column "eobs_accelerations_raw" carries acceleration information (40 measurements on each axis, for all three axes).

In [21]:
dfraw.eobs_accelerations_raw.apply(lambda x: len(x.split())).value_counts()

120    25499
36      2900
Name: eobs_accelerations_raw, dtype: int64

Looks clean.

In [50]:
df = pd.DataFrame()

In [51]:
df['time'] = dfraw['burst-start-timestamp']
df['acc'] = dfraw['eobs_accelerations_raw']
df['state'] = dfraw['annotation']

In [52]:
valid = (df['acc'].apply(lambda x: len(x.split())) == 120)

In [53]:
df = df[valid]

In [54]:
df['acc_x'] = df['acc'].apply(lambda x: map(int, x.split()[0:40]))
df['acc_y'] = df['acc'].apply(lambda x: map(int, x.split()[40:80]))
df['acc_z'] = df['acc'].apply(lambda x: map(int, x.split()[80:120]))

In [55]:
del df['acc']

Grouping all fly states into a single "Fly" one

In [56]:
df['state'].value_counts()

Rest            17646
Fly              3847
Restless         3108
Others            419
WHR               321
Flap_flight        86
Feed               46
Flap_land          14
Flap_takeoff       12
Name: state, dtype: int64

In [57]:
df.loc[df.state.isin(['Flap-flight', 'Flap-takeoff', 'Flap-land']), 'state'] = 'Fly'

Now filtering: only 3 states, and only 50 Hz bursts

In [58]:
valid = df.state.isin(['Rest', 'Fly', 'Feed']) #& (dfraw.eobs_acceleration_sampling_frequency_per_axis == 50.00)
valid.value_counts()

True     21539
False     3960
Name: state, dtype: int64

In [59]:
df = df[valid]

Converting acceleration values to SI (m.s^-2) based on pre-deployment calibration

In [62]:
g=9.80665 # m.s^(-2)
df['acc_x_cal'] = df['acc_x'].apply(lambda l: [(x-2030)*0.0023*g for x in l])
df['acc_y_cal'] = df['acc_y'].apply(lambda l: [(x-2033)*0.0023*g for x in l])
df['acc_z_cal'] = df['acc_z'].apply(lambda l: [(x-1951)*0.0023*g for x in l])

Let's make sure that none of the acceleration values is too high or too low

In [65]:
df.acc_x_cal.apply(lambda x: np.mean(x)).describe()

count    21539.000000
mean        -4.676967
std          0.428443
min         -9.178877
25%         -4.948068
50%         -4.697704
75%         -4.404485
max         -1.645409
Name: acc_x_cal, dtype: float64

In [66]:
df.acc_y_cal.apply(lambda x: np.mean(x)).describe()

count    21539.000000
mean        -4.929726
std          0.413185
min         -7.867851
25%         -5.191665
50%         -4.955398
75%         -4.673739
max         -1.655559
Name: acc_y_cal, dtype: float64

In [67]:
df.acc_z_cal.apply(lambda x: np.mean(x)).describe()

count    21539.000000
mean        -3.036212
std          0.459177
min         -6.707381
25%         -3.318448
50%         -3.047220
75%         -2.741032
max          0.166909
Name: acc_z_cal, dtype: float64

All three axes look homogeneous and within reasonable bounds. Using the mean as an indicator shouldn't be a problem.

Let's create composite variables (mean and coefficient of variation) from the acceleration bursts

In [68]:
df['acc_x_mean'] = df.acc_x_cal.apply(lambda x: np.mean(x))
df['acc_y_mean'] = df.acc_y_cal.apply(lambda x: np.mean(x))
df['acc_z_mean'] = df.acc_z_cal.apply(lambda x: np.mean(x))

df['acc_x_std'] = df.acc_x_cal.apply(lambda x: np.std(x))
df['acc_y_std'] = df.acc_y_cal.apply(lambda x: np.std(x))
df['acc_z_std'] = df.acc_z_cal.apply(lambda x: np.std(x))

df['acc_x_cv'] = df.acc_x_cal.apply(lambda x: np.std(x)/np.mean(x))
df['acc_y_cv'] = df.acc_y_cal.apply(lambda x: np.std(x)/np.mean(x))
df['acc_z_cv'] = df.acc_z_cal.apply(lambda x: np.std(x)/np.mean(x))

Exporting clean data to CSV

In [69]:
df[['acc_x_mean', 'acc_x_std', 'acc_x_cv', 'acc_y_mean', 'acc_y_std', 'acc_y_cv', 'acc_z_mean', 'acc_z_std', 'acc_z_cv', 'state']]\
.to_csv('../data/processed/indiv_1.csv', header=True, index=False)

In [70]:
!head -10 ../data/processed/indiv_1.csv

acc_x_mean,acc_x_std,acc_x_cv,acc_y_mean,acc_y_std,acc_y_cv,acc_z_mean,acc_z_std,acc_z_cv,state
-4.8426218365,3.76003668127,-0.776446480485,-5.12061584737,3.66430356848,-0.715598216639,-3.28912589337,3.68817314391,-1.12132319147,Rest
-4.73999524425,3.89743015629,-0.822243473982,-5.01291431375,3.78058890185,-0.754169863123,-3.193829772,3.82719824746,-1.19831002924,Rest
-4.70559841937,3.79819946074,-0.807166086485,-4.8516439545,3.86599927603,-0.79684315508,-2.95981858637,3.90424431902,-1.31908230355,Rest
-4.2787394615,4.27749796731,-0.999709845809,-4.53643370688,4.26052204979,-0.939178730494,-2.77373740262,3.99598574096,-1.44065034317,Rest
-4.38644099512,4.06636372804,-0.927030303737,-4.7614227745,4.01340657067,-0.84290069602,-2.85268093512,4.02365850064,-1.41048318832,Rest
-4.78679748137,3.47520478975,-0.725997872957,-5.06479149225,3.47264368295,-0.685643957558,-3.11150294525,3.41089442308,-1.09622085632,Rest
-4.217840165,3.97522597131,-0.942479045152,-4.54207253062,3.99238497731