In [29]:
import boto3
import time
import pandas as pd
import numpy as np
from tsfresh import extract_features

## Connect to AWS Athena and query data

In [3]:
# Athena API client. No region or credentials are passed here, so boto3
# resolves them from its default configuration (environment, profile, ...).
client = boto3.client('athena')

In [4]:
# S3 prefix handed to Athena as the query's OutputLocation
# (passed as the `output` argument of get_athena_query below).
RESULT_OUTPUT_LOCATION = "s3://smu-is614-iot-step-tracker/queries/"

In [5]:
# Select every column of every row from the "microbit" table in the "smu-iot"
# database. Nothing is interpolated, so a plain (non-f) string is used; this
# also avoids surprises if a literal `{` or `}` is ever added to the SQL.
query = """
    SELECT
        *
    FROM
        "smu-iot"."microbit"
"""

In [6]:
def has_query_succeeded(client, execution_id, max_execution=5, poll_interval=5):
    """Poll Athena until the query finishes or the polling budget is spent.

    Args:
        client: Athena client exposing ``get_query_execution``.
        execution_id: ``QueryExecutionId`` of the execution to watch.
        max_execution: Maximum number of status polls (default 5).
        poll_interval: Seconds to wait between two polls (default 5).

    Returns:
        True as soon as the state ``"SUCCEEDED"`` is observed. False when any
        other non-pending state (e.g. failed or cancelled) is reported, or
        when the query is still running/queued after ``max_execution`` polls.
    """
    pending_states = ("RUNNING", "QUEUED")

    for attempt in range(max_execution):
        response = client.get_query_execution(QueryExecutionId=execution_id)
        state = (
            response.get("QueryExecution", {})
            .get("Status", {})
            .get("State")
        )

        if state == "SUCCEEDED":
            return True

        # A reported state that is neither pending nor successful is final:
        # stop right away instead of sleeping before giving up.
        if state is not None and state not in pending_states:
            return False

        # Only wait when another poll will actually follow.
        if attempt < max_execution - 1:
            time.sleep(poll_interval)

    return False


def get_athena_query(client, query, output):
    """Run ``query`` on Athena and return all of its result rows.

    Args:
        client: Athena client (``start_query_execution`` / ``get_query_results``).
        query: SQL text to execute.
        output: S3 location passed as the result ``OutputLocation``.

    Returns:
        The list of ``ResultSet.Rows`` entries collected across every result
        page, or None when ``has_query_succeeded`` reports that the query did
        not succeed.
    """
    response = client.start_query_execution(
        QueryString=query,
        ResultConfiguration={"OutputLocation": output},
    )
    execution_id = response["QueryExecutionId"]

    # Short pause before the first status poll (kept from the original code;
    # presumably gives Athena time to register the execution).
    time.sleep(1)

    if not has_query_succeeded(client, execution_id):
        print("No result returned!")
        return None

    # get_query_results is paginated: keep following NextToken so results
    # larger than a single page are not silently truncated.
    rows = []
    request = {"QueryExecutionId": execution_id}
    while True:
        page = client.get_query_results(**request)
        rows.extend(page["ResultSet"]["Rows"])
        next_token = page.get("NextToken")
        if not next_token:
            break
        request["NextToken"] = next_token

    print("query is successful!")
    return rows

In [7]:
%%time

res = get_athena_query(client, query, RESULT_OUTPUT_LOCATION)

cols = {}
for c in res[0]['Data']:
    k = c['VarCharValue']
    cols[k] = None
    
data = np.array([[result['VarCharValue'] for result in res[i]['Data']] for i in range(1, len(res))])
    
df = pd.DataFrame(columns=cols, data=data)

query is successful!
CPU times: total: 0 ns
Wall time: 1.51 s


In [8]:
# Every value was read from Athena as a string ('VarCharValue'); cast each
# column to its working dtype in a single astype call.
df = df.astype({
    'timestamp': np.int64,
    'uuid': 'str',
    **dict.fromkeys(
        ['gyro_x', 'gyro_y', 'gyro_z', 'accel_x', 'accel_y', 'accel_z'],
        'float',
    ),
})

## Load Dummy Time Series Data (replaces the Athena `df` with a local sample CSV)

In [39]:
# Forward slashes work on every OS and avoid the invalid "\d" / "\s" escape
# sequences that the backslash-separated literal contained.
df = pd.read_csv("../data/sample_data.csv")

# 'timestamp' holds epoch seconds; convert it and use it as the index.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df = df.set_index('timestamp')


## Explore Preprocessing libraries

Based on this [paper](https://ieeexplore.ieee.org/document/8672772), we can use the following pre-processing methods:

* Spike Removal
* Noise Removal (Savitzky-Golay smoothing filter)
* Statistical features: mean, standard deviation, peak-to-peak amplitude, skewness, kurtosis

In [52]:
## spike removal

# def remove_spike(x):
#     std = np.std(x)
#     return std

# TODO: spike removal is still in progress; the draft above is not used yet.

## Use the automated time-series feature extractor from tsfresh

In [53]:
from tsfresh import extract_features

In [54]:
# extract_features below is called with column_sort="time" and column_id="id":
# expose the index as a 'time' column and tag every row with the same id (1).
df = df.assign(time=pd.to_datetime(df.index), id=1)

In [55]:
# Run tsfresh on the frame (grouped by "id", ordered by "time"), then drop
# every feature column that contains a missing value.
features = (
    extract_features(df, column_id="id", column_sort="time")
    .dropna(axis=1)
)

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.59it/s]


In [56]:
# Show the extracted feature table as this cell's rich output.
features

Unnamed: 0,gyro_x__variance_larger_than_standard_deviation,gyro_x__has_duplicate_max,gyro_x__has_duplicate_min,gyro_x__has_duplicate,gyro_x__sum_values,gyro_x__abs_energy,gyro_x__mean_abs_change,gyro_x__mean_change,gyro_x__mean_second_derivative_central,gyro_x__median,...,accel_y__fourier_entropy__bins_3,accel_y__fourier_entropy__bins_5,accel_y__fourier_entropy__bins_10,accel_y__fourier_entropy__bins_100,accel_y__permutation_entropy__dimension_3__tau_1,accel_y__permutation_entropy__dimension_4__tau_1,accel_y__permutation_entropy__dimension_5__tau_1,accel_y__permutation_entropy__dimension_6__tau_1,accel_y__permutation_entropy__dimension_7__tau_1,accel_y__mean_n_absolute_max__number_of_maxima_7
1,1.0,0.0,0.0,0.0,570.832118,412739.714625,50.081569,0.315089,-0.292198,1.295522,...,0.608998,1.114931,1.73331,3.681396,1.78022,3.126448,4.384515,5.003652,5.179411,128.53955


In [57]:
!pip show package tsfresh

Name: tsfresh
Version: 0.20.0
Summary: tsfresh extracts relevant characteristics from time series
Home-page: https://github.com/blue-yonder/tsfresh
Author: Maximilian Christ, Nils Braun, Julius Neuffer, Andreas W. Kempa-Liehr
Author-email: max.christ@me.com
License: MIT
Location: c:\users\songh\anaconda3\envs\smu-iot\lib\site-packages
Requires: cloudpickle, dask, distributed, numpy, pandas, patsy, requests, scikit-learn, scipy, statsmodels, stumpy, tqdm
Required-by: 


