# Imports

In [7]:
import os
import glob
import requests
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sagemaker

from util import *

%matplotlib inline

# Download and extract data

In [8]:
# Download the WISDM archive once and unpack it into the working directory.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip'
file_name = url.split('/')[-1]
# The raw-data paths used later in the notebook start with the archive's base name.
extract_dir = os.path.splitext(file_name)[0]

if not os.path.exists(file_name):
    # Stream into a temporary name first: an interrupted download must not leave a
    # truncated file under the final name, or the existence check would skip the retry.
    partial_name = file_name + '.part'
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
        response.raise_for_status()
        with open(partial_name, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    os.replace(partial_name, file_name)

# Extraction is checked separately from the download, so an archive that is already
# on disk but was never (or only partly) unpacked still gets extracted.
if not os.path.isdir(extract_dir):
    with zipfile.ZipFile(file_name) as zip_file:
        zip_file.extractall()

# Read data to memory and write to csv

In [11]:
# Where the merged CSV is cached and where the raw per-sensor watch files live.
DATA_DIR = 'data'
DATA_PATH = 'data/har.csv'
RAW_DATA_DIR = 'wisdm-dataset/raw/watch'

# Raw activity codes that are kept, mapped to readable activity names.
ACTIVITY_MAP = dict([
    ('A', 'walking'),
    ('B', 'jogging'),
    ('C', 'stairs'),
    ('D', 'sitting'),
    ('E', 'standing'),
])

def read_watch_data():
    """Return the merged watch accelerometer/gyroscope readings as one DataFrame.

    If the cached CSV at DATA_PATH exists it is read and returned directly.
    Otherwise every ``*.txt`` file under RAW_DATA_DIR/<sensor> is read for the
    'accel' and 'gyro' sensors, rows whose activity code is not in ACTIVITY_MAP
    are discarded, the codes are replaced by their names, and the two sensor
    frames are merged on (subject, activity, timestamp). Rows with missing
    values and duplicate rows are dropped, and the result is written to
    DATA_PATH before being returned.

    Raises:
        FileNotFoundError: if a sensor directory contains no raw ``*.txt`` files.
    """
    if os.path.exists(DATA_PATH):
        return pd.read_csv(DATA_PATH)

    key_columns = ['subject', 'activity', 'timestamp']
    sensor_dfs = []
    for sensor in ['accel', 'gyro']:
        sensor_dir = os.path.join(RAW_DATA_DIR, sensor)
        files = sorted(glob.glob(os.path.join(sensor_dir, '*.txt')))
        if not files:
            # pd.concat on an empty list fails with a message that hides the real cause.
            raise FileNotFoundError(f'No raw .txt files found in {sensor_dir}')
        df = pd.concat([pd.read_csv(file, header=None) for file in files], axis=0, ignore_index=True)
        df.columns = key_columns + [sensor + '_' + axis for axis in 'xyz']
        # The last field of each raw line carries a trailing ';' that must go
        # before the column can be converted to float.
        last_column = df.columns[-1]
        df[last_column] = df[last_column].str.replace(';', '', regex=False).astype(float)
        # .copy() makes the filtered frame independent, so the assignment below
        # does not write into a slice of the unfiltered frame.
        df = df.loc[df['activity'].isin(ACTIVITY_MAP.keys())].copy()
        df['activity'] = df['activity'].map(ACTIVITY_MAP)
        sensor_dfs.append(df)

    # Default (inner) merge: only readings present for both sensors at the same
    # subject/activity/timestamp are kept.
    har_df = pd.merge(sensor_dfs[0], sensor_dfs[1], on=key_columns)
    har_df = har_df.dropna()
    har_df = har_df.drop_duplicates()
    os.makedirs(DATA_DIR, exist_ok=True)
    har_df.to_csv(DATA_PATH, index=False)
    return har_df

In [12]:
# Load the merged readings, then show the first rows followed by the frame's shape.
har_df = read_watch_data()
display(har_df.head(), har_df.shape)

Unnamed: 0,subject,activity,timestamp,accel_x,accel_y,accel_z,gyro_x,gyro_y,gyro_z
0,1600,walking,90426757696641,4.972757,-0.158317,6.696732,0.314944,-1.022277,-0.309962
1,1600,walking,90426807196641,3.25372,-0.191835,6.107758,0.387382,-0.618541,-0.048972
2,1600,walking,90426856696641,2.801216,-0.155922,5.997625,0.070999,-0.20948,-0.195978
3,1600,walking,90426906196641,3.770868,-1.051354,7.731027,0.037975,0.254976,-0.156563
4,1600,walking,90426955696641,4.661511,0.169689,9.684695,0.073129,0.719431,-0.001035


(884901, 9)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the merged frame.
har_df.info()

# Exploratory Data Analysis

In [None]:
# Count the non-null readings per combination of the frame's first two columns,
# and lift the row limit so the whole table is displayed.
group_keys = har_df.columns[:2].tolist()
reading_counts = har_df.pivot_table(index=group_keys, aggfunc='count')
with pd.option_context('display.max_rows', None):
    display(reading_counts)

### The reading counts for subjects 1637, 1638, 1639, and 1640 look out of line with the other subjects. Rows were most likely dropped during the merge because the accelerometer and gyroscope timestamps of these subjects do not match. Let's plot to confirm.

In [None]:
def plot_sensor_readings(sensor, subject, start=0, end=1024):
    """Show one three-panel figure (x, y, z axes) per activity for a sensor and subject.

    For every activity name in ACTIVITY_MAP, rows ``[start:end]`` of that
    subject's readings in the notebook-level ``har_df`` are plotted against
    their timestamps. Activities without any rows are skipped.
    """
    panels = (('_x', 'X-Axis'), ('_y', 'Y-Axis'), ('_z', 'Z-Axis'))
    for activity in ACTIVITY_MAP.values():
        selection = f'subject == {subject} and activity == "{activity}"'
        readings = har_df.query(selection)[start:end]
        if not len(readings):
            continue
        fig, axes = plt.subplots(nrows=3, figsize=(20, 10), sharex=True)
        for axis, (suffix, panel_title) in zip(axes, panels):
            plot_axis(axis, readings['timestamp'], readings[sensor + suffix], panel_title)
        plt.subplots_adjust(hspace=0.2)
        fig.suptitle(sensor.upper() + ' : ' + activity.upper())
        # Leave room at the top so the figure title does not overlap the first panel.
        plt.subplots_adjust(top=0.90)
        plt.show()

def plot_axis(ax, x, y, title):
    """Draw ``y`` against ``x`` as a red line on ``ax`` and format the panel.

    The panel gets ``title``, a hidden x-axis, a grid, x-limits spanning the
    data, and y-limits padded by one standard deviation of ``y`` on each side.
    """
    spread = np.std(y)
    lowest, highest = min(y), max(y)

    ax.plot(x, y, 'r')
    ax.set_title(title)
    ax.xaxis.set_visible(False)
    ax.set_ylim([lowest - spread, highest + spread])
    ax.set_xlim([min(x), max(x)])
    ax.grid(True)

In [None]:
# Plot the accelerometer readings of subject 1637, one figure per activity.
plot_sensor_readings('accel', 1637)

### Dropping data of subjects 1637 to 1640 inclusive.

In [None]:
# Exclude subjects 1637-1640. drop=True discards the old row labels instead of
# inserting them as a new leading 'index' column, which would shift
# har_df.columns and make this cell raise when it is run a second time.
har_df = har_df.loc[~har_df['subject'].isin([1637, 1638, 1639, 1640])].reset_index(drop=True)

In [None]:
# Re-check the per-subject, per-activity reading counts after the drop. The grouping
# columns are named explicitly: selecting them by position (columns[:2]) picks the
# wrong columns as soon as anything, such as a reset index, sits in front of 'subject'.
with pd.option_context('display.max_rows', None):
    display(pd.pivot_table(har_df, index=['subject', 'activity'], aggfunc='count'))

In [None]:
# Bar chart of the number of readings per activity. The per-bar colours are passed
# with the `color` keyword; `colors` is not the bar-plot keyword.
har_df['activity'].value_counts().plot(kind='bar', title='Activity Counts', color=['r', 'g', 'b', 'y', 'k'])

In [None]:
# Window length in samples: 3 seconds of data sampled at 20 Hz.
SEQ_LEN = 3 * 20
# Feature columns fed to the model: the three axes of each sensor.
COLUMNS = [
    'accel_x', 'accel_y', 'accel_z',
    'gyro_x', 'gyro_y', 'gyro_z',
]
DIMS = len(COLUMNS)

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Rescale the six sensor columns in place with a min-max scaler fitted on the whole frame.
# NOTE(review): the scaler is fitted before the train/test split further down, so
# the held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
har_df[COLUMNS] = scaler.fit_transform(har_df[COLUMNS])

# Replace the activity names with the integer labels produced by the LabelEncoder.
encoder = LabelEncoder()
har_df['activity'] = encoder.fit_transform(har_df['activity'])

In [None]:
X = []
y = []

# Cut every (subject, activity) recording into consecutive, non-overlapping windows
# of SEQ_LEN rows; each window is stored with shape (1, SEQ_LEN, len(COLUMNS)) and
# labelled with the activity of its group.
for (_, label), group in har_df.groupby(['subject', 'activity'], sort=False):
    data = group[COLUMNS].values
    # The "+ 1" keeps the final window when the recording length is an exact multiple
    # of SEQ_LEN; without it that last complete window is silently dropped. Trailing
    # rows that do not fill a whole window are still discarded.
    for i in range(0, len(data) - SEQ_LEN + 1, SEQ_LEN):
        X.append(np.expand_dims(data[i:i+SEQ_LEN], axis=0))
        y.append(label)

In [None]:
# Stack the windows into one array of shape (n_windows, SEQ_LEN, DIMS). Reshaping
# instead of concatenating keeps this cell safe to re-run: concatenating an
# already-stacked 3-D array would flatten it to 2-D. It also yields an empty
# (0, SEQ_LEN, DIMS) array rather than raising when no windows were produced.
X = np.asarray(X).reshape(-1, SEQ_LEN, DIMS)
y = np.asarray(y)

In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf

In [None]:
# Show the installed TensorFlow version.
tf.__version__

In [None]:
from sklearn.model_selection import train_test_split

# Use 80% of the windows for training and the rest for testing. shuffle=False
# keeps the windows in their existing order, so the split is positional, not random.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

In [None]:
# Sequence classifier over windows of shape (SEQ_LEN, DIMS):
# 1-D convolution -> LSTM -> dropout -> dense hidden layer -> softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, input_shape=(SEQ_LEN, DIMS)),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=128, activation='relu'),
    # One output per activity in ACTIVITY_MAP, rather than a hard-coded 5.
    tf.keras.layers.Dense(units=len(ACTIVITY_MAP), activation='softmax')
])

# This function keeps the learning rate at 0.001 for the first ten epochs
# and decreases it exponentially after that.
def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    Args:
        epoch: epoch index (numeric).

    Returns:
        0.001 for epochs below 10, otherwise 0.001 * exp(0.1 * (10 - epoch)),
        always as a plain Python float.
    """
    # math.exp (not tf.math.exp) so the result is a Python float rather than a
    # tensor; a learning-rate schedule callback expects a float.
    import math

    if epoch < 10:
        return 0.001
    return 0.001 * math.exp(0.1 * (10 - epoch))

# The labels are integer class ids, hence the sparse categorical cross-entropy loss.
# No optimizer is passed, so Keras uses its default one.
# NOTE(review): confirm which default applies to the installed version.
model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
%%timeit
history = model.fit(X_train, y_train, 
                    batch_size=32, 
                    epochs=100, 
                    validation_split=0.1, 
                    callbacks=[
                        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10), 
                        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3)
                    ])

In [None]:
# List the names of the per-epoch series recorded during training.
history.history.keys()

In [None]:
# Plot the learning rate recorded for each epoch.
# NOTE(review): assumes the training history has an 'lr' entry (presumably added by
# the learning-rate scheduler callback) — confirm the key name for the installed version.
plt.plot(history.history['lr'], label='lr')

In [None]:
fig, ax = plt.subplots()

# Per-epoch training metrics recorded in the fit history.
training_log = history.history
accuracy = training_log['accuracy']
loss = training_log['loss']

# Draw on the axes created above; the loss curve is kept available but not drawn.
ax.plot(accuracy, label='accuracy')
# ax.plot(loss, label='loss')

ax.legend()
plt.show()

In [None]:
# Show the names of the values that model.evaluate reports, in order.
model.metrics_names

In [None]:
# Evaluate on the held-out test windows; the result unpacks into loss and accuracy,
# matching the loss and the single 'accuracy' metric the model was compiled with.
eval_loss, eval_accuracy = model.evaluate(X_test, y_test)

In [None]:
# Display the accuracy measured on the test windows.
eval_accuracy