In [2]:
%matplotlib inline

from dask.distributed import Client
import dask.dataframe as dd
import matplotlib.pyplot as plt
import joblib
import numpy as np

In [3]:
# Start a local Dask cluster with 4 workers and connect a client to it.
# Later cells use this client implicitly whenever a Dask collection is computed.

client = Client(n_workers=4)
# client = Client("scheduler-address:8786")  # connecting to remote cluster

In [4]:
# Display the client summary (scheduler address, dashboard link, worker resources).
client

0,1
Client  Scheduler: tcp://127.0.0.1:45185  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 15.64 GiB


# 1. Data Preparation

## a) Load Data

In [5]:
# Read in files

In [6]:
def read_file(filepath):
    """Read header-less, comma-separated sensor files into a DataFrame.

    Parameters
    ----------
    filepath : str
        Path (or glob pattern) passed straight to ``dd.read_csv``.

    Returns
    -------
    DataFrame
        Columns ``subject_id``, ``activity_code``, ``timestamp``, ``x_coord``,
        ``y_coord`` and ``z_coord``, with ``z_coord`` converted to float64.
    """
    column_names = ['subject_id', 'activity_code', 'timestamp', 'x_coord', 'y_coord', 'z_coord']

    readings = dd.read_csv(filepath, sep=',', header=None)
    readings.columns = column_names

    # The last field is read with a ';' attached (passing it as the
    # lineterminator caused issues), so strip it before casting to float.
    z_without_semicolon = readings['z_coord'].str.replace(";", "")
    readings['z_coord'] = z_without_semicolon.astype('float64')
    return readings

In [7]:
# Load the four raw sensor streams (phone/watch x gyro/accel), one frame each.
phone_gyro_df, phone_accel_df, watch_gyro_df, watch_accel_df = [
    read_file(pattern)
    for pattern in (
        'wisdm-dataset/raw/phone/gyro/*.txt',
        'wisdm-dataset/raw/phone/accel/*.txt',
        'wisdm-dataset/raw/watch/gyro/*.txt',
        'wisdm-dataset/raw/watch/accel/*.txt',
    )
]

## b) Exploratory Data Analysis

In [70]:
def plot_subject_activity(df, title_append=None):
    """Plot the x, y and z readings of one subject/activity recording.

    Draws three stacked line plots (one per coordinate) against the row
    position, under a shared title naming the subject and activity found in
    the first row.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must contain ``subject_id``, ``activity_code``, ``x_coord``,
        ``y_coord`` and ``z_coord``. The title is taken from the first row
        only, so the frame should already be filtered to a single subject
        and activity.
    title_append : object, optional
        Extra text appended to the title in parentheses.

    Raises
    ------
    ValueError
        If ``df`` contains no rows.
    """
    selected = df[['subject_id', 'activity_code', 'x_coord', 'y_coord', 'z_coord']]
    # Materialise once: every len()/compute() on a Dask frame re-runs the whole
    # graph, and the original did so six times per plot.
    data = selected.compute() if hasattr(selected, 'compute') else selected
    if len(data) == 0:
        raise ValueError('Cannot plot an empty DataFrame: no rows match the selection')

    # Positional access: after filtering, the first row's index label is
    # generally not 0, so label-based ``[0]`` would raise KeyError.
    subject_id = data['subject_id'].iloc[0]
    activity_code = data['activity_code'].iloc[0]
    title = f'Subject {subject_id} Performing Activity {activity_code}'
    if title_append is not None:
        title += f' ({title_append})'

    fig, ax = plt.subplots(3, figsize=(20, 10))
    fig.suptitle(title)
    intervals = range(len(data))
    for axis, column in zip(ax, ('x_coord', 'y_coord', 'z_coord')):
        axis.plot(intervals, data[column].to_numpy())
        axis.set_title(column)
    ax[2].set_xlabel('Interval')

In [None]:
# Plot the phone gyroscope readings of subject 1600 performing activity 'A'.
is_subject_1600 = phone_gyro_df['subject_id'] == 1600
is_activity_a = phone_gyro_df['activity_code'] == 'A'
temp = phone_gyro_df[is_subject_1600 & is_activity_a]
plot_subject_activity(temp, 'Phone Gyro')

In [None]:
# Join gyroscope and accelerometer data by device

In [None]:
# Number of phone gyroscope readings for subject 1600 performing activity 'A'.
subject_1600_activity_a = phone_gyro_df[
    (phone_gyro_df['subject_id'] == 1600) & (phone_gyro_df['activity_code'] == 'A')
]
len(subject_1600_activity_a)

In [None]:
# Pair each watch accelerometer reading with the watch gyroscope reading that
# shares its subject and timestamp. Explicit suffixes keep the overlapping
# columns readable (x_coord_accel / x_coord_gyro instead of x_coord_x / x_coord_y).
# NOTE(review): activity_code is not part of the join key, so it is duplicated
# as activity_code_accel / activity_code_gyro — confirm whether it should be.
combined_watch_df = watch_accel_df.merge(
    watch_gyro_df,
    on=["subject_id", "timestamp"],
    how="inner",
    suffixes=("_accel", "_gyro"),
)
# Preview only: .compute() would pull the entire join result into the notebook
# process just to display it.
combined_watch_df.head()

In [None]:
# Pair each phone accelerometer reading with the phone gyroscope reading that
# shares its subject and timestamp. Explicit suffixes keep the overlapping
# columns readable (x_coord_accel / x_coord_gyro instead of x_coord_x / x_coord_y).
# NOTE(review): activity_code is not part of the join key, so it is duplicated
# as activity_code_accel / activity_code_gyro — confirm whether it should be.
combined_phone_df = phone_accel_df.merge(
    phone_gyro_df,
    on=["subject_id", "timestamp"],
    how="inner",
    suffixes=("_accel", "_gyro"),
)
# Preview only: .compute() would pull the entire join result into the notebook
# process just to display it.
combined_phone_df.head()

In [None]:
# Group by subject id, activity, time interval window (3s) - mean, std x, y, z with group by

# https://stackoverflow.com/questions/35898667/group-by-time-and-other-column-in-pandas

In [None]:
# Join back together for single dataframe 

# 2. Model Selection & Training

In [None]:
# train test split
# fit model
# hyperparameter tuning depending on model selected (if time permits)

# 3. Model Test

In [None]:
# test / validate and report final accuracy

# 4. Results & Conclusion

In [None]:
# Summarize final results and conclusions - re-include any helpful charts/graphs

In [None]:
# Shut down the Dask client now that the notebook is finished; run this cell last,
# since Dask computations in earlier cells need the client to be running.
client.shutdown()