In [1]:
import os
import glob
import numpy as np
import pandas as pd

from util import *

%load_ext autoreload
%autoreload 2

In [2]:
# Load one frame per sensor type through the project helper, accelerometer
# first and gyroscope second.
# NOTE(review): read_sensor_data comes from `util`; where it reads from and
# which columns it returns are not visible here — confirm against util.py.
har_accel_df, har_gyro_df = (
    read_sensor_data(sensor) for sensor in ('accel', 'gyro')
)

# Keep both frames in one list so a later cell can apply the same step to each.
har_df = [har_accel_df, har_gyro_df]

In [3]:
# Sanity check: the loaded frames must have exactly the expected number of rows.
# NOTE(review): presumably these are the row counts of the complete raw
# dataset — confirm the source of the numbers.
EXPECTED_ACCEL_ROWS = 3777046
EXPECTED_GYRO_ROWS = 3440342

# Include the actual count in the failure message so a mismatch can be
# diagnosed without re-running anything.
assert har_accel_df.shape[0] == EXPECTED_ACCEL_ROWS, (
    f'accel: expected {EXPECTED_ACCEL_ROWS} rows, got {har_accel_df.shape[0]}'
)
assert har_gyro_df.shape[0] == EXPECTED_GYRO_ROWS, (
    f'gyro: expected {EXPECTED_GYRO_ROWS} rows, got {har_gyro_df.shape[0]}'
)

In [4]:
# Translate the raw values of the `activity` column through the project's
# activity map, in place, for both sensor frames.
activity_map = get_activity_map()

# Iterate over the two sensor frames by name rather than through `har_df`:
# `har_df` is rebound to the merged DataFrame further down the notebook, so
# looping over it would fail if this cell were re-run after the merge.
for df in (har_accel_df, har_gyro_df):
    mapped = df['activity'].map(activity_map)
    # Series.map yields NaN for values it cannot translate. Falling back to the
    # existing value makes a re-run on already-translated labels a no-op
    # instead of wiping the whole column to NaN.
    # NOTE(review): this also keeps raw values that are genuinely absent from
    # the map (they used to become NaN) — confirm that is acceptable.
    df['activity'] = mapped.fillna(df['activity'])

In [5]:
# Export each sensor frame to CSV unless its file is already on disk.
# Every path is checked on its own: a single combined check would skip BOTH
# exports as soon as either file existed, leaving the other one missing.
os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories

for frame, csv_path in ((har_accel_df, 'data/accel.csv'),
                        (har_gyro_df, 'data/gyro.csv')):
    if not os.path.exists(csv_path):
        frame.to_csv(csv_path, index=False)

In [6]:
# Show the rows of one subject performing one activity, for each sensor.
subject_id = 1600
activity = 'walking'

# The filter is identical for both sensors, so build it once.
selection = f'subject_id == {subject_id} and activity == "{activity}"'

# Loop over the two sensor frames by name rather than through `har_df`:
# `har_df` is rebound to the merged DataFrame further down the notebook, so
# this cell would fail if it were re-run after the merge.
for df in (har_accel_df, har_gyro_df):
    display(df.query(selection))

Unnamed: 0,subject_id,activity,timestamp,accel_x,accel_y,accel_z
0,1600,walking,90426708196641,7.091625,-0.591667,8.195502
1,1600,walking,90426757696641,4.972757,-0.158317,6.696732
2,1600,walking,90426807196641,3.253720,-0.191835,6.107758
3,1600,walking,90426856696641,2.801216,-0.155922,5.997625
4,1600,walking,90426906196641,3.770868,-1.051354,7.731027
...,...,...,...,...,...,...
3600,1600,walking,90606357482081,8.099585,-1.080084,-1.855357
3601,1600,walking,90606407407481,8.166622,-1.180641,-2.358139
3602,1600,walking,90606457332881,8.056489,0.406715,-2.987814
3603,1600,walking,90606507258281,8.477868,0.859219,-2.449119


Unnamed: 0,subject_id,activity,timestamp,gyro_x,gyro_y,gyro_z
0,1600,walking,90426757696641,0.314944,-1.022277,-0.309962
1,1600,walking,90426807196641,0.387382,-0.618541,-0.048972
2,1600,walking,90426856696641,0.070999,-0.209480,-0.195978
3,1600,walking,90426906196641,0.037975,0.254976,-0.156563
4,1600,walking,90426955696641,0.073129,0.719431,-0.001035
...,...,...,...,...,...,...
3598,1600,walking,90606307556681,1.656970,-1.339567,3.452509
3599,1600,walking,90606357482081,1.307563,-1.838111,2.433051
3600,1600,walking,90606407407481,1.996789,-1.512140,1.505206
3601,1600,walking,90606457332881,2.001051,-0.521444,0.906527


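#### We can see that the data is not consistent: some timestamps have a reading from only one of the two sensors (for example, the first accelerometer sample above has no matching gyroscope row). Let's inner-merge the two dataframes on subject, activity and timestamp so that only samples with both sensor readings are kept.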

In [7]:
# Inner-join the two sensor frames: a row survives only when the same
# (subject_id, activity, timestamp) key occurs in both of them.
# NOTE: from this point on `har_df` is the merged DataFrame, no longer the
# list of per-sensor frames it was bound to earlier.
join_keys = ['subject_id', 'activity', 'timestamp']
har_df = pd.merge(left=har_accel_df, right=har_gyro_df, how='inner', on=join_keys)

# Preview the first rows, then report the (rows, columns) shape.
merged_preview = har_df.head()
display(merged_preview)
merged_shape = har_df.shape
display(merged_shape)

Unnamed: 0,subject_id,activity,timestamp,accel_x,accel_y,accel_z,gyro_x,gyro_y,gyro_z
0,1600,walking,90426757696641,4.972757,-0.158317,6.696732,0.314944,-1.022277,-0.309962
1,1600,walking,90426807196641,3.25372,-0.191835,6.107758,0.387382,-0.618541,-0.048972
2,1600,walking,90426856696641,2.801216,-0.155922,5.997625,0.070999,-0.20948,-0.195978
3,1600,walking,90426906196641,3.770868,-1.051354,7.731027,0.037975,0.254976,-0.156563
4,1600,walking,90426955696641,4.661511,0.169689,9.684695,0.073129,0.719431,-0.001035


(3368542, 9)

In [8]:
# Write the merged frame to disk once; an existing file is left untouched.
har_csv_path = 'data/har.csv'
already_saved = os.path.exists(har_csv_path)
if not already_saved:
    har_df.to_csv(har_csv_path, index=False)