## Dataset Overview
The Human Activity Recognition Trondheim (HARTH) dataset comprises recordings from 22 participants, each wearing two 3-axial Axivity AX3 accelerometers for approximately 2 hours in a free-living environment. The sensors, positioned on the right thigh and lower back, capture motion data essential for human activity recognition (HAR). This dataset's rich, professionally annotated data offers a comprehensive benchmark for developing advanced machine learning models aimed at precise HAR in real-world settings.

- Subject Area: Computer Science/Human Activity Recognition
- Dataset Characteristics: Multivariate, Time Series
- Associated Tasks: Classification
- Feature Type: Real-valued sensor data
- Sampling Rate: 50 Hz
- Annotations: Activities annotated frame-by-frame using video recordings from a chest-mounted camera
- Total Instances: 6,461,328
- Total Features: 8 — a timestamp, six sensor readings (back_x, back_y, back_z, thigh_x, thigh_y, thigh_z), and the activity label.

In [1]:
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
import zipfile
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, TimeDistributed
from keras.models import Sequential
from keras.layers import TimeDistributed, Conv1D, MaxPooling1D, Flatten, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from keras.optimizers import Adam
from keras.optimizers.schedules import ExponentialDecay
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam




## Load the Dataset

In [2]:
# Download the HARTH archive from the UCI repository and load the selected
# participant recordings into one DataFrame.
zip_file_url = "https://archive.ics.uci.edu/static/public/779/harth.zip"
files = [
    'S029.csv', 'S028.csv', 'S027.csv', 'S026.csv', 'S025.csv',
    'S024.csv', 'S023.csv', 'S022.csv', 'S021.csv', 'S020.csv',
    'S019.csv', 'S018.csv', 'S017.csv', 'S016.csv', 'S015.csv',
    'S014.csv', 'S013.csv', 'S012.csv', 'S010.csv', 'S009.csv',
    'S008.csv', 'S006.csv',
]
# Explicit dtypes for the six accelerometer axes and the numeric activity code
data_types = {
    'back_x': 'float64', 'back_y': 'float64', 'back_z': 'float64',
    'thigh_x': 'float64', 'thigh_y': 'float64', 'thigh_z': 'float64',
    'label': 'int32'
}

dataframes = []
# The timeout bounds the connect phase and each wait for data (not the total
# download time), so a stalled connection cannot hang the cell indefinitely.
response = requests.get(zip_file_url, timeout=60)
# Fail here with a clear HTTP error instead of letting an error page reach
# ZipFile and surface as a confusing BadZipFile.
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    for file_name in files:
        # Members are stored under the 'harth/' folder inside the archive
        with zip_file.open('harth/' + file_name) as csv_file:
            df = pd.read_csv(csv_file, dtype=data_types, header=0)
        print(f"Loaded {file_name} with shape: {df.shape}")
        if df.empty:
            print(f"Warning: {file_name} is empty.")
        dataframes.append(df)

# Files may differ in their column sets; concat keeps the union of columns and
# fills the gaps with NaN.
combined_df = pd.concat(dataframes, ignore_index=True)
print(f"Combined DataFrame shape: {combined_df.shape}")
if combined_df.empty:
    raise ValueError("The combined DataFrame is empty. Check if the files are being loaded correctly.")

Loaded S029.csv with shape: (178716, 8)
Loaded S028.csv with shape: (165178, 8)
Loaded S027.csv with shape: (158584, 8)
Loaded S026.csv with shape: (195172, 8)
Loaded S025.csv with shape: (231729, 8)
Loaded S024.csv with shape: (170534, 8)
Loaded S023.csv with shape: (137646, 9)
Loaded S022.csv with shape: (337602, 8)
Loaded S021.csv with shape: (302247, 9)
Loaded S020.csv with shape: (371496, 8)
Loaded S019.csv with shape: (297945, 8)
Loaded S018.csv with shape: (322271, 8)
Loaded S017.csv with shape: (366609, 8)
Loaded S016.csv with shape: (355418, 8)
Loaded S015.csv with shape: (418392, 9)
Loaded S014.csv with shape: (366487, 8)
Loaded S013.csv with shape: (369077, 8)
Loaded S012.csv with shape: (382414, 8)
Loaded S010.csv with shape: (351649, 8)
Loaded S009.csv with shape: (154464, 8)
Loaded S008.csv with shape: (418989, 8)
Loaded S006.csv with shape: (408709, 8)
Combined DataFrame shape: (6461328, 10)


## Add Metadata
Adding labels to the dataset for readability.


In [3]:
# Numeric activity codes used in the 'label' column -> human-readable names
activity_labels = {
    1: 'walking',
    2: 'running',
    3: 'shuffling',
    4: 'stairs (ascending)',
    5: 'stairs (descending)',
    6: 'standing',
    7: 'sitting',
    8: 'lying',
    13: 'cycling (sit)',
    14: 'cycling (stand)',
    130: 'cycling (sit, inactive)',
    140: 'cycling (stand, inactive)'
}

# Map only while the column still holds numeric codes. Re-running this cell
# after the mapping would look up strings in the int-keyed dict and replace
# every label with NaN.
if pd.api.types.is_numeric_dtype(combined_df['label']):
    combined_df['label'] = combined_df['label'].map(activity_labels)

# Output activity labels
activity_labels

{1: 'walking',
 2: 'running',
 3: 'shuffling',
 4: 'stairs (ascending)',
 5: 'stairs (descending)',
 6: 'standing',
 7: 'sitting',
 8: 'lying',
 13: 'cycling (sit)',
 14: 'cycling (stand)',
 130: 'cycling (sit, inactive)',
 140: 'cycling (stand, inactive)'}

## Initial Data Exploration

In [4]:
# The eight original columns: timestamp, six accelerometer axes, and the label
original_features = [
    'timestamp',
    'back_x', 'back_y', 'back_z',
    'thigh_x', 'thigh_y', 'thigh_z',
    'label',
]

# Work on a view restricted to those columns so any extra columns picked up
# during loading do not clutter the summary
temp_df = combined_df.loc[:, original_features]

# Column dtypes and memory footprint
print('Data Info')
temp_df.info()

# Blank line + heading to separate the info dump from the preview
print('\nData Head')

# Last expression of the cell: rendered as the first five rows
temp_df.head()

Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6461328 entries, 0 to 6461327
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  object 
 1   back_x     float64
 2   back_y     float64
 3   back_z     float64
 4   thigh_x    float64
 5   thigh_y    float64
 6   thigh_z    float64
 7   label      object 
dtypes: float64(6), object(2)
memory usage: 394.4+ MB

Data Head


Unnamed: 0,timestamp,back_x,back_y,back_z,thigh_x,thigh_y,thigh_z,label
0,2019-01-12 00:00:00.000,-1.0,-0.071289,-0.215332,-0.99707,-0.124268,0.142334,standing
1,2019-01-12 00:00:00.020,-1.0,-0.084473,-0.210449,-0.964844,-0.107422,0.160645,standing
2,2019-01-12 00:00:00.040,-0.997559,-0.111328,-0.199219,-0.971191,-0.108887,0.170898,standing
3,2019-01-12 00:00:00.060,-1.006592,-0.13916,-0.209717,-0.986084,-0.112061,0.154297,standing
4,2019-01-12 00:00:00.080,-1.030029,-0.140137,-0.22876,-0.98584,-0.127441,0.155029,standing


## Data Preprocessing
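Drop auxiliary columns that are not part of the original eight features and check each remaining column for missing values (none are found, so no imputation is needed).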

In [5]:
# Columns outside the eight original features that should be removed if present
columns_to_drop = [
    'Unnamed: 0',
    'index',
    'back_x_rolling_mean',
    'thigh_x_rolling_mean',
]

# errors='ignore' skips names that are absent, so re-running the cell is safe
combined_df = combined_df.drop(columns=columns_to_drop, errors='ignore')

# Count the missing entries in every remaining column and report them
missing_per_column = combined_df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

Missing values per column:
timestamp    0
back_x       0
back_y       0
back_z       0
thigh_x      0
thigh_y      0
thigh_z      0
label        0
dtype: int64


## LSTM Activity Predictions

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and the train/test split start from the same state on every
# Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Encode the activity labels as integer class ids
label_encoder = LabelEncoder()
combined_df['label_encoded'] = label_encoder.fit_transform(combined_df['label'])

# Define input features (six accelerometer axes) and target variable
X = combined_df[['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']].values
y = combined_df['label_encoded'].values

# Reshape the input features for LSTM
# LSTM input shape: [samples, time steps, features]
# Here, each row is treated as its own sample with a single time step
X = X.reshape(X.shape[0], 1, X.shape[1])

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Define the LSTM model: one LSTM layer, one hidden dense layer, and a softmax
# output with one unit per encoded class
model = Sequential()
model.add(LSTM(units=64, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model; the sparse loss takes the integer class ids directly
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 10% of the training data for validation
history = model.fit(X_train, y_train, epochs=8, batch_size=64, validation_split=0.1, verbose=1)

# Evaluate on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Plot training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.title('LSTM training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()



Epoch 1/8


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8