# Task 1: Dataset Exploration
Create a summary table: participants, time per participant, demographics, total labels.

In [9]:
# import libraries
import os
from pathlib import Path
import numpy as np
import pandas as pd

In [10]:
# get paths, check validity
DATA_ROOT = Path("wisdm-dataset")
RAW_DIR = DATA_ROOT / "raw"
ARFF_DIR = DATA_ROOT / "arff_files"

print("DATA_ROOT exists:", DATA_ROOT.exists())
print("Subfolders:")
for p in DATA_ROOT.iterdir():
    print(" -", p)

print("\nRAW subfolders:")
for p in RAW_DIR.rglob("*"):
    if p.is_dir():
        print(" -", p)

print("\nARFF subfolders:")
for p in ARFF_DIR.rglob("*"):
    if p.is_dir():
        print(" -", p)

DATA_ROOT exists: True
Subfolders:
 - wisdm-dataset\.activity_key.txt.swp
 - wisdm-dataset\activity_key.txt
 - wisdm-dataset\arffmagic-master
 - wisdm-dataset\arff_files
 - wisdm-dataset\change_raw_act.pl
 - wisdm-dataset\raw
 - wisdm-dataset\README.txt

RAW subfolders:
 - wisdm-dataset\raw\phone
 - wisdm-dataset\raw\watch
 - wisdm-dataset\raw\phone\accel
 - wisdm-dataset\raw\phone\gyro
 - wisdm-dataset\raw\watch\accel
 - wisdm-dataset\raw\watch\gyro

ARFF subfolders:
 - wisdm-dataset\arff_files\phone
 - wisdm-dataset\arff_files\watch


 - wisdm-dataset\arff_files\phone\accel
 - wisdm-dataset\arff_files\phone\gyro
 - wisdm-dataset\arff_files\watch\accel
 - wisdm-dataset\arff_files\watch\gyro


In [11]:
# load and clean data
def load_raw_sensor_data(device="watch", sensor="accel"):
    """
    Load all raw files for a given device/sensor pair into a single DataFrame.

    device: "phone" or "watch"
    sensor: "accel" or "gyro"
    """
    sensor_dir = RAW_DIR / device / sensor
    if not sensor_dir.exists():
        raise FileNotFoundError(f"Sensor directory does not exist: {sensor_dir}")

    all_files = sorted(sensor_dir.glob(f"data_*_{sensor}_{device}.txt"))
    if not all_files:
        raise FileNotFoundError(f"No data files found in {sensor_dir}")

    dfs = []
    for f in all_files:
        df = pd.read_csv(
            f,
            header=None,
            names=["subject_id", "activity", "timestamp", "x", "y", "z_raw"],
            sep=",",
            engine="python"
        )

        # remove trailing ;
        df["z"] = df["z_raw"].astype(str).str.replace(";", "", regex=False).astype(float)
        df = df.drop(columns=["z_raw"])

        dfs.append(df)

    full_df = pd.concat(dfs, ignore_index=True)
    return full_df

In [12]:
# check for null or bad values in the dataset
VALID_ACTIVITIES = set(list("ABCDEFGHIJKLMOPQRS"))  # Aâ€“S without N

def check_dataframe_quality(df, name="data"):
    print(f"\n=== Quality report: {name} ===")

    # shape of data
    print(f"Rows: {len(df):,}, Columns: {df.shape[1]}")

    print("\nColumn dtypes:")
    print(df.dtypes)

    # number of null or nan values
    print("\nNull / NaN counts:")
    print(df.isna().sum())

    # number of infinity values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        inf_mask = np.isinf(df[numeric_cols].to_numpy())
        n_inf = inf_mask.sum()
        print(f"\nNumber of infinite values in numeric columns: {n_inf}")
        if n_inf > 0:
            col_inf_counts = np.isinf(df[numeric_cols]).sum()
            print("Inf counts per numeric column:")
            print(col_inf_counts)
    else:
        print("\nNo numeric columns detected for inf check.")

In [15]:
# watch data quality checks
watch_accel = load_raw_sensor_data(device="watch", sensor="accel")
watch_gyro = load_raw_sensor_data(device="watch", sensor="gyro")

check_dataframe_quality(watch_accel, name="watch_accel")
check_dataframe_quality(watch_gyro, name="watch_gyro")


=== Quality report: watch_accel ===
Rows: 3,777,046, Columns: 6

Column dtypes:
subject_id      int64
activity       object
timestamp       int64
x             float64
y             float64
z             float64
dtype: object

Null / NaN counts:
subject_id    0
activity      0
timestamp     0
x             0
y             0
z             0
dtype: int64

Number of infinite values in numeric columns: 0

=== Quality report: watch_gyro ===
Rows: 3,440,342, Columns: 6

Column dtypes:
subject_id      int64
activity       object
timestamp       int64
x             float64
y             float64
z             float64
dtype: object

Null / NaN counts:
subject_id    0
activity      0
timestamp     0
x             0
y             0
z             0
dtype: int64

Number of infinite values in numeric columns: 0


In [None]:
# helper for time per participant
def time_per_participant_from_counts(df, sampling_rate=20.0):
    counts = df.groupby("subject_id").size().rename("n_samples")
    summary = counts.to_frame()
    
    summary["time_sec"] = summary["n_samples"] / sampling_rate
    summary["time_min"] = summary["time_sec"] / 60.0
    summary["time_hr"] = summary["time_sec"] / 3600.0
    
    return summary

In [24]:
# summary of data
def summarize_participants(df, name="data"):
    print(f"\n=== Participant summary: {name} ===")
    participants = df["subject_id"].unique()
    print(f"Number of participants: {len(participants)}")

    unique_labels = sorted(watch_accel["activity"].unique())
    print(f"Number of unique labels: {len(unique_labels)}")

    avg_min = time_per_participant_from_counts(df)["time_min"].mean()
    print("Average minutes per participant:", round(avg_min, 2))

# Example
summarize_participants(watch_accel, name="watch_accel")


=== Participant summary: watch_accel ===
Number of participants: 51
Number of unique labels: 18
Average minutes per participant: 61.72
