In [1]:
import warnings
from pathlib import Path

import pandas as pd

# Column-name constants used by load_ordoneza_dataset for the frames it returns.
ACTIVITY = 'activity'      # activity label column of the activity frame
DEVICE = 'device'          # combined "<place>_<location>_<type>" device name column
START_TIME = 'start_time'  # interval start column (activities and raw sensor rows)
END_TIME = 'end_time'      # interval end column (activities and raw sensor rows)

TIME = 'time'              # timestamp column of the on/off device-event frame
VALUE = 'value'            # boolean state column: True at interval start, False at interval end
NAME = 'name'              # not referenced in the visible cells


def load_ordoneza_dataset(data_dir: "str | Path") -> dict:
    """Load the Ordonez activity and binary-sensor logs found in ``data_dir``.

    Despite the function name, the files read are ``OrdonezB_ADLs.txt`` and
    ``OrdonezB_Sensors.txt``. The activity file mixes tabs and spaces as
    separators, so it is parsed line by line in memory; nothing is written
    to ``data_dir``.

    Parameters
    ----------
    data_dir : str or Path
        Directory containing the two ``OrdonezB_*.txt`` files.

    Returns
    -------
    dict
        ``'activities'``    DataFrame with ``start_time``, ``end_time``, ``activity``.
        ``'devices'``       DataFrame of on/off events sorted by ``time`` with columns
                            ``time``, ``location``, ``type``, ``place``, ``device``, ``value``.
        ``'device_areas'``  unique ``place``/``location``/``type``/``device`` combinations.
        ``'activity_list'`` unique activity labels in order of first appearance.
        ``'device_list'``   unique device names in order of first appearance in the events.

    Raises
    ------
    FileNotFoundError
        If one of the two input files is missing.

    Warns
    -----
    UserWarning
        If non-blank activity lines without exactly five whitespace-separated
        tokens had to be skipped.
    """
    data_dir = Path(data_dir)

    # ----------------- activity data -----------------
    raw_act_path = data_dir / 'OrdonezB_ADLs.txt'
    activity_rows = []
    malformed = 0
    with open(raw_act_path, 'r') as f_in:
        for i, line in enumerate(f_in):
            if i < 2:  # the first two lines are the header and its underline
                continue

            parts = line.split()
            if not parts:  # blank line
                continue
            if len(parts) != 5:
                # Expected: start date, start time, end date, end time, activity.
                malformed += 1
                continue

            activity_rows.append(
                (f"{parts[0]} {parts[1]}", f"{parts[2]} {parts[3]}", parts[4])
            )

    if malformed:
        # Keep the best-effort behaviour (skip), but make the data loss visible.
        warnings.warn(
            f"Skipped {malformed} malformed line(s) in {raw_act_path}",
            stacklevel=2,
        )

    df_act = pd.DataFrame(activity_rows, columns=[START_TIME, END_TIME, ACTIVITY])
    df_act[START_TIME] = pd.to_datetime(df_act[START_TIME])
    df_act[END_TIME] = pd.to_datetime(df_act[END_TIME])

    # ----------------- device data -----------------
    df_sen = pd.read_csv(
        data_dir / 'OrdonezB_Sensors.txt',
        delimiter='\t+',
        skiprows=[0, 1],
        names=[START_TIME, END_TIME, 'location', 'type', 'place'],
        engine='python'
    )

    # Build a unique device name from where the sensor is and what it measures.
    df_sen[DEVICE] = df_sen['place'] + '_' + df_sen['location'] + '_' + df_sen['type']

    # Timestamp fields may carry a stray space next to the tab separator;
    # strip it so datetime parsing does not depend on lenient whitespace handling.
    df_sen[START_TIME] = pd.to_datetime(df_sen[START_TIME].str.strip())
    df_sen[END_TIME] = pd.to_datetime(df_sen[END_TIME].str.strip())

    # Lookup table: one row per distinct device.
    device_areas = df_sen[['place', 'location', 'type', DEVICE]].drop_duplicates()

    # Turn every [start, end] interval into an "on" event and an "off" event.
    df_start = (
        df_sen.drop(columns=END_TIME)
        .rename(columns={START_TIME: TIME})
        .assign(**{VALUE: True})
    )
    df_end = (
        df_sen.drop(columns=START_TIME)
        .rename(columns={END_TIME: TIME})
        .assign(**{VALUE: False})
    )

    # Interleave on/off per source row first, then sort stably by time. Ties are
    # thereby resolved deterministically: file order, and "on" before "off" for
    # the same row (a zero-length activation must not end before it starts).
    df_events = (
        pd.concat([df_start, df_end])
        .sort_index(kind='mergesort')
        .sort_values(TIME, kind='mergesort')
        .reset_index(drop=True)
    )

    return {
        'activities': df_act,
        'devices': df_events,
        'device_areas': device_areas,
        'activity_list': df_act[ACTIVITY].unique().tolist(),
        'device_list': df_events[DEVICE].unique().tolist()
    }

if __name__ == "__main__":
    # Load once; the resulting module-level `dataset` is reused by later cells.
    dataset = load_ordoneza_dataset(Path("../dataset/UCI_ADL_Binary"))

    # Preview the first rows of each returned table, one heading per table.
    print("Activities:", dataset['activities'].head(), sep="\n")
    print("\nDevice Events:", dataset['devices'].head(), sep="\n")
    print("\nDevice Areas Mapping:", dataset['device_areas'].head(), sep="\n")


Activities:
           start_time            end_time       activity
0 2012-11-11 21:14:00 2012-11-12 00:22:59  Spare_Time/TV
1 2012-11-12 00:24:00 2012-11-12 00:43:59  Spare_Time/TV
2 2012-11-12 00:48:00 2012-11-12 00:49:59       Grooming
3 2012-11-12 00:50:00 2012-11-12 01:51:59  Spare_Time/TV
4 2012-11-12 01:52:00 2012-11-12 01:52:59       Grooming

Device Events:
                 time location      type    place                device  value
0 2012-11-11 21:14:21     Seat  Pressure   Living  Living_Seat_Pressure   True
1 2012-11-12 00:21:49     Seat  Pressure   Living  Living_Seat_Pressure  False
2 2012-11-12 00:22:57     Door       PIR   Living       Living_Door_PIR   True
3 2012-11-12 00:22:59     Door       PIR   Living       Living_Door_PIR  False
4 2012-11-12 00:23:14     Door       PIR  Kitchen      Kitchen_Door_PIR   True

Device Areas Mapping:
      place location      type                device
0    Living     Seat  Pressure  Living_Seat_Pressure
1    Living     Door       

In [2]:
from utils import correct_activities
# Clean up the activity intervals with utils.correct_activities and keep the
# applied corrections for inspection.
# Author's rationale: Grooming is often overlapped by Sleeping; Grooming is
# considered more important, so it is listed in `excepts` to stay the dominant
# activity (an opinionated choice).
# NOTE(review): the exact meaning of `excepts` is defined in utils — confirm there.
# The result overwrites dataset['activities'], so re-running this cell feeds the
# already-corrected frame back into correct_activities.
dataset['activities'], correction_act = correct_activities(
    dataset['activities'],
    excepts=['Grooming'],
    retain_corrections=True,
)
print(correction_act)

[(            start_time            end_time   activity
69 2012-11-14 00:28:00 2012-11-14 00:29:59  Toileting
70 2012-11-14 00:29:00 2012-11-14 05:12:59   Sleeping,                start_time            end_time   activity
0 2012-11-14 00:28:00.000 2012-11-14 00:29:00  Toileting
1 2012-11-14 00:29:00.001 2012-11-14 05:12:59   Sleeping), (            start_time            end_time       activity
82 2012-11-14 12:29:00 2012-11-14 12:52:59  Spare_Time/TV
83 2012-11-14 12:52:00 2012-11-14 12:54:59        Leaving,                start_time            end_time       activity
0 2012-11-14 12:29:00.000 2012-11-14 12:52:00  Spare_Time/TV
1 2012-11-14 12:52:00.001 2012-11-14 12:54:59        Leaving), (            start_time            end_time       activity
89 2012-11-14 20:11:00 2012-11-14 21:37:59  Spare_Time/TV
90 2012-11-14 21:37:00 2012-11-14 21:47:59         Dinner,                start_time            end_time       activity
0 2012-11-14 20:11:00.000 2012-11-14 21:37:00  Spare_Time/TV
1 2