In [1]:
import numpy as np
import pandas as pd
import glob
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
SD = 42
random.seed(SD)
np.random.seed(SD)
os.environ['PYTHONHASHSEED'] = str(SD)

In [3]:
data_dir = 'Data/ch2025_data_items'
metrics_train_dir = 'Data/ch2025_metrics_train.csv'
submission_dir = 'Data/ch2025_submission_sample.csv'

parquet_files = glob.glob(os.path.join(data_dir, 'ch2025_*.parquet'))
parquet_files

['Data/ch2025_data_items\\ch2025_mACStatus.parquet',
 'Data/ch2025_data_items\\ch2025_mActivity.parquet',
 'Data/ch2025_data_items\\ch2025_mAmbience.parquet',
 'Data/ch2025_data_items\\ch2025_mBle.parquet',
 'Data/ch2025_data_items\\ch2025_mGps.parquet',
 'Data/ch2025_data_items\\ch2025_mLight.parquet',
 'Data/ch2025_data_items\\ch2025_mScreenStatus.parquet',
 'Data/ch2025_data_items\\ch2025_mUsageStats.parquet',
 'Data/ch2025_data_items\\ch2025_mWifi.parquet',
 'Data/ch2025_data_items\\ch2025_wHr.parquet',
 'Data/ch2025_data_items\\ch2025_wLight.parquet',
 'Data/ch2025_data_items\\ch2025_wPedo.parquet']

In [4]:
metrics_train = pd.read_csv(metrics_train_dir)
metrics_train

Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-06-27,2024-06-26,0,0,0,0,0,1
1,id01,2024-06-28,2024-06-27,0,0,0,0,1,1
2,id01,2024-06-29,2024-06-28,1,0,0,1,1,1
3,id01,2024-06-30,2024-06-29,1,0,1,2,0,0
4,id01,2024-07-01,2024-06-30,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...
445,id10,2024-09-06,2024-09-05,1,0,0,1,0,1
446,id10,2024-09-08,2024-09-07,1,1,0,2,1,1
447,id10,2024-09-09,2024-09-08,1,1,1,0,1,1
448,id10,2024-09-12,2024-09-11,0,0,0,0,0,0


In [5]:
submission = pd.read_csv(submission_dir)

In [6]:
lifelog_data = {}


for file_path in parquet_files:
    name = os.path.basename(file_path).replace('.parquet', '').replace('ch2025_', '')
    lifelog_data[name] = pd.read_parquet(file_path)
    print(f"✅ Loaded: {name}, shape = {lifelog_data[name].shape}")

✅ Loaded: mACStatus, shape = (939896, 3)
✅ Loaded: mActivity, shape = (961062, 3)
✅ Loaded: mAmbience, shape = (476577, 3)
✅ Loaded: mBle, shape = (21830, 3)
✅ Loaded: mGps, shape = (800611, 3)
✅ Loaded: mLight, shape = (96258, 3)
✅ Loaded: mScreenStatus, shape = (939653, 3)
✅ Loaded: mUsageStats, shape = (45197, 3)
✅ Loaded: mWifi, shape = (76336, 3)
✅ Loaded: wHr, shape = (382918, 3)
✅ Loaded: wLight, shape = (633741, 3)
✅ Loaded: wPedo, shape = (748100, 9)


In [7]:
for key, df in lifelog_data.items():
    globals()[f"{key}_df"] = df


In [8]:
for key, df in lifelog_data.items():
    print(key)
    display(df.head(10))

mACStatus


Unnamed: 0,subject_id,timestamp,m_charging
0,id01,2024-06-26 12:03:00,0
1,id01,2024-06-26 12:04:00,0
2,id01,2024-06-26 12:05:00,0
3,id01,2024-06-26 12:06:00,0
4,id01,2024-06-26 12:07:00,0
5,id01,2024-06-26 12:08:00,0
6,id01,2024-06-26 12:09:00,0
7,id01,2024-06-26 12:10:00,0
8,id01,2024-06-26 12:11:00,0
9,id01,2024-06-26 12:12:00,0


mActivity


Unnamed: 0,subject_id,timestamp,m_activity
0,id01,2024-06-26 12:03:00,4
1,id01,2024-06-26 12:04:00,0
2,id01,2024-06-26 12:05:00,0
3,id01,2024-06-26 12:06:00,0
4,id01,2024-06-26 12:07:00,0
5,id01,2024-06-26 12:08:00,0
6,id01,2024-06-26 12:09:00,0
7,id01,2024-06-26 12:10:00,0
8,id01,2024-06-26 12:11:00,3
9,id01,2024-06-26 12:12:00,3


mAmbience


Unnamed: 0,subject_id,timestamp,m_ambience
0,id01,2024-06-26 13:00:10,"[[Music, 0.30902618], [Vehicle, 0.081680894], ..."
1,id01,2024-06-26 13:02:10,"[[Music, 0.62307084], [Vehicle, 0.021118319], ..."
2,id01,2024-06-26 13:04:10,"[[Horse, 0.25209898], [Animal, 0.24263993], [C..."
3,id01,2024-06-26 13:06:10,"[[Speech, 0.93433166], [Inside, large room or ..."
4,id01,2024-06-26 13:08:10,"[[Speech, 0.8935082], [Inside, small room, 0.0..."
5,id01,2024-06-26 13:10:10,"[[Speech, 0.79542226], [Inside, large room or ..."
6,id01,2024-06-26 13:12:10,"[[Speech, 0.8184474], [Buzz, 0.027313255], [In..."
7,id01,2024-06-26 13:14:10,"[[Speech, 0.80110717], [Inside, large room or ..."
8,id01,2024-06-26 13:16:10,"[[Speech, 0.50018805], [Domestic animals, pets..."
9,id01,2024-06-26 13:18:10,"[[Speech, 0.9326062], [Inside, large room or h..."


mBle


Unnamed: 0,subject_id,timestamp,m_ble
0,id01,2024-06-26 12:13:00,"[{'address': '00:15:7C:11:80:8D', 'device_clas..."
1,id01,2024-06-26 12:23:00,"[{'address': '0A:B1:26:4D:76:21', 'device_clas..."
2,id01,2024-06-26 12:33:00,"[{'address': '04:F5:AE:39:95:E0', 'device_clas..."
3,id01,2024-06-26 13:23:00,"[{'address': '06:C0:D2:6D:9F:69', 'device_clas..."
4,id01,2024-06-26 14:23:00,"[{'address': '10:2B:41:74:9F:B1', 'device_clas..."
5,id01,2024-06-26 14:33:00,"[{'address': '00:17:FC:88:60:D5', 'device_clas..."
6,id01,2024-06-26 14:53:00,"[{'address': '06:4F:DA:1E:13:38', 'device_clas..."
7,id01,2024-06-26 15:13:00,"[{'address': '00:17:FC:88:60:D5', 'device_clas..."
8,id01,2024-06-26 15:23:00,"[{'address': '00:17:FC:88:60:D5', 'device_clas..."
9,id01,2024-06-26 15:43:00,"[{'address': '00:17:FC:88:60:D5', 'device_clas..."


mGps


Unnamed: 0,subject_id,timestamp,m_gps
0,id01,2024-06-26 12:03:00,"[{'altitude': 110.6, 'latitude': 0.2077385, 'l..."
1,id01,2024-06-26 12:04:00,"[{'altitude': 110.8, 'latitude': 0.2078068, 'l..."
2,id01,2024-06-26 12:05:00,"[{'altitude': 110.7, 'latitude': 0.2078214, 'l..."
3,id01,2024-06-26 12:06:00,"[{'altitude': 110.7, 'latitude': 0.2078395, 'l..."
4,id01,2024-06-26 12:07:00,"[{'altitude': 110.8, 'latitude': 0.2078478, 'l..."
5,id01,2024-06-26 12:08:00,"[{'altitude': 110.7, 'latitude': 0.207852, 'lo..."
6,id01,2024-06-26 12:09:00,"[{'altitude': 110.7, 'latitude': 0.207851, 'lo..."
7,id01,2024-06-26 12:10:00,"[{'altitude': 110.8, 'latitude': 0.2078541, 'l..."
8,id01,2024-06-26 12:11:00,"[{'altitude': 110.7, 'latitude': 0.2078541, 'l..."
9,id01,2024-06-26 12:12:00,"[{'altitude': 110.7, 'latitude': 0.2078556, 'l..."


mLight


Unnamed: 0,subject_id,timestamp,m_light
0,id01,2024-06-26 12:03:00,534.0
1,id01,2024-06-26 12:13:00,846.0
2,id01,2024-06-26 12:23:00,826.0
3,id01,2024-06-26 12:33:00,851.0
4,id01,2024-06-26 12:43:00,428.0
5,id01,2024-06-26 12:53:00,306.0
6,id01,2024-06-26 13:03:00,482.0
7,id01,2024-06-26 13:13:00,1586.0
8,id01,2024-06-26 13:23:00,1208.0
9,id01,2024-06-26 13:33:00,1403.0


mScreenStatus


Unnamed: 0,subject_id,timestamp,m_screen_use
0,id01,2024-06-26 12:03:00,0
1,id01,2024-06-26 12:04:00,0
2,id01,2024-06-26 12:05:00,0
3,id01,2024-06-26 12:06:00,0
4,id01,2024-06-26 12:07:00,0
5,id01,2024-06-26 12:08:00,0
6,id01,2024-06-26 12:09:00,0
7,id01,2024-06-26 12:10:00,0
8,id01,2024-06-26 12:11:00,0
9,id01,2024-06-26 12:12:00,0


mUsageStats


Unnamed: 0,subject_id,timestamp,m_usage_stats
0,id01,2024-06-26 13:00:00,"[{'app_name': ' 캐시워크', 'total_time': 69}, {'ap..."
1,id01,2024-06-26 13:10:00,"[{'app_name': '통화', 'total_time': 26419}, {'ap..."
2,id01,2024-06-26 13:20:00,"[{'app_name': '메시지', 'total_time': 388651}, {'..."
3,id01,2024-06-26 13:30:00,"[{'app_name': '메시지', 'total_time': 211633}, {'..."
4,id01,2024-06-26 13:50:00,"[{'app_name': '카카오톡', 'total_time': 35446}, {'..."
5,id01,2024-06-26 14:00:00,"[{'app_name': '폴Pay', 'total_time': 9975}, {'a..."
6,id01,2024-06-26 14:10:00,"[{'app_name': ' 캐시워크', 'total_time': 101}, {'a..."
7,id01,2024-06-26 14:20:00,"[{'app_name': '카카오톡', 'total_time': 23348}, {'..."
8,id01,2024-06-26 14:30:00,"[{'app_name': '토스', 'total_time': 133932}, {'a..."
9,id01,2024-06-26 14:40:00,"[{'app_name': '롯데ON', 'total_time': 57190}, {'..."


mWifi


Unnamed: 0,subject_id,timestamp,m_wifi
0,id01,2024-06-26 12:03:00,"[{'bssid': 'a0:0f:37:9a:5d:8b', 'rssi': -78}, ..."
1,id01,2024-06-26 12:13:00,"[{'bssid': 'a0:0f:37:9a:5d:8b', 'rssi': -79}, ..."
2,id01,2024-06-26 12:23:00,"[{'bssid': '10:e3:c7:0a:74:d1', 'rssi': -78}, ..."
3,id01,2024-06-26 12:33:00,"[{'bssid': '10:e3:c7:09:7f:bc', 'rssi': -80}, ..."
4,id01,2024-06-26 12:43:00,"[{'bssid': '56:46:ae:59:b1:13', 'rssi': -44}, ..."
5,id01,2024-06-26 12:53:00,"[{'bssid': '5a:86:94:4e:08:38', 'rssi': -49}, ..."
6,id01,2024-06-26 13:03:00,"[{'bssid': '08:5d:dd:85:4b:3f', 'rssi': -79}, ..."
7,id01,2024-06-26 13:13:00,"[{'bssid': '88:36:6c:99:d3:e8', 'rssi': -84}, ..."
8,id01,2024-06-26 13:23:00,"[{'bssid': '08:5d:dd:cb:b7:d9', 'rssi': -77}, ..."
9,id01,2024-06-26 13:33:00,"[{'bssid': '00:07:89:b6:0e:d8', 'rssi': -80}, ..."


wHr


Unnamed: 0,subject_id,timestamp,heart_rate
0,id01,2024-06-26 12:23:00,"[134, 134, 135, 133, 134, 135, 134, 135, 134, ..."
1,id01,2024-06-26 12:24:00,"[123, 122, 121, 120, 121, 121, 120, 118, 119, ..."
2,id01,2024-06-26 12:25:00,"[120, 119, 117, 116, 119, 121, 123, 123, 121, ..."
3,id01,2024-06-26 12:26:00,"[125, 124, 124, 124, 125, 124, 124, 123, 123, ..."
4,id01,2024-06-26 12:27:00,"[116, 116, 117, 118, 116, 116, 116, 117, 115, ..."
5,id01,2024-06-26 12:28:00,"[111, 110, 110, 111, 112, 112, 110, 111, 111, ..."
6,id01,2024-06-26 12:29:00,"[122, 122, 128, 133, 135, 142, 134, 133, 133, ..."
7,id01,2024-06-26 12:30:00,"[127, 126, 128, 128, 128, 128, 127, 126, 124, ..."
8,id01,2024-06-26 12:31:00,"[116, 116, 117, 119, 118, 119, 120, 120, 119, ..."
9,id01,2024-06-26 12:32:00,"[113, 113, 109, 112, 107, 108, 108, 108, 109, ..."


wLight


Unnamed: 0,subject_id,timestamp,w_light
0,id01,2024-06-26 12:17:00,633.0
1,id01,2024-06-26 12:18:00,483.0
2,id01,2024-06-26 12:19:00,541.0
3,id01,2024-06-26 12:20:00,547.0
4,id01,2024-06-26 12:21:00,547.0
5,id01,2024-06-26 12:22:00,552.0
6,id01,2024-06-26 12:23:00,40.0
7,id01,2024-06-26 12:24:00,28.0
8,id01,2024-06-26 12:25:00,51.0
9,id01,2024-06-26 12:26:00,576.0


wPedo


Unnamed: 0,subject_id,timestamp,step,step_frequency,running_step,walking_step,distance,speed,burned_calories
0,id01,2024-06-26 12:09:00,10,0.166667,0,0,8.33,0.138833,0.0
1,id01,2024-06-26 12:10:00,0,0.0,0,0,0.0,0.0,0.0
2,id01,2024-06-26 12:11:00,0,0.0,0,0,0.0,0.0,0.0
3,id01,2024-06-26 12:12:00,0,0.0,0,0,0.0,0.0,0.0
4,id01,2024-06-26 12:13:00,0,0.0,0,0,0.0,0.0,0.0
5,id01,2024-06-26 12:14:00,0,0.0,0,0,0.0,0.0,0.0
6,id01,2024-06-26 12:15:00,0,0.0,0,0,0.0,0.0,0.0
7,id01,2024-06-26 12:16:00,0,0.0,0,0,0.0,0.0,0.0
8,id01,2024-06-26 12:17:00,0,0.0,0,0,0.0,0.0,0.0
9,id01,2024-06-26 12:18:00,0,0.0,0,0,0.0,0.0,0.0


In [9]:
for key, df in lifelog_data.items():
    print(key,  ": ",  df.info())
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939896 entries, 0 to 939895
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   subject_id  939896 non-null  object        
 1   timestamp   939896 non-null  datetime64[ns]
 2   m_charging  939896 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 21.5+ MB
mACStatus :  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961062 entries, 0 to 961061
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   subject_id  961062 non-null  object        
 1   timestamp   961062 non-null  datetime64[ns]
 2   m_activity  961062 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 22.0+ MB
mActivity :  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476577 entries, 0 to 476576
Data columns (total 3 columns):
 #

In [10]:
for key, df in lifelog_data.items():
    print(df.isna().sum())
    print()

subject_id    0
timestamp     0
m_charging    0
dtype: int64

subject_id    0
timestamp     0
m_activity    0
dtype: int64

subject_id    0
timestamp     0
m_ambience    0
dtype: int64

subject_id    0
timestamp     0
m_ble         0
dtype: int64

subject_id    0
timestamp     0
m_gps         0
dtype: int64

subject_id    0
timestamp     0
m_light       0
dtype: int64

subject_id      0
timestamp       0
m_screen_use    0
dtype: int64

subject_id       0
timestamp        0
m_usage_stats    0
dtype: int64

subject_id    0
timestamp     0
m_wifi        0
dtype: int64

subject_id    0
timestamp     0
heart_rate    0
dtype: int64

subject_id    0
timestamp     0
w_light       0
dtype: int64

subject_id         0
timestamp          0
step               0
step_frequency     0
running_step       0
walking_step       0
distance           0
speed              0
burned_calories    0
dtype: int64



In [11]:
submission['lifelog_date'] = pd.to_datetime(submission['lifelog_date'])
test_key = set(zip(submission['subject_id'], submission['lifelog_date'].dt.date))

dataframes = {
    'mACStatus': (mACStatus_df, 'timestamp'),
    'mActivity': (mActivity_df, 'timestamp'),
    'mAmbience': (mAmbience_df, 'timestamp'),
    'mBle': (mBle_df, 'timestamp'),
    'mGps': (mGps_df, 'timestamp'),
    'mLight': (mLight_df, 'timestamp'),
    'mScreenStatus': (mScreenStatus_df, 'timestamp'),
    'mUsageStats': (mUsageStats_df, 'timestamp'),
    'mWifi': (mWifi_df, 'timestamp'),
    'wHr': (wHr_df, 'timestamp'),
    'wLight': (wLight_df, 'timestamp'),
    'wPedo': (wPedo_df, 'timestamp'),
}


800611

In [12]:
def split_test_train(df, subject_col='subject_id', timestamp_col='timestamp'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df = df.dropna(subset=[timestamp_col])
    df['date_only'] = df[timestamp_col].dt.date
    df['key'] = list(zip(df[subject_col], df['date_only']))\
    
    test_df = df[df['key'].isin(test_key)].drop(columns=['date_only', 'key'])
    train_df = df[~df['key'].isin(test_key)].drop(columns=['date_only', 'key'])

    return test_df, train_df


In [13]:
for name, (df, ts_col) in dataframes.items():
    print(f"⏳ {name} Splitting...")
    test_df, train_df = split_test_train(df.copy(), subject_col='subject_id', timestamp_col=ts_col)
    globals()[f"{name}_test"] = test_df
    globals()[f"{name}_train"] = train_df
    print(f"✅ {name}_test → {test_df.shape}, {name}_train → {train_df.shape}")

⏳ mACStatus Splitting...
✅ mACStatus_test → (335849, 3), mACStatus_train → (604047, 3)
⏳ mActivity Splitting...
✅ mActivity_test → (343579, 3), mActivity_train → (617483, 3)
⏳ mAmbience Splitting...
✅ mAmbience_test → (170453, 3), mAmbience_train → (306124, 3)
⏳ mBle Splitting...
✅ mBle_test → (8140, 3), mBle_train → (13690, 3)
⏳ mGps Splitting...
✅ mGps_test → (287386, 3), mGps_train → (513225, 3)
⏳ mLight Splitting...
✅ mLight_test → (34439, 3), mLight_train → (61819, 3)
⏳ mScreenStatus Splitting...
✅ mScreenStatus_test → (336160, 3), mScreenStatus_train → (603493, 3)
⏳ mUsageStats Splitting...
✅ mUsageStats_test → (16499, 3), mUsageStats_train → (28698, 3)
⏳ mWifi Splitting...
✅ mWifi_test → (27467, 3), mWifi_train → (48869, 3)
⏳ wHr Splitting...
✅ wHr_test → (143311, 3), wHr_train → (239607, 3)
⏳ wLight Splitting...
✅ wLight_test → (233809, 3), wLight_train → (399932, 3)
⏳ wPedo Splitting...
✅ wPedo_test → (288832, 9), wPedo_train → (459268, 9)


In [14]:
#Transform Data per day

def process_mACStatus(df:pd.DataFrame):

    #Todo 사용가능 피처 더 생각해보기기

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df = df.sort_values(['subject_id', 'timestamp'])

    results = []

    for (subj, date), group in df.groupby(['subject_id', 'date']):
        status = group['m_charging'].values #Status On/Off
        times = group['timestamp'].values

        ratio_charging = status.mean()

        #The number Chaging the charge Value
        transitions = (status[1:] != status[:-1]).sum()

        lengths = []
        current_len = 0

        for val in status:
            if val == 1:
                current_len += 1

            elif current_len > 0:
                lengths.append(current_len)
                current_len = 0
        
        #When list is finished
        if current_len > 0:
            lengths.append(current_len)

        avg_charging_duration = np.mean(lengths) if lengths else 0
        max_charging_duration = np.max(lengths) if lengths else 0

        results.append({
            'subject_id': subj,
            'date': date,
            'chrarging_ratio': ratio_charging,
            'avg_charging_duration': avg_charging_duration,
            'max_charging_duration': max_charging_duration,
        })



    return pd.DataFrame(results)

processing_mACStatus = process_mACStatus(mACStatus_df)
print(processing_mACStatus)

    subject_id        date  chrarging_ratio  avg_charging_duration  \
0         id01  2024-06-26         0.215859              13.363636   
1         id01  2024-06-27         0.158571              13.875000   
2         id01  2024-06-28         0.180282              17.066667   
3         id01  2024-06-29         0.286567             192.000000   
4         id01  2024-06-30         0.144286              50.500000   
..         ...         ...              ...                    ...   
695       id10  2024-09-21         0.339552             227.500000   
696       id10  2024-09-22         0.423077             201.666667   
697       id10  2024-09-24         0.500709             141.200000   
698       id10  2024-09-25         0.233094             108.000000   
699       id10  2024-09-26         0.226515              99.666667   

     max_charging_duration  
0                       41  
1                       65  
2                       76  
3                      328  
4             

In [15]:
def process_mActivity(df:pd.DataFrame):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date 

    summary = []

    for (subj, date), group in df.groupby(['subject_id', 'date']):
        counts = group['m_activity'].value_counts(normalize=True)
        row = {'subject_id': subj, 'date': date}

        for i in range(9):
            row[f'activity_{i}_ratio'] = counts.get(i,0)

        row['dominant_activity'] = group['m_activity'].mode()[0]
        row['num_unique_activities'] = group['m_activity'].nunique()

        summary.append(row)

    return pd.DataFrame(summary)


processing_mActivity = process_mActivity(mActivity_df)    
print(processing_mActivity)

    subject_id        date  activity_0_ratio  activity_1_ratio  \
0         id01  2024-06-26          0.125176          0.001406   
1         id01  2024-06-27          0.146528          0.000000   
2         id01  2024-06-28          0.111806          0.000694   
3         id01  2024-06-29          0.065972          0.000000   
4         id01  2024-06-30          0.138194          0.000000   
..         ...         ...               ...               ...   
695       id10  2024-09-21          0.025694          0.000000   
696       id10  2024-09-22          0.007639          0.000000   
697       id10  2024-09-24          0.026389          0.000000   
698       id10  2024-09-25          0.041259          0.000000   
699       id10  2024-09-26          0.044056          0.000000   

     activity_2_ratio  activity_3_ratio  activity_4_ratio  activity_5_ratio  \
0                   0          0.672293          0.157525                 0   
1                   0          0.611111          

In [16]:
top_10_labels = [
    "Inside, small room", "Speech", "Silence", "Music",
    "Narration, monologue", "Child speech, kid speaking",
    "Conversation", "Speech synthesizer", "Shout", "Babbling"
]

def process_mAmbience_top10(df:pd.DataFrame):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date

    # 초기화
    for label in top_10_labels + ['others']:
        df[label] = 0.0

    for idx, row in df.iterrows():
        parsed = ast.literal_eval(row['m_ambience']) if isinstance(row['m_ambience'], str) else row['m_ambience']
        others_prob = 0.0

        for label, prob in parsed:
            prob = float(prob)
            if label in top_10_labels:
                df.at[idx, label] = prob
            else:
                others_prob += prob

        df.at[idx, 'others'] = others_prob

    return df.drop(columns=['m_ambience'])

mAmbience_df2= process_mAmbience_top10(mAmbience_df)

def summarize_mAmbience_daily(df):
    prob_cols = [col for col in df.columns if col not in ['subject_id', 'timestamp', 'date']]

    # 하루 단위로 평균값 요약
    daily_summary = df.groupby(['subject_id', 'date'])[prob_cols].mean().reset_index()
    return daily_summary

processing_mAmbience = summarize_mAmbience_daily(mAmbience_df2)
processing_mAmbience

Unnamed: 0,subject_id,date,"Inside, small room",Speech,Silence,Music,"Narration, monologue","Child speech, kid speaking",Conversation,Speech synthesizer,Shout,Babbling,others
0,id01,2024-06-26,2.183661e-02,2.466539e-01,0.116573,3.041167e-02,1.807150e-03,0.000645,0.000919,0.000033,0.000639,0.000000,0.500729
1,id01,2024-06-27,5.669892e-05,8.622866e-08,0.998611,7.056701e-15,9.689560e-30,0.000000,0.000000,0.000000,0.000000,0.000000,0.001542
2,id01,2024-06-28,1.019045e-05,1.434175e-03,0.995774,7.036657e-15,2.030282e-05,0.000002,0.000002,0.000005,0.000000,0.000000,0.003555
3,id01,2024-06-29,3.832783e-04,8.622867e-08,0.998611,7.056701e-15,9.689562e-30,0.000000,0.000000,0.000000,0.000000,0.000000,0.002089
4,id01,2024-06-30,2.072503e-27,8.622866e-08,0.998685,7.056702e-15,9.689561e-30,0.000000,0.000000,0.000000,0.000000,0.000000,0.000934
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,id10,2024-09-21,3.844562e-02,1.281655e-01,0.012739,4.454126e-02,5.803523e-04,0.001658,0.000257,0.000007,0.000028,0.000153,0.847109
696,id10,2024-09-22,6.442265e-02,1.161079e-01,0.023788,5.760270e-02,6.344101e-04,0.000434,0.000147,0.000026,0.000136,0.000067,0.767387
697,id10,2024-09-24,4.972465e-02,1.235537e-01,0.029274,1.525791e-02,6.240653e-04,0.001087,0.000448,0.000002,0.000000,0.000204,0.836962
698,id10,2024-09-25,5.078197e-02,1.814980e-01,0.009223,3.573640e-02,1.393347e-03,0.000918,0.000725,0.000022,0.000003,0.000132,0.715986


In [17]:
def process_mBle(df:pd.DataFrame):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date

    features = []

    for idx, row in df.iterrows():
        entry = ast.literal_eval(row['m_ble']) if isinstance(row['m_ble'], str) else row['m_ble']

        rssi_list = []
        class_0_cnt = 0
        class_other_cnt = 0

        for device in entry:
            try:
                rssi = int(device['rssi'])
                rssi_list.append(rssi)

                if str(device['device_class']) == 0:
                    class_0_cnt += 1
                else:
                    class_other_cnt += 1

            except:
                continue

        feature = {
            'subject_id': row['subject_id'],
            'date': row['date'],
            'device_class_0_cnt': class_0_cnt,
            'device_class_others_cnt': class_other_cnt,
            'device_count': len(rssi_list),
            'rssi_mean': np.mean(rssi_list) if rssi_list else np.nan,
            'rssi_min': np.min(rssi_list) if rssi_list else np.nan,
            'rssi_max': np.max(rssi_list) if rssi_list else np.nan,
        }
        features.append(feature)


    return pd.DataFrame(features)


def summarize_mBle_daily(df:pd.DataFrame):
        # row 단위 BLE feature 추출
    df = process_mBle(df)

        # 하루 단위로 cnt 합치기
    grouped = df.groupby(['subject_id', 'date']).agg({
            'device_class_0_cnt': 'sum',
            'device_class_others_cnt': 'sum',
            'rssi_mean': 'mean',
            'rssi_min': 'min',
            'rssi_max': 'max',
        }).reset_index()

        # 총합 구해서 비율 계산
    total_cnt = grouped['device_class_0_cnt'] + grouped['device_class_others_cnt']
    grouped['device_class_0_ratio'] = grouped['device_class_0_cnt'] / total_cnt.replace(0, np.nan)
    grouped['device_class_others_ratio'] = grouped['device_class_others_cnt'] / total_cnt.replace(0, np.nan)

    # 필요 없는 원래 cnt 컬럼 제거
    grouped.drop(columns=['device_class_0_cnt', 'device_class_others_cnt'], inplace=True)

    return grouped

processing_mBle = summarize_mBle_daily(mBle_df)
processing_mBle

Unnamed: 0,subject_id,date,rssi_mean,rssi_min,rssi_max,device_class_0_ratio,device_class_others_ratio
0,id01,2024-06-26,-75.668055,-94.0,-27.0,0.0,1.0
1,id01,2024-06-27,-73.848158,-94.0,-34.0,0.0,1.0
2,id01,2024-06-28,-77.019204,-94.0,-39.0,0.0,1.0
3,id01,2024-06-29,-69.110887,-93.0,-33.0,0.0,1.0
4,id01,2024-06-30,-74.805502,-92.0,-35.0,0.0,1.0
...,...,...,...,...,...,...,...
646,id10,2024-09-21,-85.490760,-103.0,-48.0,0.0,1.0
647,id10,2024-09-22,-85.245702,-104.0,-38.0,0.0,1.0
648,id10,2024-09-24,-80.688905,-102.0,-29.0,0.0,1.0
649,id10,2024-09-25,-81.278353,-102.0,-38.0,0.0,1.0


In [18]:
def generator_len(gen):
    count = 0
    for _ in gen:
        count += 1

    return count

In [27]:
def process_mGPS(df:pd.DataFrame):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date

    features = []


    max_iter = len(df)
    for idx, row in tqdm(df.iterrows(), leave=False,total=max_iter):
        gps_list = ast.literal_eval(row['m_gps']) if isinstance(row['m_gps'], str) else row['m_gps']

        altitudes = []
        latitudes = []
        longitudes = []
        speeds = []

        for entry in gps_list:
            try:
                altitudes.append(float(entry['altitude']))
                latitudes.append(float(entry['latitude']))
                longitudes.append(float(entry['longitude']))
                speeds.append(float(entry['speed']))
            except:
                continue

        features.append({
            'subject_id': row['subject_id'],
            'date': row['date'],
            'altitude_mean': np.mean(altitudes) if altitudes else np.nan,
            'latitude_std': np.std(latitudes) if latitudes else np.nan,
            'longitude_std': np.std(longitudes) if longitudes else np.nan,
            'speed_mean': np.mean(speeds) if speeds else np.nan,
            'speed_max': np.max(speeds) if speeds else np.nan,
            'speed_std': np.std(speeds) if speeds else np.nan,
        })

    return pd.DataFrame(features)

m_Gps_df2 = process_mGPS(mGps_df)

m_Gps_df2 = m_Gps_df2.groupby(['subject_id', 'date']).agg({
    'altitude_mean': 'mean',
    'latitude_std': 'mean',
    'longitude_std': 'mean',
    'speed_mean': 'mean',
    'speed_max': 'max',
    'speed_std': 'mean'
}).reset_index()

  0%|          | 0/800611 [00:00<?, ?it/s]

In [20]:
processing_mGPS = m_Gps_df2
del m_Gps_df2

processing_mGPS

Unnamed: 0,subject_id,date,altitude_mean,latitude_std,longitude_std,speed_mean,speed_max,speed_std
0,id01,2024-06-26,90.030669,0.000052,0.000051,0.577513,28.2200,0.454185
1,id01,2024-06-27,92.753390,0.000061,0.000075,0.875319,36.6356,0.682681
2,id01,2024-06-28,91.200044,0.000057,0.000065,0.684367,49.5476,0.547474
3,id01,2024-06-29,102.121991,0.000029,0.000041,0.484402,31.0703,0.357083
4,id01,2024-06-30,108.978348,0.000034,0.000039,0.383900,33.8230,0.268387
...,...,...,...,...,...,...,...,...
655,id10,2024-09-21,104.473020,0.000008,0.000008,0.145535,15.2323,0.082467
656,id10,2024-09-22,98.556704,0.000028,0.000034,0.338729,33.2872,0.233564
657,id10,2024-09-24,99.852833,0.000020,0.000031,0.327904,22.0213,0.213728
658,id10,2024-09-25,94.827374,0.000035,0.000079,0.549382,16.6092,0.392587


In [21]:
def process_mLight(df): #TODO Add deviations, minimum, and maximum for each day and night later
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    # 밤(22~05시), 낮(06~21시) 구분
    df['is_night'] = df['hour'].apply(lambda h: h >= 22 or h < 6)

    # 하루 단위 요약
    daily = df.groupby(['subject_id', 'date']).agg(
        light_mean=('m_light', 'mean'),
        light_std=('m_light', 'std'),
        light_max=('m_light', 'max'),
        light_min=('m_light', 'min'),
        light_night_mean=('m_light', lambda x: x[df.loc[x.index, 'is_night']].mean()),
        light_day_mean=('m_light', lambda x: x[~df.loc[x.index, 'is_night']].mean()),
        light_night_ratio=('is_night', 'mean')  # 밤 시간 측정 비율
    ).reset_index()

    return daily

processing_mLight = process_mLight(mLight_df)
processing_mLight

Unnamed: 0,subject_id,date,light_mean,light_std,light_max,light_min,light_night_mean,light_day_mean,light_night_ratio
0,id01,2024-06-26,364.506849,395.659440,1886.000000,0.0,184.923077,403.416667,0.178082
1,id01,2024-06-27,332.069444,1300.535681,11248.000000,0.0,27.708333,484.250000,0.333333
2,id01,2024-06-28,219.201389,260.682900,1834.000000,0.0,44.541667,306.531250,0.333333
3,id01,2024-06-29,91.416667,312.065205,3498.000000,0.0,1.083333,136.583333,0.333333
4,id01,2024-06-30,98.909722,300.448148,2691.000000,0.0,14.354167,141.187500,0.333333
...,...,...,...,...,...,...,...,...,...
695,id10,2024-09-21,111.584535,266.482829,1272.982544,0.0,27.045001,153.854302,0.333333
696,id10,2024-09-22,92.265942,509.643433,5888.025391,0.0,35.998595,120.399615,0.333333
697,id10,2024-09-24,247.154544,1121.933203,9442.980469,0.0,6.813282,367.325175,0.333333
698,id10,2024-09-25,374.243924,1752.616675,14954.018555,0.0,103.252503,509.739634,0.333333


In [25]:
def process_mScreenStatus(df:pd.DataFrame):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date

    features = []

    for (subj, date), group in df.groupby(['subject_id', 'date']):
        status = group['m_screen_use'].values
        ratio_on = status.mean()
        transitions = (status[1:] != status[:-1]).sum()

        duration = []
        current = 0
        
        for val in status:
            if val == 1:
                current += 1
            elif current > 0:
                duration.append(current)
                current = 0

        if current > 0:
            duration.append(current)

        features.append({
                'subject_id': subj,
                'date': date,
                'screen_on_ratio': ratio_on,
                'screen_on_transitions': transitions,
                'screen_on_duration_avg': np.mean(duration) if duration else 0,
                'screen_on_duration_max': np.max(duration) if duration else 0,
            })

    return pd.DataFrame(features)

pricessing_mScreenStatus = process_mScreenStatus(mScreenStatus_df)
pricessing_mScreenStatus


Unnamed: 0,subject_id,date,screen_on_ratio,screen_on_transitions,screen_on_duration_avg,screen_on_duration_max
0,id01,2024-06-26,0.295359,54,7.777778,60
1,id01,2024-06-27,0.365734,82,12.756098,79
2,id01,2024-06-28,0.319718,90,10.088889,51
3,id01,2024-06-29,0.237857,66,10.090909,82
4,id01,2024-06-30,0.220423,82,7.634146,69
...,...,...,...,...,...,...
695,id10,2024-09-21,0.399270,69,15.628571,148
696,id10,2024-09-22,0.473333,68,18.257143,169
697,id10,2024-09-24,0.325714,89,10.133333,120
698,id10,2024-09-25,0.375000,91,11.086957,93


In [23]:
status = [0, 0, 1, 1, 0, 0, 1]

print(status[1:])
print(status[:-1])

[0, 1, 1, 0, 0, 1]
[0, 0, 1, 1, 0, 0]


In [2]:
dir_name = "Data/PreProcessingData/"
file_name = "data.pkl"

import pickle as pkl
import pandas as pd

lifelog_data:dict[str, pd.DataFrame] = {}
with open(dir_name + file_name, 'rb') as f:
    lifelog_data = pkl.load(f)

for name, df in lifelog_data.items():
    print(name)
    display(df.head())

mACStatus


Unnamed: 0,subject_id,timestamp,m_charging
681,id01,2024-06-27 00:00:00,0
682,id01,2024-06-27 00:01:00,0
683,id01,2024-06-27 00:02:00,0
684,id01,2024-06-27 00:03:00,0
685,id01,2024-06-27 00:04:00,0


mActivity


Unnamed: 0,subject_id,timestamp,m_activity
711,id01,2024-06-27 00:00:00,4
712,id01,2024-06-27 00:01:00,4
713,id01,2024-06-27 00:02:00,4
714,id01,2024-06-27 00:03:00,4
715,id01,2024-06-27 00:04:00,4


mAmbience


Unnamed: 0,subject_id,timestamp,m_ambience
304,id01,2024-06-27 00:00:10,"[[Silence, 0.9999998], [Speech, 8.634858E-8], ..."
305,id01,2024-06-27 00:02:10,"[[Silence, 0.9999998], [Speech, 8.634858E-8], ..."
306,id01,2024-06-27 00:04:10,"[[Silence, 0.9999998], [Speech, 8.634858E-8], ..."
307,id01,2024-06-27 00:06:10,"[[Silence, 0.9999998], [Speech, 8.634858E-8], ..."
308,id01,2024-06-27 00:08:10,"[[Silence, 0.9999998], [Speech, 8.634858E-8], ..."


mBle


Unnamed: 0,subject_id,timestamp,m_ble


mGps


Unnamed: 0,subject_id,timestamp,m_gps
707,id01,2024-06-27 00:00:00,"[{'altitude': 103.9, 'latitude': 0.2306188, 'l..."
708,id01,2024-06-27 00:01:00,"[{'altitude': 103.9, 'latitude': 0.2306402, 'l..."
709,id01,2024-06-27 00:02:00,"[{'altitude': 103.9, 'latitude': 0.2306416, 'l..."
710,id01,2024-06-27 00:03:00,"[{'altitude': 103.9, 'latitude': 0.2306376, 'l..."
711,id01,2024-06-27 00:04:00,"[{'altitude': 103.9, 'latitude': 0.2306338, 'l..."


mLight


Unnamed: 0,subject_id,timestamp,m_light
4395,id01,2024-08-02 00:07:00,0.0
4396,id01,2024-08-02 00:17:00,0.0
4397,id01,2024-08-02 00:27:00,0.0
4398,id01,2024-08-02 00:37:00,0.0
4399,id01,2024-08-02 00:47:00,0.0


mScreenStatus


Unnamed: 0,subject_id,timestamp,m_screen_use
711,id01,2024-06-27 00:00:00,0
712,id01,2024-06-27 00:01:00,0
713,id01,2024-06-27 00:02:00,0
714,id01,2024-06-27 00:03:00,0
715,id01,2024-06-27 00:04:00,0


mUsageStats


Unnamed: 0,subject_id,timestamp,m_usage_stats
1126,id01,2024-07-10 00:30:00,"[{'app_name': '통화', 'total_time': 32062}, {'ap..."
1127,id01,2024-07-10 00:40:00,"[{'app_name': ' ✝️성경일독Q', 'total_time': 690}]"
1128,id01,2024-07-10 01:00:00,"[{'app_name': '재난문자', 'total_time': 47}]"
1129,id01,2024-07-10 01:20:00,"[{'app_name': '재난문자', 'total_time': 46}]"
1130,id01,2024-07-10 01:50:00,"[{'app_name': '재난문자', 'total_time': 45}]"


mWifi


Unnamed: 0,subject_id,timestamp,m_wifi
3572,id01,2024-08-02 00:07:00,"[{'bssid': '04:09:a5:3a:c8:6a', 'rssi': -41}, ..."
3573,id01,2024-08-02 00:17:00,"[{'bssid': '04:09:a5:3a:c8:6a', 'rssi': -40}, ..."
3574,id01,2024-08-02 00:27:00,"[{'bssid': '04:09:a5:3a:c8:6a', 'rssi': -39}, ..."
3575,id01,2024-08-02 00:37:00,"[{'bssid': '04:09:a5:3a:c8:6a', 'rssi': -39}, ..."
3576,id01,2024-08-02 00:47:00,"[{'bssid': '04:09:a5:3a:c8:6a', 'rssi': -39}, ..."


wHr


Unnamed: 0,subject_id,timestamp,heart_rate
86055,id03,2024-08-11 00:00:00,"[97, 98, 95, 96, 95, 96, 98, 96, 95, 94, 94, 9..."
86056,id03,2024-08-11 00:01:00,"[83, 83, 81, 82, 82, 81, 81, 81, 84, 80, 83, 8..."
86057,id03,2024-08-11 00:02:00,"[80, 78, 80, 78, 78, 77, 77, 77, 76, 76, 78, 7..."
86058,id03,2024-08-11 00:03:00,"[70, 68, 66, 63, 64, 63, 63, 64, 65, 66, 68, 6..."
86059,id03,2024-08-11 00:04:00,"[82, 81, 82, 82, 80, 81, 77, 79, 79, 81, 82, 8..."


wLight


Unnamed: 0,subject_id,timestamp,w_light
12252,id01,2024-07-15 00:00:00,0.0
12253,id01,2024-07-15 00:02:00,0.0
12254,id01,2024-07-15 00:03:00,0.0
12255,id01,2024-07-15 00:04:00,0.0
12256,id01,2024-07-15 00:05:00,0.0


wPedo


Unnamed: 0,subject_id,timestamp,step,step_frequency,running_step,walking_step,distance,speed,burned_calories
643,id01,2024-06-27 00:00:00,0,0.0,0,0,0.0,0.0,0.0
644,id01,2024-06-27 00:01:00,0,0.0,0,0,0.0,0.0,0.0
645,id01,2024-06-27 00:02:00,0,0.0,0,0,0.0,0.0,0.0
646,id01,2024-06-27 00:03:00,0,0.0,0,0,0.0,0.0,0.0
647,id01,2024-06-27 00:04:00,0,0.0,0,0,0.0,0.0,0.0


In [9]:
len(lifelog_data['wHr'].iloc[0,2])

59