In [None]:
import os
import torch
from lib.utils import (
load_config,
get_experiment_dir,
make_windowed_dataset_from_sessions,
get_participant_id,
get_participant_projects,
get_raw_dataset_path,
get_sessions_for_project,
)


import pandas as pd

# Participant identifiers are the prefix (text before the first "_") of every
# file name in the window-split data directory.
# Sorted on purpose: iterating a set of strings has no stable order across
# interpreter runs, which would make the anonymised index assigned below
# differ from one kernel restart to the next.
participants = sorted({filename.split('_')[0] for filename in os.listdir('data/002_60s_windowsplit')})

# Only bouts carrying this label are counted in the per-session statistics.
SMOKING_BOUT_LABEL = 'andrew smoking labels'

# One dict per retained session; converted to a DataFrame once at the end.
session_records = []

for i, participant in enumerate(participants):
    participant_id = get_participant_id(participant)
    projects = get_participant_projects(participant_id)

    for project_name in projects:
        # NOTE(review): raw_dataset_path is not used anywhere in this cell —
        # confirm whether this call is still needed.
        raw_dataset_path = get_raw_dataset_path(project_name)

        # Keep sessions that are not explicitly dropped (keep != 0) and whose
        # smoking_verified flag equals 1.
        sessions = get_sessions_for_project(project_name)
        sessions = [s for s in sessions if s.get('keep') != 0 and s.get('smoking_verified') == 1]

        for session in sessions:
            bouts = [bout for bout in session['bouts'] if bout['label'] == SMOKING_BOUT_LABEL]
            # 'start'/'end' and 'start_ns'/'stop_ns' are divided by 1e9 to get
            # seconds, i.e. they are treated as nanosecond timestamps.
            bout_durations_s = [(bout['end'] - bout['start'])/1e9 for bout in bouts]
            total_bout_duration_s = sum(bout_durations_s)
            session_records.append({
                # Position in the sorted participant list, stored as a string.
                'participant': f"{i}",
                'session_id': session['session_id'],
                'session_length_hours': (session['stop_ns'] - session['start_ns'])/1e9/3600,
                'start_ns': session['start_ns'],
                'stop_ns': session['stop_ns'],
                'num_bouts': len(bouts),
                'total_bout_duration_s': total_bout_duration_s,
                'bout_durations_s': bout_durations_s,
                # Sessions without any labelled bout report an average of 0.
                'average_bout_duration_s': total_bout_duration_s/len(bout_durations_s) if len(bout_durations_s) > 0 else 0
            })

df = pd.DataFrame(session_records)
df

In [None]:
# Headline figures for the dataset description printed below.
total_sessions = len(df)
total_participants = df['participant'].nunique()
total_amount_of_data_hours = df['session_length_hours'].sum()
average_data_per_participant_hours = total_amount_of_data_hours / total_participants

# Boolean mask: sessions that contain at least one labelled bout.
sessions_with_bouts = df['num_bouts'] > 0

# NOTE: the final figure is the mean of the per-session average bout
# durations (restricted to sessions with at least one bout), not a mean over
# individual bouts.
summary_lines = (
    f"Across {total_participants} participants, a total of {total_amount_of_data_hours:.1f} hours of time series data were collected across {total_sessions} continuous data segments.",
    f"On average, each participant contributed {average_data_per_participant_hours:.1f} hours of data. Each data segment represents a continuous recording session",
    f"lasting an average of {df['session_length_hours'].mean():.2f} hours. Each session is annotated with smoking bout information,",
    f"enabling detailed analysis of smoking behavior patterns over time. Each participant contributed an average of {total_sessions/total_participants:.1f} sessions to the dataset.",
    f"The total number of smoking bouts recorded across all sessions is {df['num_bouts'].sum()}. The total duration of all smoking bouts across all sessions is {df['total_bout_duration_s'].sum()/3600:.1f} hours.",
    f"The average number of bouts per session is {df['num_bouts'].mean():.2f}. The percentage of sessions containing at least one bout is {sessions_with_bouts.sum() / total_sessions * 100:.1f}%.",
    f"The average duration of smoking bouts across all sessions is {df.loc[sessions_with_bouts, 'average_bout_duration_s'].mean():.1f} seconds.",
)
print("\n".join(summary_lines))

import matplotlib.pyplot as plt
import seaborn as sns

# 2x2 overview: session-length distribution (top row) and bout-count
# distribution (bottom row), each as an overall histogram and as
# per-participant box plots.
fig, ax = plt.subplots(2, 2, figsize=(10, 10), dpi=200)
sns.histplot(df, x='session_length_hours', bins=20, ax=ax[0, 0])
sns.boxplot(df, y='session_length_hours', hue='participant', ax=ax[0, 1])
sns.boxplot(df, y='num_bouts', hue='participant', ax=ax[1, 0])
sns.histplot(df, x='num_bouts', bins=20, ax=ax[1, 1])

In [None]:
# Show the per-session table again (df is already a DataFrame at this point).
pd.DataFrame(data=df)

In [None]:
# Report the length of the first session in the per-session table.
# This reads from `df` rather than from the `sessions` loop variable left over
# from the cell that built the table: that variable only holds the sessions of
# whichever project happened to be iterated last, and may be empty or undefined.
if df.empty:
    print("No sessions were loaded; cannot report a first-session duration.")
else:
    # 'session_length_hours' is already (stop_ns - start_ns) / 1e9 / 3600.
    duration_hours = df['session_length_hours'].iloc[0]
    print(f"Duration of first session in hours: {duration_hours:.2f} hours")

In [None]:
# Closing note: number of distinct participant prefixes found in the data
# directory (displayed as the cell's output value).
"There were {} participants processed. \n".format(len(participants))