Determining the threshold for the time interval across all participants:

In [13]:
import os
import pandas as pd

# Root of the training series data; every sub-directory is treated as one participant
TRAIN_DIR = '/Users/nandana_hemanth/Documents/Semester II/DATA 245/Project/Dataset/child-mind-institute-problematic-internet-use/series_train.parquet'

# Accumulates one summary record (dict) per participant
interval_stats = []

# Percentile columns to report, in the order they appear in the final table
_PERCENTILE_COLUMNS = (
    ('25th_percentile', 0.25),
    ('75th_percentile', 0.75),
    ('90th_percentile', 0.90),
)

for folder in os.listdir(TRAIN_DIR):
    participant_path = os.path.join(TRAIN_DIR, folder)

    # Anything that is not a directory is ignored
    if not os.path.isdir(participant_path):
        continue

    # Progress message so long runs can be followed
    print(f"Processing participant: {folder}")

    # Each participant directory is expected to contain a single parquet part
    file_path = os.path.join(participant_path, 'part-0.parquet')
    series_data = pd.read_parquet(file_path)

    # Build a continuous time axis only when the file does not already carry one.
    # NOTE(review): the 1e9 divisor suggests 'time_of_day' is in nanoseconds and
    # 'relative_date_PCIAT' is in days -- confirm against the dataset description.
    if 'day_time' not in series_data.columns:
        series_data['time_of_day_hours'] = series_data['time_of_day'].div(1e9).div(3600)
        series_data['day_time'] = (
            series_data['relative_date_PCIAT'] + series_data['time_of_day_hours'].div(24)
        )

    # Gap between consecutive rows, scaled by 86400 (days -> seconds under the
    # assumption above) and rounded to a whole number; the first row is NaN
    series_data['time_diff'] = series_data['day_time'].diff().mul(86400).round(0)
    gaps = series_data['time_diff']

    # Assemble the record key by key so the column order of the final table is
    # fixed: id, mean, median, percentiles, max
    record = {
        'participant_id': folder,
        'mean_interval': gaps.mean(),
        'median_interval': gaps.median(),
    }
    for column_name, fraction in _PERCENTILE_COLUMNS:
        record[column_name] = gaps.quantile(fraction)
    record['max_interval'] = gaps.max()

    interval_stats.append(record)

# Turn the per-participant records into a table and summarise across participants
interval_df = pd.DataFrame(interval_stats)
print("Interval Statistics Across Participants:\n", interval_df.describe())

Processing participant: id=0d01bbf2
Processing participant: id=cefdb7fe
Processing participant: id=58391429
Processing participant: id=2ca2206f
Processing participant: id=19455336
Processing participant: id=ca33a5e7
Processing participant: id=92bb8516
Processing participant: id=2812951b
Processing participant: id=6b6467f4
Processing participant: id=9d6b1410
Processing participant: id=22c72c4e
Processing participant: id=b3b200af
Processing participant: id=ebf30e46
Processing participant: id=2a0b8386
Processing participant: id=71c1b1d2
Processing participant: id=e683d2c9
Processing participant: id=bebff291
Processing participant: id=d4d2f272
Processing participant: id=81d3ab22
Processing participant: id=051680a0
Processing participant: id=1aff6762
Processing participant: id=a0522c83
Processing participant: id=b447e66d
Processing participant: id=90161e10
Processing participant: id=adbd6839
Processing participant: id=3e5d5b58
Processing participant: id=cd68643b
Processing participant: id=a