Determing the threshold for the Battery Level across all participants:

In [19]:
import os
import pandas as pd
import random

# Defining the path to the train folder
TRAIN_DIR = '/Users/nandana_hemanth/Documents/Semester II/DATA 245/Project/Dataset/child-mind-institute-problematic-internet-use/series_train.parquet'
# Step 1: Getting a list of all the participant folders in the train directory
participant_folders = [f for f in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, f))]

# Step 2: Taking the entire participant train dataset
sample_size = 996
sampled_folders = random.sample(participant_folders, sample_size)
battery_data = []

# Step 3: Analyzing Battery Levels for each sampled participant
for folder in sampled_folders:
    # Constructing the path to the parquet file within each folder
    file_path = os.path.join(TRAIN_DIR, folder, 'part-0.parquet')
    
    # Loading the parquet file for the participant
    series_data = pd.read_parquet(file_path)
    
    # Checking if the required columns are present
    if 'time_of_day' in series_data.columns and 'relative_date_PCIAT' in series_data.columns:
        # Creating continuous time scale
        series_data['time_of_day_hours'] = series_data['time_of_day'] / 1e9 / 3600  # Convrting nanoseconds to hours
        series_data['day_time'] = series_data['relative_date_PCIAT'] + (series_data['time_of_day_hours'] / 24)

        # Extracting battery voltage statistics
        battery_stats = series_data['battery_voltage'].describe()
        battery_data.append({
            'id': folder.split('=')[-1],  # Extracting participant ID from folder name
            'mean_battery': battery_stats['mean'],
            'median_battery': battery_stats['50%'],
            '25th_percentile': battery_stats['25%'],
            'min_battery': battery_stats['min']
        })
    else:
        print(f"Missing required columns in {folder}. Skipping this participant.")

# Step 4: Converting battery statistics data into a DataFrame
battery_df = pd.DataFrame(battery_data)
print("Battery Statistics Across Sample Participants:\n", battery_df)

# Step 5: Determining a global battery threshold
global_battery_threshold = battery_df['25th_percentile'].mean()  # Considering the average of the 25th percentile as threshold
print(f"Global Battery Threshold (Average 25th Percentile): {global_battery_threshold:.2f} mV")

Battery Statistics Across Sample Participants:
            id  mean_battery  median_battery  25th_percentile  min_battery
0    2fef897e   3840.489746          3812.0      3741.000000  3098.166748
1    90161e10   3845.631592          3812.0      3747.000000  3098.166748
2    49e4eade   3836.187500          3806.0      3739.166748  3098.166748
3    31b74cd4   3847.067139          3818.0      3741.000000  3098.166748
4    d8037389   3801.283203          3800.0      3741.000000  3098.166748
..        ...           ...             ...              ...          ...
991  1649ce22   3889.106445          3853.0      3788.000000  3683.000000
992  ba73bc7c   3791.461182          3788.0      3730.000000  3098.166748
993  6e9beaee   3832.606934          3806.0      3741.000000  3098.166748
994  b3f5c066   3850.967285          3818.0      3752.062439  3098.166748
995  6ac5763a   3844.443604          3816.0      3741.000000  3098.166748

[996 rows x 5 columns]
Global Battery Threshold (Average 25th P