In [5]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Directory paths
train_dir = "/Users/ad53533/Desktop/Applied ML/Project/series_train.parquet"
test_dir = "/Users/ad53533/Desktop/Applied ML/Project/series_test.parquet"

# Function to process a single file
def process_time_series_file(filename, dirname):
    # Read the file
    filepath = os.path.join(dirname, filename)
    df = pd.read_parquet(filepath)

    # Extract the 'id' from the filename
    df['id'] = filename.split('=')[1]

    # Convert 'time_of_day' from nanoseconds to human-readable format
    if 'time_of_day' in df.columns:
        df['time_of_day'] = pd.to_datetime(df['time_of_day'], unit='ns')
        df['hour'] = df['time_of_day'].dt.hour
        df['minute'] = df['time_of_day'].dt.minute
        df['weekday'] = df['time_of_day'].dt.weekday
        df['quarter'] = df['time_of_day'].dt.quarter

    # Add motion intensity
    if {'X', 'Y', 'Z'}.issubset(df.columns):
        df['motion_intensity'] = np.sqrt(df['X']**2 + df['Y']**2 + df['Z']**2)

    # Calculate the percentage of time the watch is not worn
    if 'non_wear_flag' in df.columns:
        df['non_wear_percentage'] = df['non_wear_flag'].mean()

    # Return the processed DataFrame
    return df

# Function to process all files incrementally and return a combined dataset
def process_all_files(dirname):
    ids = os.listdir(dirname)
    processed_chunks = []

    for filename in tqdm(ids, desc=f"Processing files in {dirname}"):
        # Process each file
        processed_df = process_time_series_file(filename, dirname)
        processed_chunks.append(processed_df)

    # Combine all processed DataFrames
    combined_df = pd.concat(processed_chunks, ignore_index=True)
    return combined_df

# Process train and test datasets
train_time_series = process_all_files(train_dir)
test_time_series = process_all_files(test_dir)

# Inspect the final datasets
print("Train Time Series Shape:", train_time_series.shape)
print("Test Time Series Shape:", test_time_series.shape)
print(train_time_series.head())


Processing files in /Users/ad53533/Desktop/Applied ML/Project/series_train.parquet: 100%|██████████| 996/996 [00:29<00:00, 34.24it/s]
Processing files in /Users/ad53533/Desktop/Applied ML/Project/series_test.parquet: 100%|██████████| 2/2 [00:00<00:00, 14.64it/s]

Train Time Series Shape: (314569149, 17)
Test Time Series Shape: (439726, 17)
   step         X         Y         Z      enmo     anglez  non-wear_flag  \
0     0 -0.468869  0.412020 -0.236458  0.042506 -19.824650            0.0   
1     1 -0.662526  0.533484  0.064034  0.052847   4.300246            0.0   
2     2 -0.611384  0.227252 -0.150882  0.060734 -16.545208            0.0   
3     3 -0.385799  0.552782 -0.500523  0.070440 -36.452175            0.0   
4     4  0.016133  0.031981 -0.825109  0.081058 -67.488388            0.0   

       light  battery_voltage         time_of_day  weekday  quarter  \
0  27.666666      4179.000000 1970-01-01 15:58:00        3        1   
1  12.666667      4178.666504 1970-01-01 15:58:05        3        1   
2  47.000000      4178.333496 1970-01-01 15:58:10        3        1   
3  63.799999      4178.000000 1970-01-01 15:58:15        3        1   
4   6.000000      4177.666504 1970-01-01 15:58:20        3        1   

   relative_date_PCIAT        id




In [9]:
train_time_series

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT,id,hour,minute,motion_intensity
0,0,-0.468869,0.412020,-0.236458,0.042506,-19.824650,0.0,27.666666,4179.000000,1970-01-01 15:58:00,3,1,28.0,0d01bbf2,15,58,0.667466
1,1,-0.662526,0.533484,0.064034,0.052847,4.300246,0.0,12.666667,4178.666504,1970-01-01 15:58:05,3,1,28.0,0d01bbf2,15,58,0.853021
2,2,-0.611384,0.227252,-0.150882,0.060734,-16.545208,0.0,47.000000,4178.333496,1970-01-01 15:58:10,3,1,28.0,0d01bbf2,15,58,0.669476
3,3,-0.385799,0.552782,-0.500523,0.070440,-36.452175,0.0,63.799999,4178.000000,1970-01-01 15:58:15,3,1,28.0,0d01bbf2,15,58,0.839603
4,4,0.016133,0.031981,-0.825109,0.081058,-67.488388,0.0,6.000000,4177.666504,1970-01-01 15:58:20,3,1,28.0,0d01bbf2,15,58,0.825886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314569144,418627,0.007616,0.580302,-0.817633,0.002992,-54.681950,1.0,0.000000,3098.833252,1970-01-01 17:05:35,3,1,25.0,57de6095,17,5,1.002663
314569145,418628,0.007395,0.580485,-0.817698,0.003070,-54.666637,1.0,0.000000,3098.666748,1970-01-01 17:05:40,3,1,25.0,57de6095,17,5,1.002820
314569146,418629,0.007590,0.580393,-0.817607,0.003002,-54.663216,1.0,0.000000,3098.500000,1970-01-01 17:05:45,3,1,25.0,57de6095,17,5,1.002694
314569147,418630,0.007850,0.580432,-0.817594,0.002989,-54.707905,1.0,0.000000,3098.333252,1970-01-01 17:05:50,3,1,25.0,57de6095,17,5,1.002708


In [5]:
# Check for missing values
missing_values = train_time_series.isnull().sum()
print("Missing Values:")
print(missing_values[missing_values > 0])

# Check unique values in categorical columns
print("Unique values in 'weekday':", train_time_series['weekday'].unique())
print("Unique values in 'quarter':", train_time_series['quarter'].unique())


AttributeError: 'list' object has no attribute 'isnull'