In [4]:
import os
import pandas as pd
import numpy as np

# Function to extract features from Actigraphy data
def extract_features(data):
    """Summarise one actigraphy recording as a single-row feature table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'X'``, ``'Y'``, ``'Z'``, ``'enmo'`` and
        ``'non-wear_flag'``. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with the columns ``mean_x/y/z``, ``std_x/y/z``,
        ``mean_enmo``, ``std_enmo``, ``max_enmo``, ``min_enmo``,
        ``mean_magnitude``, ``std_magnitude``, ``activity_level``,
        ``inactivity_percentage`` and ``non_wear_percentage``.
        ``activity_level`` is the most frequent magnitude bin label, or NaN
        when no sample could be binned (empty or all-NaN input).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Per-axis statistics
    mean_x = data['X'].mean()
    mean_y = data['Y'].mean()
    mean_z = data['Z'].mean()
    std_x = data['X'].std()
    std_y = data['Y'].std()
    std_z = data['Z'].std()

    # ENMO statistics
    mean_enmo = data['enmo'].mean()
    std_enmo = data['enmo'].std()
    max_enmo = data['enmo'].max()
    min_enmo = data['enmo'].min()

    # Euclidean norm of the three axes
    magnitude = np.sqrt(data['X']**2 + data['Y']**2 + data['Z']**2)
    mean_magnitude = magnitude.mean()
    std_magnitude = magnitude.std()

    # Activity level (binned). The last bin is open-ended: with a hard upper
    # edge of 2.0, every sample above it became NaN and was silently left out
    # of the vote, even though it is the strongest movement in the recording.
    binned_magnitude = pd.cut(
        magnitude,
        bins=[0, 0.5, 1.0, 1.5, np.inf],
        labels=['Very Low', 'Low', 'Medium', 'High'],
        include_lowest=True
    )
    # mode() is empty when nothing could be binned; indexing it with [0]
    # would raise, so fall back to NaN instead.
    level_modes = binned_magnitude.mode()
    activity_level = level_modes.iloc[0] if not level_modes.empty else np.nan

    # Share of samples (in percent) whose ENMO is exactly zero
    inactivity_percentage = (data['enmo'] == 0).mean() * 100

    # Share of samples (in percent) flagged as non-wear
    non_wear_percentage = (data['non-wear_flag'] == 1).mean() * 100

    # Collect everything in a fixed column order
    features = {
        'mean_x': mean_x,
        'mean_y': mean_y,
        'mean_z': mean_z,
        'std_x': std_x,
        'std_y': std_y,
        'std_z': std_z,
        'mean_enmo': mean_enmo,
        'std_enmo': std_enmo,
        'max_enmo': max_enmo,
        'min_enmo': min_enmo,
        'mean_magnitude': mean_magnitude,
        'std_magnitude': std_magnitude,
        'activity_level': activity_level,
        'inactivity_percentage': inactivity_percentage,
        'non_wear_percentage': non_wear_percentage
    }

    # One-row DataFrame so per-participant results can be concatenated
    return pd.DataFrame([features])

# input_directory is a directory (despite the ".parquet" suffix in its name)
# that is listed for per-participant parquet files; output_directory receives
# the combined feature table.
# NOTE(review): both are absolute, machine-specific paths - adjust them before
# running this notebook on another machine.
input_directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet"
output_directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features"

# Create the output directory if needed (no separate existence check required)
os.makedirs(output_directory, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the saved table reproducible between runs.
parquet_files = sorted(f for f in os.listdir(input_directory) if f.endswith('.parquet'))

# One single-row DataFrame per successfully processed file
all_features = []
# Files that raised while being read or summarised
failed_files = []

for file in parquet_files:
    file_path = os.path.join(input_directory, file)
    try:
        data = pd.read_parquet(file_path)

        # Remove the 'light' column if it exists
        if 'light' in data.columns:
            data = data.drop(columns=['light'])

        participant_features = extract_features(data)

        # Keep track of which participant the row belongs to. Files are named
        # "id=<participant>.parquet", so the id is the file stem without the
        # "id=" prefix (or the whole stem if the prefix is absent).
        stem = os.path.splitext(file)[0]
        participant_id = stem[len("id="):] if stem.startswith("id=") else stem
        participant_features.insert(0, 'id', participant_id)

        all_features.append(participant_features)

        print(f"Processed file: {file}")
    except Exception as e:
        # Best effort: report the failure and continue with the other files
        failed_files.append(file)
        print(f"Error processing file {file}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually went wrong.
if not all_features:
    raise ValueError(
        f"No features were extracted: found {len(parquet_files)} parquet file(s) "
        f"in {input_directory!r}, {len(failed_files)} of which failed to process."
    )

# Concatenate all per-participant rows into a single DataFrame
final_features = pd.concat(all_features, ignore_index=True)

# Build the output file path
output_file = os.path.join(output_directory, "extracted_features_all_participants.parquet")

# Save the combined features to a parquet file
final_features.to_parquet(output_file)

# Report the outcome and preview the result
if failed_files:
    print(f"Warning: {len(failed_files)} file(s) were skipped due to errors: {failed_files}")
    print(f"Features extracted and saved for {len(all_features)} of {len(parquet_files)} participants.")
else:
    print("Features extracted and saved successfully for all participants.")
print(final_features.head())  # Print the first few rows for review


Processed file: id=00115b9f.parquet
Processed file: id=001f3379.parquet
Processed file: id=00f332d1.parquet
Processed file: id=01085eb3.parquet
Processed file: id=012cadd8.parquet
Processed file: id=012e3869.parquet
Processed file: id=029a19c9.parquet
Processed file: id=02cebf33.parquet
Processed file: id=02cf7384.parquet
Processed file: id=035c96dd.parquet
Processed file: id=03a9019b.parquet
Processed file: id=0417c91e.parquet
Processed file: id=045a0a94.parquet
Processed file: id=04afb6f9.parquet
Processed file: id=04bb1a76.parquet
Processed file: id=04cb2c30.parquet
Processed file: id=04d06a9c.parquet
Processed file: id=04f094a8.parquet
Processed file: id=051680a0.parquet
Processed file: id=055156e2.parquet
Processed file: id=059eed01.parquet
Processed file: id=05bbed1b.parquet
Processed file: id=05db1b9b.parquet
Processed file: id=05e94f88.parquet
Processed file: id=063b16fc.parquet
Processed file: id=064e8da5.parquet
Processed file: id=0668373f.parquet
Processed file: id=067b9287.

In [5]:
# Every processed file contributes one row, so the row count is the
# number of participants present in the feature table.
participant_count = len(final_features)
print(f"Number of rows (participants): {participant_count}")


Number of rows (participants): 996


In [6]:
# Write the same feature table as CSV into the output directory
# (update the file name or directory as needed).
csv_output_file = os.path.join(
    output_directory,
    "extracted_features_all_participants.csv",
)

# index=False keeps the positional row index out of the file
final_features.to_csv(csv_output_file, index=False)

# Confirm where the file was written
print(f"Features successfully saved as a CSV file at: {csv_output_file}")


Features successfully saved as a CSV file at: C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features\extracted_features_all_participants.csv
