In [1]:
import os
import pandas as pd
import numpy as np

# Directory containing Parquet files
input_directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet"
output_csv = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/feature_extraction.csv"

# Function to extract features from a single DataFrame
def extract_features(df):
    features = {}
    
    # Statistical features
    features['mean_X'] = df['X'].mean()
    features['std_X'] = df['X'].std()
    features['mean_Y'] = df['Y'].mean()
    features['std_Y'] = df['Y'].std()
    features['mean_Z'] = df['Z'].mean()
    features['std_Z'] = df['Z'].std()
    features['mean_enmo'] = df['enmo'].mean()
    features['std_enmo'] = df['enmo'].std()
    features['mean_anglez'] = df['anglez'].mean()
    features['std_anglez'] = df['anglez'].std()
    
    # Activity features
    features['percent_no_motion'] = (df['enmo'] == 0).sum() / len(df) * 100
    features['percent_non_wear'] = (df['non-wear_flag'] == 1).sum() / len(df) * 100
    
    # Battery features
    features['mean_battery_voltage'] = df['battery_voltage'].mean()
    critical_threshold = 3.5  # Example threshold
    features['percent_below_threshold'] = (df['battery_voltage'] < critical_threshold).sum() / len(df) * 100
    
    # Participant ID (if available)
    features['participant_id'] = df['participant_id'].iloc[0] if 'participant_id' in df.columns else None
    
    return features

# Collected all features into a list
all_features = []

# Iterated through each Parquet file in the directory
for filename in os.listdir(input_directory):
    if filename.endswith(".parquet"):
        file_path = os.path.join(input_directory, filename)
        try:
            # Read Parquet file
            df = pd.read_parquet(file_path)
            
            # Extracted features
            features = extract_features(df)
            all_features.append(features)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Created a DataFrame from all features
features_df = pd.DataFrame(all_features)

# Saved to a CSV file
features_df.to_csv(output_csv, index=False)

print(f"Feature extraction complete. Output saved to {output_csv}")


Feature extraction complete. Output saved to C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/feature_extraction.csv


In [6]:
def extract_features(file_path):
    try:
        data = pd.read_parquet(file_path)
        
        # Checked for required columns
        required_columns = ["X", "Y", "Z", "enmo", "anglez", "battery_voltage", "non-wear_flag", "participant_id"]
        if not all(col in data.columns for col in required_columns):
            raise ValueError(f"Missing required columns in {file_path}")
        
        # Statistical features
        features = {
            "participant_id": data["participant_id"].iloc[0],
            "mean_X": data["X"].mean(),
            "std_X": data["X"].std(),
            "mean_Y": data["Y"].mean(),
            "std_Y": data["Y"].std(),
            "mean_Z": data["Z"].mean(),
            "std_Z": data["Z"].std(),
            "mean_enmo": data["enmo"].mean(),
            "std_enmo": data["enmo"].std(),
            "mean_anglez": data["anglez"].mean(),
            "std_anglez": data["anglez"].std(),
        }
        
        # Activity features
        features["percentage_no_motion"] = (data["enmo"] == 0).mean() * 100 if "enmo" in data.columns else np.nan
        features["percentage_non_wear"] = (data["non-wear_flag"] == 1).mean() * 100 if "non-wear_flag" in data.columns else np.nan
        
        # Battery features
        features["mean_battery_voltage"] = data["battery_voltage"].mean() if "battery_voltage" in data.columns else np.nan
        features["percentage_low_battery"] = (
            (data["battery_voltage"] < BATTERY_THRESHOLD).mean() * 100
            if "battery_voltage" in data.columns
            else np.nan
        )
        
        return features
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [7]:
# Visualization
if not features_df.empty:
    key_features = ["mean_enmo", "mean_anglez", "percentage_no_motion", "percentage_non_wear", "mean_battery_voltage"]
    
    for feature in key_features:
        if feature in features_df.columns:  # Ensure feature exists
            plt.figure(figsize=(8, 6))
            sns.histplot(features_df[feature].dropna(), kde=True, bins=30, color="skyblue")
            plt.title(f"Distribution of {feature}")
            plt.xlabel(feature)
            plt.ylabel("Frequency")
            plt.savefig(os.path.join(output_graphs_dir, f"{feature}_distribution.png"))
            plt.close()
        else:
            print(f"Feature {feature} not found in the dataset, skipping plot.")


Feature percentage_no_motion not found in the dataset, skipping plot.
Feature percentage_non_wear not found in the dataset, skipping plot.


In [8]:
#Loaded the Data
import pandas as pd

# Loaded a single participant's data (example)
data = pd.read_parquet("C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet/id=00f332d1.parquet")


In [9]:
# Computed basic statistics for each participant
basic_stats = data.describe().T[['mean', 'std', 'min', 'max', '25%', '50%', '75%']]


In [10]:
# Created time-based features
data['hour'] = pd.to_datetime(data['time_of_day']).dt.hour
data['minute'] = pd.to_datetime(data['time_of_day']).dt.minute
data['is_weekend'] = data['weekday'].apply(lambda x: 1 if x in [5, 6] else 0)


In [11]:
# Example: Daily Aggregations
data['date'] = pd.to_datetime(data['relative_date_PCIAT']).dt.date
daily_aggregations = data.groupby('date').agg({
    'X': ['mean', 'std', 'max', 'min'],
    'Y': ['mean', 'std', 'max', 'min'],
    'Z': ['mean', 'std', 'max', 'min'],
    'enmo': ['mean', 'std', 'max', 'min'],
    'anglez': ['mean', 'std', 'max', 'min'],
    'light': ['mean', 'std', 'max', 'min'],
    'battery_voltage': ['mean', 'std', 'max', 'min']
})


In [12]:
# Non-wear statistics
data['non_wear_duration'] = data['non-wear_flag'].sum()  # Count non-wear periods
data['activity_ratio'] = 1 - data['non-wear_flag'].mean()  # Activity density


In [13]:
# Magnitude of accelerometer data
data['magnitude'] = (data['X']**2 + data['Y']**2 + data['Z']**2)**0.5


In [14]:
magnitude_stats = data['magnitude'].agg(['mean', 'std', 'max', 'min', 'median'])


In [15]:
# Angle statistics
anglez_stats = data['anglez'].agg(['mean', 'std', 'max', 'min'])


In [16]:
# Light and battery voltage statistics
light_stats = data['light'].agg(['mean', 'std', 'max', 'min'])
battery_voltage_stats = data['battery_voltage'].agg(['mean', 'std', 'max', 'min'])


In [18]:
# Activity intensity categorization
data['activity_level'] = pd.cut(data['magnitude'], bins=[0, 0.5, 1.0, 1.5, 2.0], labels=['Low', 'Medium', 'High', 'Very High'])


In [19]:
# Handle missing data
data.fillna(method='ffill', inplace=True)


In [21]:
import glob

# Define the directory where your parquet files are stored
directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet"

# Collect all parquet file paths in that directory
participant_files = glob.glob(f"{directory}*.parquet")


In [23]:
print(participant_files)


[]


In [24]:
for participant_file in participant_files:
    participant_data = pd.read_parquet(participant_file)
    print(participant_data.head())  # Check the first few rows of the data
    participant_features = extract_features(participant_data)
    features_list.append(participant_features)


In [26]:
participant_data = pd.read_parquet("C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet")
print(participant_data.head())


   step         X         Y         Z      enmo     anglez  non-wear_flag  \
0     0  0.021536  0.022214 -1.022370  0.022853 -88.280762            0.0   
1     1  0.022005  0.022187 -1.019740  0.020231 -88.241707            0.0   
2     2  0.022240  0.022005 -1.019401  0.019893 -88.170067            0.0   
3     3  0.021589  0.022578 -1.018177  0.018667 -88.250031            0.0   
0     0  0.679618 -0.578170  0.320939  0.273671  18.857922            0.0   

       light  battery_voltage     time_of_day  weekday  quarter  \
0  53.000000      4188.000000  56940000000000        4        3   
1  51.666668      4188.166504  56945000000000        4        3   
2  50.333332      4188.333496  56950000000000        4        3   
3  50.500000      4188.500000  56955000000000        4        3   
0   6.000000      4175.000000  40260000000000        2        3   

   relative_date_PCIAT   day_time  
0                 41.0  41.659028  
1                 41.0  41.659086  
2                 41.0  41

In [27]:
for participant_file in participant_files:
    participant_data = pd.read_parquet(participant_file)
    participant_features = extract_features(participant_data)
    if participant_features is not None and not participant_features.empty:
        features_list.append(participant_features)
    else:
        print(f"Skipping {participant_file} due to empty features")


In [28]:
if features_list:
    final_features_df = pd.concat(features_list, axis=0)
else:
    print("No features to concatenate.")


No features to concatenate.


In [31]:
# Check if the features_list is populated
if features_list:
    final_features_df = pd.concat(features_list, axis=0)
    # Save the extracted features
    final_features_df.to_parquet("C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project//extracted_features.parquet")
else:
    print("No features to save. Check the feature extraction process.")


No features to save. Check the feature extraction process.


In [32]:
print(f"Number of feature sets: {len(features_list)}")
if len(features_list) > 0:
    final_features_df = pd.concat(features_list, axis=0)
else:
    print("Feature list is empty.")

    

Number of feature sets: 0
Feature list is empty.


In [34]:
import pandas as pd
import glob

# Collected participant file paths (assuming they are parquet files in the directory)
directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/"
participant_files = glob.glob(f"{directory}*.parquet")

features_list = []

# Processed each participant's data
for participant_file in participant_files:
    participant_data = pd.read_parquet(participant_file)
    
    # Assuming you have a function to extract features from participant data
    participant_features = extract_features(participant_data)
    
    if participant_features is not None and not participant_features.empty:
        features_list.append(participant_features)
    else:
        print(f"Skipping {participant_file} due to empty or invalid features")

# Checked if features were extracted
if features_list:
    # Concatenate all the individual features
    final_features_df = pd.concat(features_list, axis=0)
    # Save the concatenated features to a new parquet file
    final_features_df.to_parquet("C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features.parquet")
else:
    print("No features to save.")


ArrowMemoryError: malloc of size 10065728 failed

In [39]:
import pandas as pd
import numpy as np

# Sample file path for a single participant's data (modify with actual file path)
participant_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet/id=00f332d1.parquet"

# Function to extract features from the participant data
def extract_features(data):
    # Feature extraction logic (example)
    # Here we calculate a few sample features. You can modify this based on your actual logic.
    
    # Example: Calculate the mean and standard deviation of selected columns
    mean_x = data['X'].mean()
    mean_y = data['Y'].mean()
    mean_z = data['Z'].mean()
    mean_enmo = data['enmo'].mean()
    
    std_x = data['X'].std()
    std_y = data['Y'].std()
    std_z = data['Z'].std()
    
    # Example: Creating a new feature based on activity level (you could customize this logic)
    magnitude = np.sqrt(data['X']**2 + data['Y']**2 + data['Z']**2)  # Magnitude of acceleration
    activity_level = pd.cut(magnitude, bins=[0, 0.5, 1.0, 1.5, 2.0], labels=['Very Low', 'Low', 'Medium', 'High']).mode()[0]
    
    # Example: Calculate mean  light and battery voltage
    mean_light = data['light'].mean()
    mean_battery_voltage = data['battery_voltage'].mean()
    
    # Created a dictionary with the extracted features
    features = {
        'mean_x': mean_x,
        'mean_y': mean_y,
        'mean_z': mean_z,
        'mean_enmo': mean_enmo,
        'std_x': std_x,
        'std_y': std_y,
        'std_z': std_z,
        'activity_level': activity_level,
        'mean_light': mean_light,
        'mean_battery_voltage': mean_battery_voltage
    }
    
    # Returned as DataFrame (one row per participant)
    return pd.DataFrame([features])

# Loaded the participant data (parquet file)
participant_data = pd.read_parquet(participant_file)

# Extracted features for the participant
participant_features = extract_features(participant_data)

# Checked if the features are valid
if participant_features is not None and not participant_features.empty:
    # Save the extracted features to a parquet file (final output for one participant)
    participant_features.to_parquet("C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features_id=00f332d1.parquet")
    print("Features extracted and saved successfully.")
else:
    print("No valid features extracted.")



Features extracted and saved successfully.


In [41]:
import pandas as pd

# Specify the input Parquet file path and the output CSV file path
parquet_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features_id=00f332d1.parquet"
csv_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features_id=00f332d1.csv"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)

# Saved the DataFrame as a CSV file
df.to_csv(csv_file, index=False)

print(f"Parquet file has been converted to CSV and saved as {csv_file}")


Parquet file has been converted to CSV and saved as C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features_id=00f332d1.csv
