<a href="https://colab.research.google.com/github/soumya0422/FitPulse-Health-Anomaly-Detection-from-Fitness-Devices-/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so later
# cells can write results there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import os
# Set before the third-party imports below so the flag is already present in
# the environment when those libraries are loaded.
# NOTE(review): presumably this avoids numba JIT compilation inside a tsfresh
# dependency -- confirm it is still needed.
os.environ['NUMBA_DISABLE_JIT'] = '1'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold


In [19]:
from google.colab import files

# Prompt for the dataset upload. The result of files.upload() is a mapping of
# uploaded file names to their raw bytes (see the Colab docs), not a DataFrame,
# so it is kept under its own name instead of `df`, which the next cell uses
# for the parsed DataFrame.
uploaded = files.upload()

In [20]:
# Load dataset (Time is parsed into datetimes at read time)
df = pd.read_csv("/content/processed_data.csv", parse_dates=["Time"])

# Sort by Id and Time so each user's readings are in chronological order
df.sort_values(["Id", "Time"], inplace=True)

print("Data Loaded")
display(df.head())

# ------------------------------
# Data Quality Checks
# ------------------------------

# Identify numeric columns only
# NOTE(review): this also picks up identifier-like columns such as Id; mean
# imputation below would be meaningless for them if they ever contained NaN.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Check for NaNs
print("NaNs per column:")
print(df.isna().sum())

# Check for infinite values.
# Count both +inf and -inf: the cleaning step below replaces both, so the
# report must cover both (a plain `== np.inf` comparison misses -inf).
print("\nInfinite values per column:")
print(df[numeric_cols].isin([np.inf, -np.inf]).sum())

# Check for extremely large values (overflow risk)
print("\nExtremely large values per column (>|1e308|):")
print((df[numeric_cols].abs() > 1e308).sum())

# ------------------------------
# Clean Data
# ------------------------------

# Replace infinite values with NaN first so they are imputed like any other gap
df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

# Fill remaining NaNs with column mean (computed over the whole column,
# i.e. across all Ids)
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

print("\n Data cleaned. Ready for feature extraction.")


Data Loaded


Unnamed: 0,Id,Time,heart_rate,StepTotal,sleep_efficiency,fragmented_sleep,sleep_hours,daily_sleep_date
0,2022484408,2016-04-12 07:00:00+00:00,83.2,847.0,0.946789,0,7.05,2016-04-12
1,2022484408,2016-04-12 08:00:00+00:00,68.562005,334.0,0.946789,0,7.05,2016-04-12
2,2022484408,2016-04-12 09:00:00+00:00,66.4047,243.0,0.946789,0,7.05,2016-04-12
3,2022484408,2016-04-12 10:00:00+00:00,106.716075,5243.0,0.946789,0,7.05,2016-04-12
4,2022484408,2016-04-12 11:00:00+00:00,67.767157,323.0,0.946789,0,7.05,2016-04-12


NaNs per column:
Id                  0
Time                0
heart_rate          0
StepTotal           0
sleep_efficiency    0
fragmented_sleep    0
sleep_hours         0
daily_sleep_date    0
dtype: int64

Infinite values per column:
Id                  0
heart_rate          0
StepTotal           0
sleep_efficiency    0
fragmented_sleep    0
sleep_hours         0
dtype: int64

Extremely large values per column (>|1e308|):
Id                  0
heart_rate          0
StepTotal           0
sleep_efficiency    0
fragmented_sleep    0
sleep_hours         0
dtype: int64

 Data cleaned. Ready for feature extraction.


In [21]:
# Display the cleaned frame; the rendered output elides the middle rows.
df

Unnamed: 0,Id,Time,heart_rate,StepTotal,sleep_efficiency,fragmented_sleep,sleep_hours,daily_sleep_date
0,2022484408,2016-04-12 07:00:00+00:00,83.200000,847.0,0.946789,0,7.05,2016-04-12
1,2022484408,2016-04-12 08:00:00+00:00,68.562005,334.0,0.946789,0,7.05,2016-04-12
2,2022484408,2016-04-12 09:00:00+00:00,66.404700,243.0,0.946789,0,7.05,2016-04-12
3,2022484408,2016-04-12 10:00:00+00:00,106.716075,5243.0,0.946789,0,7.05,2016-04-12
4,2022484408,2016-04-12 11:00:00+00:00,67.767157,323.0,0.946789,0,7.05,2016-04-12
...,...,...,...,...,...,...,...,...
6029,8877689391,2016-05-12 10:00:00+00:00,68.733503,514.0,0.946789,0,7.05,2016-05-12
6030,8877689391,2016-05-12 11:00:00+00:00,72.030108,1407.0,0.946789,0,7.05,2016-05-12
6031,8877689391,2016-05-12 12:00:00+00:00,87.614719,3135.0,0.946789,0,7.05,2016-05-12
6032,8877689391,2016-05-12 13:00:00+00:00,64.737705,307.0,0.946789,0,7.05,2016-05-12


In [22]:
# Reshape from wide to long: each of the three signal columns becomes rows
# labelled by `kind`, with the reading stored in `value`. Id and Time are
# carried along as identifiers.
ts_data = pd.melt(
    df,
    id_vars=["Id", "Time"],
    value_vars=["heart_rate", "StepTotal", "sleep_hours"],
    var_name="kind",
    value_name="value",
)

print("Time-series data prepared for TSFresh")
display(ts_data.head())


Time-series data prepared for TSFresh


Unnamed: 0,Id,Time,kind,value
0,2022484408,2016-04-12 07:00:00+00:00,heart_rate,83.2
1,2022484408,2016-04-12 08:00:00+00:00,heart_rate,68.562005
2,2022484408,2016-04-12 09:00:00+00:00,heart_rate,66.4047
3,2022484408,2016-04-12 10:00:00+00:00,heart_rate,106.716075
4,2022484408,2016-04-12 11:00:00+00:00,heart_rate,67.767157


In [23]:
# Baseline pass: extract the minimal tsfresh feature set per Id and signal
# kind, then discard every feature column that contains a NaN.
# NOTE(review): n_jobs=0 presumably keeps extraction in-process (no worker
# pool) -- confirm against the tsfresh documentation.
minimal_features = extract_features(
    ts_data,
    default_fc_parameters=MinimalFCParameters(),
    column_id="Id",
    column_kind="kind",
    column_sort="Time",
    column_value="value",
    n_jobs=0,
).dropna(axis=1)

print(f"Minimal features extracted: {minimal_features.shape[1]}")


Feature Extraction: 100%|██████████| 42/42 [00:00<00:00, 645.09it/s]

Minimal features extracted: 30





In [24]:
# Full pass: extract the comprehensive tsfresh feature set per Id and signal
# kind. The raw result is kept in `comprehensive_features`; cleaning happens
# on a derived frame so the raw extraction stays untouched.
comprehensive_features = extract_features(
    ts_data,
    default_fc_parameters=ComprehensiveFCParameters(),
    column_id="Id",
    column_kind="kind",
    column_sort="Time",
    column_value="value",
    n_jobs=0,
)

# Cleaning pipeline:
#   1. treat +/-inf as missing,
#   2. keep only columns whose missing fraction is below 0.3,
#   3. impute what is left with the per-column median.
features = (
    comprehensive_features
    .replace([np.inf, -np.inf], np.nan)
    .loc[:, lambda frame: frame.isna().mean() < 0.3]
    .pipe(lambda frame: frame.fillna(frame.median()))
)

print(f"Comprehensive features after cleaning: {features.shape[1]}")


Feature Extraction: 100%|██████████| 42/42 [00:17<00:00,  2.45it/s]


Comprehensive features after cleaning: 2334


In [25]:
from google.colab import drive
import os
from sklearn.feature_selection import VarianceThreshold

# Make sure Google Drive is mounted before writing to it
drive.mount('/content/drive')

# Output folder on Drive; created on first use
drive_path = '/content/drive/MyDrive/FitPulse Health Anomaly Detection/Milestone2/data'
os.makedirs(drive_path, exist_ok=True)

# Drop near-constant features: fit the selector on the cleaned feature matrix
# and keep only the columns whose variance exceeds 0.01.
selector = VarianceThreshold(threshold=0.01).fit(features)
features_selected = features.loc[:, selector.get_support()]

print(f"Features retained after variance threshold: {features_selected.shape[1]}")

# Persist the selected features as CSV on Drive.
# NOTE(review): index=False omits the frame's index from the file; if the
# index carries the per-Id row labels, the saved rows can no longer be traced
# back to a user -- confirm what downstream readers expect.
output_file = os.path.join(drive_path, "extracted_features.csv")
features_selected.to_csv(output_file, index=False)

print(f"Saved extracted_features.csv to {output_file}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Features retained after variance threshold: 1978
Saved extracted_features.csv to /content/drive/MyDrive/FitPulse Health Anomaly Detection/Milestone2/data/extracted_features.csv
