<a href="https://colab.research.google.com/github/suhaani1/FitPulse-Health-Anomaly-Detection-from-Fitness-Devices/blob/main/Milestone2/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Install required libraries
!pip install tsfresh prophet scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import impute

from sklearn.feature_selection import VarianceThreshold
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans, DBSCAN

# from prophet import Prophet




In [17]:
# Upload the dataset manually in Colab and load it into `df`.
import io

from google.colab import files

# files.upload() opens a file picker and returns a {filename: file bytes}
# mapping; the mapping is empty when the dialog is cancelled.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used. It is parsed from the in-memory bytes
# so the result does not depend on the name Colab saved it under (duplicates
# are renamed, e.g. "clean - Copy (2).csv") or on the working directory.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

df.head()


Saving clean - Copy.csv to clean - Copy (2).csv


Unnamed: 0,date,participant_id,resting_heart_rate,daily_steps,hours_sleep
0,2016-04-12 00:00:00+00:00,5528405368,60.8125,0.141667,0.75
1,2016-04-12 00:01:00+00:00,5528405368,61.694927,0.141667,0.75
2,2016-04-12 00:02:00+00:00,5528405368,62.183333,0.141667,0.75
3,2016-04-12 00:03:00+00:00,5528405368,61.358334,0.141667,0.75
4,2016-04-12 00:04:00+00:00,5528405368,61.4,0.141667,0.75


In [18]:
# Parse the raw `date` column into timestamps, drop rows whose date could not
# be parsed, and derive a calendar-day column.
#
# The frame is rebuilt with a non-mutating chain instead of being edited in
# place, so the object displayed by the previous cell is not silently changed.
parsed_dates = pd.to_datetime(
    df["date"],
    format="mixed",    # each value is parsed individually
    dayfirst=True,
    errors="coerce",   # unparseable values become NaT instead of raising
)

df = (
    df.assign(date=parsed_dates)
      .dropna(subset=["date"])                          # discard rows with NaT dates
      .assign(day=lambda frame: frame["date"].dt.date)  # date part only
)

df.head()


Unnamed: 0,date,participant_id,resting_heart_rate,daily_steps,hours_sleep,day
0,2016-04-12 00:00:00+00:00,5528405368,60.8125,0.141667,0.75,2016-04-12
1,2016-04-12 00:01:00+00:00,5528405368,61.694927,0.141667,0.75,2016-04-12
2,2016-04-12 00:02:00+00:00,5528405368,62.183333,0.141667,0.75,2016-04-12
3,2016-04-12 00:03:00+00:00,5528405368,61.358334,0.141667,0.75,2016-04-12
4,2016-04-12 00:04:00+00:00,5528405368,61.4,0.141667,0.75,2016-04-12


In [19]:
# Convert the wide frame to the long format expected by TSFresh:
# one row per (id, time, kind) with a single `value` column.
value_cols = ["resting_heart_rate", "daily_steps", "hours_sleep"]

# Built as one chain (no `inplace=True`), so no step mutates a column-subset
# of another frame. `day` is not melted because it is not part of the output.
tsfresh_df = (
    df.melt(
        id_vars=["participant_id", "date"],
        value_vars=value_cols,
        var_name="kind",
        value_name="value",
    )
    .rename(columns={"participant_id": "id", "date": "time"})
    .loc[:, ["id", "time", "kind", "value"]]
    # Safety cleaning: turn +/-inf into NaN, then drop every row containing NaN.
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

tsfresh_df.head()


Unnamed: 0,id,time,kind,value
0,5528405368,2016-04-12 00:00:00+00:00,resting_heart_rate,60.8125
1,5528405368,2016-04-12 00:01:00+00:00,resting_heart_rate,61.694927
2,5528405368,2016-04-12 00:02:00+00:00,resting_heart_rate,62.183333
3,5528405368,2016-04-12 00:03:00+00:00,resting_heart_rate,61.358334
4,5528405368,2016-04-12 00:04:00+00:00,resting_heart_rate,61.4


In [20]:
# Compute the comprehensive TSFresh feature set: one row per `id`, with one
# group of feature columns per `kind`.
fc_settings = ComprehensiveFCParameters()

features = extract_features(
    tsfresh_df,
    default_fc_parameters=fc_settings,
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    n_jobs=0,
)

# Handle NaN values. impute() fills `features` in place, so its return value
# is not needed here.
impute(features)

features.head()


Feature Extraction: 100%|██████████| 1734/1734 [04:19<00:00,  6.68it/s]
 'daily_steps__friedrich_coefficients__coeff_1__m_3__r_30'
 'daily_steps__friedrich_coefficients__coeff_2__m_3__r_30'
 'daily_steps__friedrich_coefficients__coeff_3__m_3__r_30'
 'daily_steps__max_langevin_fixed_point__m_3__r_30'
 'daily_steps__query_similarity_count__query_None__threshold_0.0'
 'hours_sleep__query_similarity_count__query_None__threshold_0.0'
 'resting_heart_rate__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


Unnamed: 0,daily_steps__variance_larger_than_standard_deviation,daily_steps__has_duplicate_max,daily_steps__has_duplicate_min,daily_steps__has_duplicate,daily_steps__sum_values,daily_steps__abs_energy,daily_steps__mean_abs_change,daily_steps__mean_change,daily_steps__mean_second_derivative_central,daily_steps__median,...,resting_heart_rate__fourier_entropy__bins_5,resting_heart_rate__fourier_entropy__bins_10,resting_heart_rate__fourier_entropy__bins_100,resting_heart_rate__permutation_entropy__dimension_3__tau_1,resting_heart_rate__permutation_entropy__dimension_4__tau_1,resting_heart_rate__permutation_entropy__dimension_5__tau_1,resting_heart_rate__permutation_entropy__dimension_6__tau_1,resting_heart_rate__permutation_entropy__dimension_7__tau_1,resting_heart_rate__query_similarity_count__query_None__threshold_0.0,resting_heart_rate__mean_n_absolute_max__number_of_maxima_7
3290547166,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.556647,0.633663,1.241889,1.211041,1.824397,2.340741,2.744871,2.9942,0.0,81.706416
3656418726,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.720125,0.949594,1.599015,1.486976,2.357687,2.920687,3.120292,3.135494,0.0,79.123523
3764646522,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.673012,1.332179,1.609438,1.747868,1.791759,1.609438,1.386294,1.098612,0.0,69.551474
4130803406,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.693147,0.693147,0.693147,-0.0,2.469579,3.08154,3.282694,3.433987,0.0,79.944182
4217772470,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.853236,1.077573,2.176186,1.686939,2.598143,2.941617,3.120292,3.135494,0.0,83.546995


In [21]:
# Drop features whose variance does not exceed the 0.01 threshold.
selector = VarianceThreshold(threshold=0.01)

# Fit the selector and keep the reduced array.
X_selected = selector.fit_transform(features)

# Rebuild a labelled DataFrame containing only the surviving feature columns.
kept_columns = selector.get_feature_names_out()
selected_features = features.loc[:, kept_columns]

selected_features.head()


Unnamed: 0,daily_steps__has_duplicate_max,daily_steps__has_duplicate_min,daily_steps__has_duplicate,daily_steps__sum_values,daily_steps__abs_energy,daily_steps__median,daily_steps__mean,daily_steps__length,daily_steps__standard_deviation,daily_steps__variation_coefficient,...,resting_heart_rate__fourier_entropy__bins_3,resting_heart_rate__fourier_entropy__bins_5,resting_heart_rate__fourier_entropy__bins_10,resting_heart_rate__fourier_entropy__bins_100,resting_heart_rate__permutation_entropy__dimension_3__tau_1,resting_heart_rate__permutation_entropy__dimension_4__tau_1,resting_heart_rate__permutation_entropy__dimension_5__tau_1,resting_heart_rate__permutation_entropy__dimension_6__tau_1,resting_heart_rate__permutation_entropy__dimension_7__tau_1,resting_heart_rate__mean_n_absolute_max__number_of_maxima_7
3290547166,1.0,1.0,1.0,0.0,0.0,0.0,0.0,35.0,0.0,1.199277,...,0.556647,0.556647,0.633663,1.241889,1.211041,1.824397,2.340741,2.744871,2.9942,81.706416
3656418726,1.0,1.0,1.0,0.0,0.0,0.0,0.0,29.0,0.0,1.199277,...,0.485094,0.720125,0.949594,1.599015,1.486976,2.357687,2.920687,3.120292,3.135494,79.123523
3764646522,1.0,1.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,1.199277,...,0.673012,0.673012,1.332179,1.609438,1.747868,1.791759,1.609438,1.386294,1.098612,69.551474
4130803406,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.199277,...,0.693147,0.693147,0.693147,0.693147,-0.0,2.469579,3.08154,3.282694,3.433987,79.944182
4217772470,1.0,1.0,1.0,0.0,0.0,0.0,0.0,29.0,0.0,1.199277,...,0.627705,0.853236,1.077573,2.176186,1.686939,2.598143,2.941617,3.120292,3.135494,83.546995


In [22]:
# Persist the selected features (index included) to a CSV file in the
# current working directory, then confirm on stdout.
output_csv = "tsfresh_features.csv"
selected_features.to_csv(output_csv)
print("Saved tsfresh_features.csv")


Saved tsfresh_features.csv
