Dataset Loading and Verification

In this step, we connect Google Drive with Google Colab and verify the availability of the required Fitbit datasets.
The datasets include daily activity data, heart rate data, and sleep data, which will be used for health anomaly detection analysis.

In [50]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
import os

base_path = "/content/drive/MyDrive/FitPulse_Milestone1"
os.listdir(base_path)


['dailyActivity_merged.csv',
 'heartrate_seconds_merged.csv',
 'minuteSleep_merged.csv',
 'final_merged_data.csv',
 'Copy of minuteSleep_merged.csv',
 'data']

In [52]:
!pip install pandas numpy   --quiet

In [53]:
import pandas as pd


In [54]:
pd.__version__


'2.2.2'

In [55]:
import pandas as pd
activity = pd.read_csv("/content/drive/MyDrive/FitPulse_Milestone1/dailyActivity_merged.csv")
activity.head()


Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,3/25/2016,11004,7.11,7.11,0.0,2.57,0.46,4.07,0.0,33,12,205,804,1819
1,1503960366,3/26/2016,17609,11.55,11.55,0.0,6.92,0.73,3.91,0.0,89,17,274,588,2154
2,1503960366,3/27/2016,12736,8.53,8.53,0.0,4.66,0.16,3.71,0.0,56,5,268,605,1944
3,1503960366,3/28/2016,13231,8.93,8.93,0.0,3.19,0.79,4.95,0.0,39,20,224,1080,1932
4,1503960366,3/29/2016,12041,7.85,7.85,0.0,2.16,1.09,4.61,0.0,28,28,243,763,1886


In [56]:
activity.shape


(457, 15)

In [57]:
activity.columns

Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
       'LoggedActivitiesDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
      dtype='object')

In [58]:
activity['ActivityDate'] = pd.to_datetime(activity['ActivityDate'])

In [59]:
activity_clean = activity[['Id', 'ActivityDate', 'TotalSteps']]
activity_clean.head()


Unnamed: 0,Id,ActivityDate,TotalSteps
0,1503960366,2016-03-25,11004
1,1503960366,2016-03-26,17609
2,1503960366,2016-03-27,12736
3,1503960366,2016-03-28,13231
4,1503960366,2016-03-29,12041


In [60]:
activity_clean.isnull().sum()

Unnamed: 0,0
Id,0
ActivityDate,0
TotalSteps,0


In [61]:
activity_clean['TotalSteps'].describe()


Unnamed: 0,TotalSteps
count,457.0
mean,6546.562363
std,5398.493064
min,0.0
25%,1988.0
50%,5986.0
75%,10198.0
max,28497.0


In [62]:
activity_clean = activity_clean.sort_values('ActivityDate')
activity_clean.head()


Unnamed: 0,Id,ActivityDate,TotalSteps
165,4020332650,2016-03-12,5543
197,4057192912,2016-03-12,0
166,4020332650,2016-03-13,3226
198,4057192912,2016-03-13,0
199,4057192912,2016-03-14,8433


In [63]:
activity_clean = activity_clean.set_index('ActivityDate')
activity_clean.head()


Unnamed: 0_level_0,Id,TotalSteps
ActivityDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-03-12,4020332650,5543
2016-03-12,4057192912,0
2016-03-13,4020332650,3226
2016-03-13,4057192912,0
2016-03-14,4057192912,8433


In [64]:
activity_clean = activity_clean.sort_index()
activity_clean.head()


Unnamed: 0_level_0,Id,TotalSteps
ActivityDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-03-12,4020332650,5543
2016-03-12,4057192912,0
2016-03-13,4020332650,3226
2016-03-13,4057192912,0
2016-03-14,4057192912,8433


In [65]:
activity_clean.index.is_monotonic_increasing

True

In [66]:
heart = pd.read_csv("/content/drive/MyDrive/FitPulse_Milestone1/heartrate_seconds_merged.csv")
heart.head()

Unnamed: 0,Id,Time,Value
0,2022484408,4/1/2016 7:54:00 AM,93
1,2022484408,4/1/2016 7:54:05 AM,91
2,2022484408,4/1/2016 7:54:10 AM,96
3,2022484408,4/1/2016 7:54:15 AM,98
4,2022484408,4/1/2016 7:54:20 AM,100


In [67]:
heart.columns

Index(['Id', 'Time', 'Value'], dtype='object')

In [68]:
heart['Time'] = pd.to_datetime(heart['Time'])

In [69]:
heart.dtypes

Unnamed: 0,0
Id,int64
Time,datetime64[ns]
Value,int64


In [70]:
heart = heart.sort_values('Time')
heart.head()

Unnamed: 0,Id,Time,Value
56922,2347167796,2016-03-29 00:00:05,69
56923,2347167796,2016-03-29 00:00:10,68
56924,2347167796,2016-03-29 00:00:20,69
56925,2347167796,2016-03-29 00:00:35,69
56926,2347167796,2016-03-29 00:00:50,69


In [71]:
heart = heart.set_index('Time')

In [72]:
heart.head()

Unnamed: 0_level_0,Id,Value
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-03-29 00:00:05,2347167796,69
2016-03-29 00:00:10,2347167796,68
2016-03-29 00:00:20,2347167796,69
2016-03-29 00:00:35,2347167796,69
2016-03-29 00:00:50,2347167796,69


In [73]:
heart.index.is_monotonic_increasing

True

In [74]:
sleep = pd.read_csv("/content/drive/MyDrive/FitPulse_Milestone1/minuteSleep_merged.csv")


In [75]:
sleep.head()

Unnamed: 0,Id,date,value,logId
0,1503960366,3/13/2016 2:39:30 AM,1,11114919637
1,1503960366,3/13/2016 2:40:30 AM,1,11114919637
2,1503960366,3/13/2016 2:41:30 AM,1,11114919637
3,1503960366,3/13/2016 2:42:30 AM,1,11114919637
4,1503960366,3/13/2016 2:43:30 AM,1,11114919637


In [76]:
sleep.columns

Index(['Id', 'date', 'value', 'logId'], dtype='object')

In [77]:
sleep['date'] = pd.to_datetime(sleep['date'])

In [78]:
sleep.dtypes

Unnamed: 0,0
Id,int64
date,datetime64[ns]
value,int64
logId,int64


In [79]:
sleep = sleep.sort_values('date')
sleep.head()

Unnamed: 0,Id,date,value,logId
119428,5577150313,2016-03-11 21:19:30,1,11109426118
119429,5577150313,2016-03-11 21:20:30,1,11109426118
119430,5577150313,2016-03-11 21:21:30,1,11109426118
119431,5577150313,2016-03-11 21:22:30,1,11109426118
119432,5577150313,2016-03-11 21:23:30,1,11109426118


In [80]:
sleep = sleep.set_index('date')
sleep.index.is_monotonic_increasing

True

In [81]:
heart_daily = heart.groupby(['Id', heart.index.date])['Value'].mean().reset_index()
heart_daily.columns = ['Id', 'Date', 'AvgHeartRate']

In [82]:
heart_daily.head()

Unnamed: 0,Id,Date,AvgHeartRate
0,2022484408,2016-04-01,88.563916
1,2022484408,2016-04-02,72.068685
2,2022484408,2016-04-03,74.398232
3,2022484408,2016-04-04,78.295372
4,2022484408,2016-04-05,83.474328


In [83]:
steps_daily = activity_clean.reset_index()
steps_daily.rename(columns={'ActivityDate': 'Date'}, inplace=True)

In [84]:
steps_daily.head()

Unnamed: 0,Date,Id,TotalSteps
0,2016-03-12,4020332650,5543
1,2016-03-12,4057192912,0
2,2016-03-13,4020332650,3226
3,2016-03-13,4057192912,0
4,2016-03-14,4057192912,8433


In [85]:
sleep_daily = sleep.groupby(['Id', sleep.index.date])['value'].sum().reset_index()
sleep_daily.columns = ['Id', 'Date', 'TotalSleepMinutes']

In [86]:
sleep_daily.head()

Unnamed: 0,Id,Date,TotalSleepMinutes
0,1503960366,2016-03-13,441
1,1503960366,2016-03-14,423
2,1503960366,2016-03-15,365
3,1503960366,2016-03-16,404
4,1503960366,2016-03-17,473


In [87]:
steps_daily['Date'] = pd.to_datetime(steps_daily['Date'])
heart_daily['Date'] = pd.to_datetime(heart_daily['Date'])
sleep_daily['Date'] = pd.to_datetime(sleep_daily['Date'])

In [88]:
merged_1 = pd.merge(
    steps_daily,
    heart_daily,
    on=['Id', 'Date'],
    how='inner'
)

In [89]:
merged_1.head()
merged_1.shape

(143, 4)

In [90]:
final_data = pd.merge(
    merged_1,
    sleep_daily,
    on=['Id', 'Date'],
    how='inner'
)

In [91]:
final_data.head()
final_data.shape

(83, 5)

In [92]:
final_data.to_csv(
    "/content/drive/MyDrive/FitPulse_Milestone1/final_merged_data.csv",
    index=False
)

In [93]:
### Minute-Level Data Preprocessing


In [94]:
# --- Minute-level Heart Rate ---
heart = pd.read_csv("/content/drive/MyDrive/FitPulse_Milestone1/heartrate_seconds_merged.csv")
heart['timestamp'] = pd.to_datetime(heart['Time'])
heart = heart.rename(columns={'Value': 'HeartRate'})
heart = heart[['Id', 'timestamp', 'HeartRate']]

# --- Daily Steps (mapped to minutes) ---
steps_daily = activity[['Id', 'ActivityDate', 'TotalSteps']].copy()
steps_daily['Date'] = pd.to_datetime(steps_daily['ActivityDate'])
steps_daily = steps_daily[['Id', 'Date', 'TotalSteps']]

heart['Date'] = heart['timestamp'].dt.date
heart['Date'] = pd.to_datetime(heart['Date'])

heart_steps = pd.merge(
    heart,
    steps_daily,
    on=['Id', 'Date'],
    how='left'
)

# --- Minute-level Sleep ---
sleep = pd.read_csv("/content/drive/MyDrive/FitPulse_Milestone1/minuteSleep_merged.csv")
sleep['timestamp'] = pd.to_datetime(sleep['date'])
sleep = sleep.rename(columns={'value': 'SleepFlag'})
sleep = sleep[['Id', 'timestamp', 'SleepFlag']]

# --- Final Minute Merge ---
final_minute_data = pd.merge(
    heart_steps,
    sleep,
    on=['Id', 'timestamp'],
    how='left'
)

final_minute_data['SleepFlag'] = final_minute_data['SleepFlag'].fillna(0)


In [95]:
final_minute_data.to_csv(
    "/content/drive/MyDrive/FitPulse_Milestone1/data/minute_level_data.csv",
    index=False
)
