# Data Preparation

This jupyter notebook comes from [this](https://www.kaggle.com/code/carlafgomes/fitbit-data-analysis-in-python#2-Data-Processing) original notebook Kaggle.  The original Kaggle notebook 
uses as input [this](https://www.kaggle.com/datasets/arashnic/fitbit) dataset at Kaggle. 

Before running this notebook,
you should run `jupyter_notebooks/heartrate_data_thinning.id.ipynb`.

The goal of this notebook is to produce, based on the 
Kaggle dataset, a file for each patient identified by his/her Id. This set 
of patient records is stored in the folder `patient_csv_records`



In [1]:
# this makes sure it starts looking for things from the SentenceAx folder down.
import os
import sys
os.chdir('../')
sys.path.insert(0,os.getcwd())
print(os.getcwd())

C:\Users\rrtuc\Desktop\backed-up\python-projects\CausalFitbit


In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import glob

In [3]:
fnames= ["dailyActivity",
"heartrate_reduced",
"sleepDay",
"weightLogInfo"]

def get_csv_path(fname):
 return f"input_data/Fitabase Data 4.12.16-5.12.16/{fname}_merged.csv"

dfs = [pd.read_csv(get_csv_path(fname)) for fname in fnames]

In [4]:
# number of unique Id's
for i in range(4):
    print(fnames[i], dfs[i].Id.nunique())

dailyActivity 33
heartrate_reduced 14
sleepDay 24
weightLogInfo 8


In [5]:
# column names
for i in range(4):
    print(f"*******{fnames[i]}\n", dfs[i].columns)

*******dailyActivity
 Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
       'LoggedActivitiesDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
      dtype='object')
*******heartrate_reduced
 Index(['Id', 'Time', 'MinuteAverage', 'HourlyAverage'], dtype='object')
*******sleepDay
 Index(['Id', 'SleepDay', 'TotalSleepRecords', 'TotalMinutesAsleep',
       'TotalTimeInBed'],
      dtype='object')
*******weightLogInfo
 Index(['Id', 'Date', 'WeightKg', 'WeightPounds', 'Fat', 'BMI',
       'IsManualReport', 'LogId'],
      dtype='object')


In [6]:
# check for missing values
for i in range(4):
    print(f"*******{fnames[i]}\n", dfs[i].isnull().sum())

*******dailyActivity
 Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64
*******heartrate_reduced
 Id               0
Time             0
MinuteAverage    0
HourlyAverage    0
dtype: int64
*******sleepDay
 Id                    0
SleepDay              0
TotalSleepRecords     0
TotalMinutesAsleep    0
TotalTimeInBed        0
dtype: int64
*******weightLogInfo
 Id                 0
Date               0
WeightKg           0
WeightPounds       0
Fat               65
BMI                0
IsManualReport     0
LogId              0
dtype: int64


In [7]:
total_rows_fat = len(dfs[3]['Fat'])

print(f"Total number of rows in the 'Fat' column: {total_rows_fat}")

Total number of rows in the 'Fat' column: 67


In [8]:
# fnames= ["dailyActivity",
# "heartrate_seconds",
# "sleepDay",
# "weightLogInfo"]

# Since the 'Fat' columns has 67 rows, 
# and 65 of them are missing value, it will be dropped
dfs[3] = dfs[3].drop(columns=['Fat'])

# Also drop the columns that will not be needed

dfs[0] = dfs[0].drop(
    columns=['LoggedActivitiesDistance', 'TrackerDistance'])
dfs[2] = dfs[2].drop(columns=['TotalSleepRecords'])
dfs[3] = dfs[3].drop(columns=['IsManualReport', 'LogId'])

In [9]:
# Check the dataframe shapes

for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].shape)

*******dailyActivity
 (940, 13)
*******heartrate_reduced
 (3332, 4)
*******sleepDay
 (413, 4)
*******weightLogInfo
 (67, 5)


In [10]:
# add new columns to sleepDay
dfs[2]['TotalHoursAsleep'] = dfs[2]['TotalMinutesAsleep'] / 60
dfs[2]['TotalTimeInBedHour'] = dfs[2]['TotalTimeInBed'] / 60
dfs[2].rename(columns={'TotalTimeInBed': 'TotalTimeInBedMin'}, inplace=True)

dfs[2].columns

Index(['Id', 'SleepDay', 'TotalMinutesAsleep', 'TotalTimeInBedMin',
       'TotalHoursAsleep', 'TotalTimeInBedHour'],
      dtype='object')

In [11]:
# add new columns to dailyActivity
dfs[0]["TotalActiveMinutes"] = dfs[0]["VeryActiveMinutes"] + dfs[0]["FairlyActiveMinutes"] + dfs[0]["LightlyActiveMinutes"]
dfs[0]["TotalMinutes"] = dfs[0]["TotalActiveMinutes"] + dfs[0]["SedentaryMinutes"]
dfs[0]["TotalActiveHours"] = round(dfs[0]["TotalActiveMinutes"] / 60)


In [12]:
dfs[0].head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,TotalActiveMinutes,TotalMinutes,TotalActiveHours
0,1503960366,4/12/2016,13162,8.5,1.88,0.55,6.06,0.0,25,13,328,728,1985,366,1094,6.0
1,1503960366,4/13/2016,10735,6.97,1.57,0.69,4.71,0.0,21,19,217,776,1797,257,1033,4.0
2,1503960366,4/14/2016,10460,6.74,2.44,0.4,3.91,0.0,30,11,181,1218,1776,222,1440,4.0
3,1503960366,4/15/2016,9762,6.28,2.14,1.26,2.83,0.0,29,34,209,726,1745,272,998,5.0
4,1503960366,4/16/2016,12669,8.16,2.71,0.41,5.04,0.0,36,10,221,773,1863,267,1040,4.0


In [13]:
# datatypes
for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].dtypes)

*******dailyActivity
 Id                            int64
ActivityDate                 object
TotalSteps                    int64
TotalDistance               float64
VeryActiveDistance          float64
ModeratelyActiveDistance    float64
LightActiveDistance         float64
SedentaryActiveDistance     float64
VeryActiveMinutes             int64
FairlyActiveMinutes           int64
LightlyActiveMinutes          int64
SedentaryMinutes              int64
Calories                      int64
TotalActiveMinutes            int64
TotalMinutes                  int64
TotalActiveHours            float64
dtype: object
*******heartrate_reduced
 Id                 int64
Time              object
MinuteAverage    float64
HourlyAverage    float64
dtype: object
*******sleepDay
 Id                      int64
SleepDay               object
TotalMinutesAsleep      int64
TotalTimeInBedMin       int64
TotalHoursAsleep      float64
TotalTimeInBedHour    float64
dtype: object
*******weightLogInfo
 Id             

In [14]:
dfs[3].columns

Index(['Id', 'Date', 'WeightKg', 'WeightPounds', 'BMI'], dtype='object')

In [27]:
# Transform Date to DateTime data type
time_labels = ["ActivityDate", "Time", "SleepDay", "Date"]
for i in range(4): 
    str0 = time_labels[i]
    dfs[i][str0] = pd.to_datetime(dfs[i][str0])


In [16]:
# datatypes
for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].dtypes)

*******dailyActivity
 Id                                   int64
ActivityDate                datetime64[ns]
TotalSteps                           int64
TotalDistance                      float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
TotalActiveMinutes                   int64
TotalMinutes                         int64
TotalActiveHours                   float64
dtype: object
*******heartrate_reduced
 Id                        int64
Time             datetime64[ns]
MinuteAverage           float64
HourlyAverage           float64
dtype: object
*******sleepDay
 Id                             int64
SleepDay              datetime64[ns]
TotalMinutesAslee

In [25]:
# check for duplicates
for i in range(4):
    print(f"*******{fnames[i]}") 
    print(dfs[i].duplicated().any())
          

*******dailyActivity
False
*******heartrate_reduced
False
*******sleepDay
True
*******weightLogInfo
False
*******dailyActivity
Id                          False
ActivityDate                False
TotalSteps                  False
TotalDistance               False
VeryActiveDistance          False
ModeratelyActiveDistance    False
LightActiveDistance         False
SedentaryActiveDistance     False
VeryActiveMinutes           False
FairlyActiveMinutes         False
LightlyActiveMinutes        False
SedentaryMinutes            False
Calories                    False
TotalActiveMinutes          False
TotalMinutes                False
TotalActiveHours            False
dtype: bool
*******heartrate_reduced
Id               False
Time             False
MinuteAverage    False
HourlyAverage    False
dtype: bool
*******sleepDay
Id                    False
SleepDay              False
TotalMinutesAsleep    False
TotalTimeInBedMin     False
TotalHoursAsleep      False
TotalTimeInBedHour    False
dtyp

In [26]:
# check for nulls
for i in range(4):
    print(f"*******{fnames[i]}") 
    print(dfs[i].isnull().any())
          

*******dailyActivity
Id                          False
ActivityDate                False
TotalSteps                  False
TotalDistance               False
VeryActiveDistance          False
ModeratelyActiveDistance    False
LightActiveDistance         False
SedentaryActiveDistance     False
VeryActiveMinutes           False
FairlyActiveMinutes         False
LightlyActiveMinutes        False
SedentaryMinutes            False
Calories                    False
TotalActiveMinutes          False
TotalMinutes                False
TotalActiveHours            False
dtype: bool
*******heartrate_reduced
Id               False
Time             False
MinuteAverage    False
HourlyAverage    False
dtype: bool
*******sleepDay
Id                    False
SleepDay              False
TotalMinutesAsleep    False
TotalTimeInBedMin     False
TotalHoursAsleep      False
TotalTimeInBedHour    False
dtype: bool
*******weightLogInfo
Id              False
Date            False
WeightKg        False
WeightPound

In [28]:
# renaming time columns all by same name
time_labels = ["ActivityDate", "Time", "SleepDay", "Date"]
for i in range(4): 
    str0 = time_labels[i]
for i in range(4):
    dfs[i] = dfs[i].rename(columns={str0: 'ActivityDate'})


In [29]:
# Step 1: Concatenate the DataFrames vertically to combine all the data
combined_df = pd.concat(dfs, ignore_index=True)

# Step 2: Sort the combined DataFrame by "ActivityDate" 
combined_df.sort_values(by='ActivityDate', inplace=True)

# Step 3: Merge the data using forward fill (ffill) to 
# fill NaN values in subsequent columns
combined_df.ffill(inplace=True)

print(combined_df.head())
combined_df.columns

             Id ActivityDate  TotalSteps  TotalDistance  VeryActiveDistance  \
0    1503960366   2016-04-12     13162.0           8.50                1.88   
737  7086361926   2016-04-12     11317.0           8.41                5.27   
711  7007744171   2016-04-12     14172.0          10.29                4.50   
680  6962181067   2016-04-12     10199.0           6.74                3.40   
654  6775888955   2016-04-12         0.0           0.00                0.00   

     ModeratelyActiveDistance  LightActiveDistance  SedentaryActiveDistance  \
0                        0.55                 6.06                      0.0   
737                      0.15                 2.97                      0.0   
711                      0.38                 5.41                      0.0   
680                      0.83                 2.51                      0.0   
654                      0.00                 0.00                      0.0   

     VeryActiveMinutes  FairlyActiveMinutes  ...  

Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance',
       'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories',
       'TotalActiveMinutes', 'TotalMinutes', 'TotalActiveHours', 'Time',
       'MinuteAverage', 'HourlyAverage', 'SleepDay', 'TotalMinutesAsleep',
       'TotalTimeInBedMin', 'TotalHoursAsleep', 'TotalTimeInBedHour',
       'WeightKg', 'WeightPounds', 'BMI'],
      dtype='object')

In [30]:
combined_df.shape

(4752, 27)

In [35]:
group_dict = combined_df.groupby('Id')

for patient_id, patient_df in group_dict:
    path = f"patient_csv_records/patient_{patient_id}.csv"
    patient_df.to_csv(path, index=False)
    print(patient_id, patient_df.shape)

1503960366 (58, 27)
1624580081 (31, 27)
1644430081 (34, 27)
1844505072 (34, 27)
1927972279 (37, 27)
2022484408 (246, 27)
2026352035 (64, 27)
2320127002 (32, 27)
2347167796 (274, 27)
2873212765 (33, 27)
3372868164 (20, 27)
3977333714 (58, 27)
4020332650 (119, 27)
4057192912 (4, 27)
4319703577 (59, 27)
4388161847 (514, 27)
4445114986 (59, 27)
4558609924 (333, 27)
4702921684 (59, 27)
5553957443 (467, 27)
5577150313 (434, 27)
6117666160 (326, 27)
6290855005 (29, 27)
6775888955 (88, 27)
6962181067 (513, 27)
7007744171 (243, 27)
7086361926 (55, 27)
8053475328 (34, 27)
8253242879 (19, 27)
8378563200 (63, 27)
8583815059 (31, 27)
8792009665 (265, 27)
8877689391 (117, 27)
