# Data Preparation

This jupyter notebook comes from [this](https://www.kaggle.com/code/carlafgomes/fitbit-data-analysis-in-python#2-Data-Processing) original notebook Kaggle.  The original Kaggle notebook 
uses as input [this](https://www.kaggle.com/datasets/arashnic/fitbit) dataset at Kaggle. 

Before running this notebook,
you should run `jupyter_notebooks/heartrate_data_thinning.id.ipynb`.

The goal of this notebook is to produce, based on the 
Kaggle dataset, a file for each patient identified by his/her Id. This set 
of patient records is stored in the folder `patient_csv_records`



In [1]:
# this makes sure it starts looking for things from the SentenceAx folder down.
import os
import sys
os.chdir('../')
sys.path.insert(0,os.getcwd())
print(os.getcwd())

C:\Users\rrtuc\Desktop\backed-up\python-projects\CausalFitbit


In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import glob

In [3]:
fnames= ["dailyActivity",
"heartrate_reduced",
"sleepDay",
"weightLogInfo"]

def get_csv_path(fname):
 return f"input_data/Fitabase Data 4.12.16-5.12.16/{fname}_merged.csv"

dfs = [pd.read_csv(get_csv_path(fname)) for fname in fnames]

In [4]:
# number of unique Id's
for i in range(4):
    print(fnames[i], dfs[i].Id.nunique())

dailyActivity 33
heartrate_reduced 14
sleepDay 24
weightLogInfo 8


In [5]:
# column names
for i in range(4):
    print(f"*******{fnames[i]}\n", dfs[i].columns)

*******dailyActivity
 Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
       'LoggedActivitiesDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
      dtype='object')
*******heartrate_reduced
 Index(['Id', 'Time', 'PulseHourlyAverage'], dtype='object')
*******sleepDay
 Index(['Id', 'SleepDay', 'TotalSleepRecords', 'TotalMinutesAsleep',
       'TotalTimeInBed'],
      dtype='object')
*******weightLogInfo
 Index(['Id', 'Date', 'WeightKg', 'WeightPounds', 'Fat', 'BMI',
       'IsManualReport', 'LogId'],
      dtype='object')


In [6]:
# check for missing values
for i in range(4):
    print(f"*******{fnames[i]}\n", dfs[i].isnull().sum())

*******dailyActivity
 Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64
*******heartrate_reduced
 Id                    0
Time                  0
PulseHourlyAverage    0
dtype: int64
*******sleepDay
 Id                    0
SleepDay              0
TotalSleepRecords     0
TotalMinutesAsleep    0
TotalTimeInBed        0
dtype: int64
*******weightLogInfo
 Id                 0
Date               0
WeightKg           0
WeightPounds       0
Fat               65
BMI                0
IsManualReport     0
LogId              0
dtype: int64


In [7]:
total_rows_fat = len(dfs[3]['Fat'])

print(f"Total number of rows in the 'Fat' column: {total_rows_fat}")

Total number of rows in the 'Fat' column: 67


In [8]:
# fnames= ["dailyActivity",
# "heartrate_seconds",
# "sleepDay",
# "weightLogInfo"]

# Since the 'Fat' columns has 67 rows, 
# and 65 of them are missing value, it will be dropped
dfs[3] = dfs[3].drop(columns=['Fat'])

# Also drop the columns that will not be needed
dfs[0] = dfs[0].drop(
    columns=['LoggedActivitiesDistance', 'TrackerDistance'])
dfs[2] = dfs[2].drop(columns=['TotalSleepRecords'])
dfs[3] = dfs[3].drop(columns=['IsManualReport', 'LogId'])

In [9]:
# Check the dataframe shapes

for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].shape)

*******dailyActivity
 (940, 13)
*******heartrate_reduced
 (3332, 3)
*******sleepDay
 (413, 4)
*******weightLogInfo
 (67, 5)


In [10]:
# add new columns to sleepDay
dfs[2]['HoursAsleep'] = dfs[2]['TotalMinutesAsleep'] / 60
dfs[2]['HoursInBed'] = dfs[2]['TotalTimeInBed'] / 60

dfs[2] = dfs[2].drop(columns=['TotalMinutesAsleep'])
dfs[2] = dfs[2].drop(columns=['TotalTimeInBed'])

dfs[2].columns

Index(['Id', 'SleepDay', 'HoursAsleep', 'HoursInBed'], dtype='object')

In [11]:
dfs[0].head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [12]:
# datatypes
for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].dtypes)

*******dailyActivity
 Id                            int64
ActivityDate                 object
TotalSteps                    int64
TotalDistance               float64
VeryActiveDistance          float64
ModeratelyActiveDistance    float64
LightActiveDistance         float64
SedentaryActiveDistance     float64
VeryActiveMinutes             int64
FairlyActiveMinutes           int64
LightlyActiveMinutes          int64
SedentaryMinutes              int64
Calories                      int64
dtype: object
*******heartrate_reduced
 Id                      int64
Time                   object
PulseHourlyAverage    float64
dtype: object
*******sleepDay
 Id               int64
SleepDay        object
HoursAsleep    float64
HoursInBed     float64
dtype: object
*******weightLogInfo
 Id                int64
Date             object
WeightKg        float64
WeightPounds    float64
BMI             float64
dtype: object


In [13]:
dfs[3].columns

Index(['Id', 'Date', 'WeightKg', 'WeightPounds', 'BMI'], dtype='object')

In [14]:
# Transform Date to DateTime data type
time_labels = ["ActivityDate", "Time", "SleepDay", "Date"]
for i in range(4): 
    str0 = time_labels[i]
    dfs[i][str0] = pd.to_datetime(dfs[i][str0])


In [15]:
# datatypes
for i in range(4):   
    print(f"*******{fnames[i]}\n", dfs[i].dtypes)

*******dailyActivity
 Id                                   int64
ActivityDate                datetime64[ns]
TotalSteps                           int64
TotalDistance                      float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
dtype: object
*******heartrate_reduced
 Id                             int64
Time                  datetime64[ns]
PulseHourlyAverage           float64
dtype: object
*******sleepDay
 Id                      int64
SleepDay       datetime64[ns]
HoursAsleep           float64
HoursInBed            float64
dtype: object
*******weightLogInfo
 Id                       int64
Date            datetime64[ns]
WeightKg           

In [16]:
# check for duplicates
for i in range(4):
    print(f"*******{fnames[i]}") 
    print(dfs[i].duplicated().any())
          

*******dailyActivity
False
*******heartrate_reduced
False
*******sleepDay
True
*******weightLogInfo
False


In [17]:
# check for nulls
for i in range(4):
    print(f"*******{fnames[i]}") 
    print(dfs[i].isnull().any())
          

*******dailyActivity
Id                          False
ActivityDate                False
TotalSteps                  False
TotalDistance               False
VeryActiveDistance          False
ModeratelyActiveDistance    False
LightActiveDistance         False
SedentaryActiveDistance     False
VeryActiveMinutes           False
FairlyActiveMinutes         False
LightlyActiveMinutes        False
SedentaryMinutes            False
Calories                    False
dtype: bool
*******heartrate_reduced
Id                    False
Time                  False
PulseHourlyAverage    False
dtype: bool
*******sleepDay
Id             False
SleepDay       False
HoursAsleep    False
HoursInBed     False
dtype: bool
*******weightLogInfo
Id              False
Date            False
WeightKg        False
WeightPounds    False
BMI             False
dtype: bool


In [18]:
# renaming time columns all by same name
time_col_labels = ["ActivityDate", "Time", "SleepDay", "Date"]
for i in range(4): 
    str0 = time_col_labels[i]
    dfs[i] = dfs[i].rename(columns={str0: 'ActivityDate'})

In [19]:
# Step 1: Concatenate the DataFrames vertically to combine all the data
combined_df = pd.concat(dfs, ignore_index=True)

# Step 2: Sort the combined DataFrame by "ActivityDate" 
combined_df.sort_values(by='ActivityDate', inplace=True)

# Step 3: Merge the data using forward fill (ffill) to 
# fill NaN values in subsequent columns
combined_df.ffill(inplace=True)

print(combined_df.head())
combined_df.columns

              Id ActivityDate  TotalSteps  TotalDistance  VeryActiveDistance  \
0     1503960366   2016-04-12     13162.0           8.50                1.88   
4611  7086361926   2016-04-12     13162.0           8.50                1.88   
536   5553957443   2016-04-12     11596.0           7.57                1.37   
505   4702921684   2016-04-12      7213.0           5.88                0.00   
474   4558609924   2016-04-12      5135.0           3.39                0.00   

      ModeratelyActiveDistance  LightActiveDistance  SedentaryActiveDistance  \
0                         0.55                 6.06                      0.0   
4611                      0.55                 6.06                      0.0   
536                       0.79                 5.41                      0.0   
505                       0.00                 5.85                      0.0   
474                       0.00                 3.39                      0.0   

      VeryActiveMinutes  FairlyActiveM

Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance',
       'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories',
       'PulseHourlyAverage', 'HoursAsleep', 'HoursInBed', 'WeightKg',
       'WeightPounds', 'BMI'],
      dtype='object')

In [20]:
combined_df.shape

(4752, 19)

In [21]:
group_dict = combined_df.groupby('Id')

for patient_id, patient_df in group_dict:
    path = f"patient_csv_records/patient_{patient_id}.csv"
    patient_df.to_csv(path, index=False)
    print(patient_id, patient_df.shape)

1503960366 (58, 19)
1624580081 (31, 19)
1644430081 (34, 19)
1844505072 (34, 19)
1927972279 (37, 19)
2022484408 (246, 19)
2026352035 (64, 19)
2320127002 (32, 19)
2347167796 (274, 19)
2873212765 (33, 19)
3372868164 (20, 19)
3977333714 (58, 19)
4020332650 (119, 19)
4057192912 (4, 19)
4319703577 (59, 19)
4388161847 (514, 19)
4445114986 (59, 19)
4558609924 (333, 19)
4702921684 (59, 19)
5553957443 (467, 19)
5577150313 (434, 19)
6117666160 (326, 19)
6290855005 (29, 19)
6775888955 (88, 19)
6962181067 (513, 19)
7007744171 (243, 19)
7086361926 (55, 19)
8053475328 (34, 19)
8253242879 (19, 19)
8378563200 (63, 19)
8583815059 (31, 19)
8792009665 (265, 19)
8877689391 (117, 19)
