In [1]:
import numpy as np 
import pandas as pd
import os

In [None]:
# handling fitness dataframe
features_used = [
    "dailyActivity_merged.csv",   # base
    "sleepDay_merged.csv",
    "dailyCalories_merged.csv",
    "dailySteps_merged.csv"
    # "weightLogInfo_merged.csv"
]

file_path = "datasets/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/"

# average out the heartRate per day? 
hr_seconds = pd.read_csv(file_path + "heartrate_seconds_merged.csv")
hr_seconds.head(15)

Unnamed: 0,Id,Time,Value
0,2022484408,4/12/2016 7:21:00 AM,97
1,2022484408,4/12/2016 7:21:05 AM,102
2,2022484408,4/12/2016 7:21:10 AM,105
3,2022484408,4/12/2016 7:21:20 AM,103
4,2022484408,4/12/2016 7:21:25 AM,101
5,2022484408,4/12/2016 7:22:05 AM,95
6,2022484408,4/12/2016 7:22:10 AM,91
7,2022484408,4/12/2016 7:22:15 AM,93
8,2022484408,4/12/2016 7:22:20 AM,94
9,2022484408,4/12/2016 7:22:25 AM,93


In [11]:
hr_seconds.dtypes

Id        int64
Time     object
Value     int64
dtype: object

In [12]:
# converting to daily values 
hr_seconds["Time"] = pd.to_datetime(hr_seconds["Time"])
hr_seconds["Date"] = hr_seconds["Time"].dt.date

# group by id and date and then find the average for each day 
hr_daily_avg = hr_seconds.groupby(['Id', 'Date'])['Value'].mean().reset_index()
hr_daily_avg = hr_daily_avg.rename(columns={'Value': 'AverageHeartRate'})
hr_daily_avg.head(10)

Unnamed: 0,Id,Date,AverageHeartRate
0,2022484408,2016-04-12,75.804177
1,2022484408,2016-04-13,80.337584
2,2022484408,2016-04-14,72.628597
3,2022484408,2016-04-15,80.437382
4,2022484408,2016-04-16,75.960547
5,2022484408,2016-04-17,83.917138
6,2022484408,2016-04-18,82.712829
7,2022484408,2016-04-19,81.954
8,2022484408,2016-04-20,83.44404
9,2022484408,2016-04-21,86.37482


### Left Merge fitness data with all of the other datasets - keeping all rows from the daily activities.

We can change this up if you don't like it

In [13]:
# Load base dataframe
main_df = pd.read_csv(os.path.join(file_path, "dailyActivity_merged.csv"))
main_df['Date'] = pd.to_datetime(main_df['Date']).dt.date


In [14]:
# Merge heart rate first
main_df = pd.merge(main_df, hr_daily_avg, how='left', on=['Id', 'Date'])

# Merge remaining files
for file in features_used[1:]:
    df = pd.read_csv(os.path.join(file_path, file))
    
    # Try to parse date column
    for col in df.columns:
        if 'date' in col.lower():
            try:
                df[col] = pd.to_datetime(df[col]).dt.date
            except:
                pass
    
    # Determine merge keys
    common_cols = list(set(main_df.columns) & set(df.columns))
    if 'Id' in common_cols and len(common_cols) > 1:
        main_df = pd.merge(main_df, df, how='left', on=common_cols)
    else:
        main_df = pd.merge(main_df, df, how='left', on='Id')

# Final dataset
print(main_df.head())

           Id        Date  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366  2016-04-12       13162           8.50             8.50   
1  1503960366  2016-04-13       10735           6.97             6.97   
2  1503960366  2016-04-14       10460           6.74             6.74   
3  1503960366  2016-04-15        9762           6.28             6.28   
4  1503960366  2016-04-16       12669           8.16             8.16   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                1.88                      0.55   
1                       0.0                1.57                      0.69   
2                       0.0                2.44                      0.40   
3                       0.0                2.14                      1.26   
4                       0.0                2.71                      0.41   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0                 6.06      

  df[col] = pd.to_datetime(df[col]).dt.date


In [15]:
print(main_df.isna().sum())

print("Num of rows total:", len(main_df))

Id                            0
Date                          0
TotalSteps                    0
TotalDistance                 0
TrackerDistance               0
LoggedActivitiesDistance      0
VeryActiveDistance            0
ModeratelyActiveDistance      0
LightActiveDistance           0
SedentaryActiveDistance       0
VeryActiveMinutes             0
FairlyActiveMinutes           0
LightlyActiveMinutes          0
SedentaryMinutes              0
Calories                      0
AverageHeartRate            608
TotalSleepRecords           530
TotalMinutesAsleep          530
TotalTimeInBed              530
StepTotal                     0
dtype: int64
Num of rows total: 943


There are a lot of missing data for sleeping records, weight and logging weight. We can modify what we want to do with this dataset but let me know if you don't like this. I think we have enough data to do other things. I also changed the Date columns manually for all of the datasets in the folder so that it would be easier to index. 

### Handling clinical data 

In [18]:
files_used = ["PATIENTS.csv",
              "ADMISSIONS.csv",
              "DIAGNOSES_ICD.csv",
              "D_ICD_DIAGNOSES.csv",
              "LABEVENTS.csv",
              "D_LABITEMS.csv",
              "PRESCRIPTIONS.csv",
              "PROCEDURES_ICD.csv",
              "D_ICD_PROCEDURES.csv",
              "CHARTEVENTS.csv",
              "D_ITEMS.csv"]
file_path = "datasets/mimic-iii-clinical-database-demo-1.4/"

I am also looking at the files that you said we could use and I am seeing a lot of data but I am not quite sure what would be useful since I do not know what half of it is. I think the features from ADMISSIONS, D_ICD_DIAGNOSES, D_ICD_PROCEDURES, and PRESCRIPTIONS could be useful for our project. Let me know what you think.