In [74]:
import pandas as pd
import os

# Define file paths
path1 = '../mturkfitbit_export_3.12.16-4.11.16/Fitabase Data 3.12.16-4.11.16/'
path2 = '../mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/'

# List of all unique filenames across both directories
all_files = [
    'dailyActivity_merged.csv',
    'heartrate_seconds_merged.csv',
    'hourlyCalories_merged.csv',
    'hourlyIntensities_merged.csv',
    'sleepDay_merged.csv',
    'weightLogInfo_merged.csv'
]

# Dictionary to store clean dataframes
data = {}

for filename in all_files:
    file_path1 = os.path.join(path1, filename)
    file_path2 = os.path.join(path2, filename)
    
    df_list = []
    
    # Check and load from first directory
    if os.path.exists(file_path1):
        df_list.append(pd.read_csv(file_path1))
        
    # Check and load from second directory
    if os.path.exists(file_path2):
        df_list.append(pd.read_csv(file_path2))
    
    # Merge if data exists
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        
        # Create clean name (remove _merged.csv)
        clean_name = filename.replace('_merged.csv', '')
        data[clean_name] = combined_df
        
        print(f"Loaded {clean_name}: {combined_df.shape}")
    else:
        print(f"Warning: {filename} not found in either directory.")

# Clean up keys for easier access (optional: create individual variables)
# locals().update(data) # Uncomment to create variables like dailyActivity, heartrate_seconds etc.

print("\nAll files loaded successfully into 'data' dictionary.")

Loaded dailyActivity: (1397, 15)
Loaded heartrate_seconds: (3638339, 3)
Loaded hourlyCalories: (46183, 3)
Loaded hourlyIntensities: (46183, 4)
Loaded sleepDay: (413, 5)
Loaded weightLogInfo: (100, 8)

All files loaded successfully into 'data' dictionary.


In [76]:
daily_activity = data['dailyActivity']
daily_activity.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1397 entries, 0 to 1396
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        1397 non-null   int64  
 1   ActivityDate              1397 non-null   object 
 2   TotalSteps                1397 non-null   int64  
 3   TotalDistance             1397 non-null   float64
 4   TrackerDistance           1397 non-null   float64
 5   LoggedActivitiesDistance  1397 non-null   float64
 6   VeryActiveDistance        1397 non-null   float64
 7   ModeratelyActiveDistance  1397 non-null   float64
 8   LightActiveDistance       1397 non-null   float64
 9   SedentaryActiveDistance   1397 non-null   float64
 10  VeryActiveMinutes         1397 non-null   int64  
 11  FairlyActiveMinutes       1397 non-null   int64  
 12  LightlyActiveMinutes      1397 non-null   int64  
 13  SedentaryMinutes          1397 non-null   int64  
 14  Calories

In [77]:
import re
import pandas as pd

def to_snake_case(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


for col in daily_activity.columns:
    daily_activity.rename(columns={col: to_snake_case(col)}, inplace=True)



In [78]:
daily_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1397 entries, 0 to 1396
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          1397 non-null   int64  
 1   activity_date               1397 non-null   object 
 2   total_steps                 1397 non-null   int64  
 3   total_distance              1397 non-null   float64
 4   tracker_distance            1397 non-null   float64
 5   logged_activities_distance  1397 non-null   float64
 6   very_active_distance        1397 non-null   float64
 7   moderately_active_distance  1397 non-null   float64
 8   light_active_distance       1397 non-null   float64
 9   sedentary_active_distance   1397 non-null   float64
 10  very_active_minutes         1397 non-null   int64  
 11  fairly_active_minutes       1397 non-null   int64  
 12  lightly_active_minutes      1397 non-null   int64  
 13  sedentary_minutes           1397 

In [79]:
#Date Parsing & Validation

daily_activity["activity_date"] = pd.to_datetime(daily_activity["activity_date"])
daily_activity ["activity_date"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1397 entries, 0 to 1396
Series name: activity_date
Non-Null Count  Dtype         
--------------  -----         
1397 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 11.0 KB


In [80]:
#Duplicate & Logical Integrity Checks
daily_activity.duplicated().sum()

np.int64(0)

In [81]:
# num of unique rows
daily_activity.nunique()

id                              35
activity_date                   62
total_steps                   1208
total_distance                 781
tracker_distance               780
logged_activities_distance      35
very_active_distance           410
moderately_active_distance     243
light_active_distance          591
sedentary_active_distance       10
very_active_minutes            130
fairly_active_minutes           93
lightly_active_minutes         379
sedentary_minutes              684
calories                       995
dtype: int64

In [82]:
#Data Quality & Anomaly Checks

In [None]:
#Zero-Activity Days
(daily["TotalSteps"] == 0).mean() * 100


NameError: name 'df' is not defined