 <h1 style="text-align: center;"><b>1. Data set clean up and pre-processing</b></h1>

In [219]:
import glob
import os
import pandas as pd
from datetime import timedelta

In [220]:
# Get all csv files from folder
files = glob.glob("/Users/venmeen/Downloads/HUPA-UC Diabetes Dataset/*.csv")

# Reading demographic patient file
demographic_df = pd.read_csv("/Users/venmeen/Downloads/HUPA-UC Diabetes Dataset/T1DM_patient_sleep_demographics_with_race.csv")

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <style>
    body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; line-height: 1.6; margin: 2rem; color: #222; }
    h1, h2, h3 { line-height: 1.25; }
    h1 { font-size: 1.8rem; margin-bottom: 0.25rem; }
    h2 { font-size: 1.3rem; margin-top: 1.75rem; }
    h3 { font-size: 1.1rem; margin-top: 1rem; }
    ul { margin: 0.5rem 0 0.75rem 1.25rem; }
    .calc { background: #fafafa; border: 1px solid #eee; padding: 0.75rem 1rem; border-radius: 8px; }
    .eq { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace; }
    strong { font-weight: 600; }
  </style>
</head>
<body>
  <h1>Dataset Description (Refined From the given description)</h1>

  <p>
    This dataset provides a collection of <strong>Continuous Glucose Monitoring (CGM) data</strong>, insulin dose administration, meal ingestion
    (counted in carbohydrate grams), steps, calories burned, heart rate, and sleep quality and quantity assessments, acquired from
    <strong>25 individuals with type 1 diabetes mellitus (T1DM)</strong>.
  </p>

  <h2>Data Acquisition</h2>
  <ul>
    <li><strong>CGM data</strong> was collected using <strong>FreeStyle Libre 2</strong> devices.</li>
    <li><strong>Fitbit Ionic smartwatches</strong> were used to capture steps, calories, heart rate, and sleep data.</li>
    <li><strong>Sampling frequency:</strong> Data was recorded in <strong>5-minute intervals</strong> for a <strong>minimum of 14 days per patient</strong>.</li>
  </ul>

  <h2>Expected Record Count</h2>
  <p>If the dataset were provided strictly for a 14-day sample, the expected number of records per patient can be calculated as follows:</p>

<h4> Number of 5-minute intervals per hour: 60 minutes/hour ÷ 5 minutes/interval = 12 intervals/hour </h4>
    
<h4> Number of intervals per day: 12 intervals/hour × 24 hours/day = 288 intervals/day </h4>

<h4> Number of intervals for 14 days: 288 intervals/day × 14 days = 4032 </h4>
  

  <p>Thus, each patient should ideally have <strong>4032 records</strong> in a 14-day dataset.</p>

  <h2>Dataset Observations</h2>

  <h3>Duplicate checks</h3>
  <ul>
    <li>Suspected duplicate records were checked using <code>patient_id</code> and <code>time</code>.</li>
    <li>No duplicates were found.</li>
    <li>Further checks confirmed that every record was spaced at exactly <strong>5-minute intervals</strong> with no missing entries for any patient.</li>
  </ul>
    
  <h3>Variation in data availability</h3>
  <ul>
    <li>Some patients had <strong>very large datasets</strong>, spanning several months (hundreds of thousands of records).</li>
    <li>Some patients had <strong>fewer than 14 days</strong> of data (e.g., 9–13 days).</li>
  </ul>

  <h3>Standardization approach</h3>
  <ul>
    <li>For patients with <strong>&lt;14</strong> or exactly <strong>14–15 days</strong> of data, we retained their records as-is.</li>
    <li>For patients with longer datasets, we extracted the <strong>most recent 15 days of data</strong> from the last available timestamp to ensure consistency.</li>
  </ul>

</body>
</html>


### Merging 25 patient data files as one file 
#### Reasoning: Merging files together will help for group analysis like finiding glucose trends and other general pattern across patients
#### And also ensuring the time interval between each record per patient is exactly 5 mins

In [223]:

def merge_rawfiles():
    df_files = []
    all_good=True
    for file in files:
        filename = os.path.splitext(os.path.basename(file))[0]
        if not filename.startswith("HUPA"):
            continue

        df = pd.read_csv(file, sep=";")

        # Uniform column header
        df.columns = df.columns.str.strip().str.lower()

        # Convert 'time' to datetime (force errors if invalid)
        df['time'] = pd.to_datetime(df['time'], errors='raise')

        # Remove duplicates based on time
        df.drop_duplicates(subset=["time"], inplace=True)

        # Sort by time
        df = df.sort_values('time').reset_index(drop=True)

        # Calculate time differences
        df['time_diff'] = df['time'].diff()

        # Find gaps larger than 5 minutes
        gaps = df[(df['time_diff'] != timedelta(minutes=5)) & (df.index > 0)]
        
        if not gaps.empty:
            all_good = False
            print(f" {filename} has missing 5-minute intervals")
            for idx in gaps.index:
                prev_time = df.loc[idx-1, 'time'] if idx > 0 else None
                curr_time = df.loc[idx, 'time']
                diff = df.loc[idx, 'time_diff']
                print(f"Previous: {prev_time}, Current: {curr_time}, Gap: {diff}")

        # Add patient_id
        df["patient_id"] = filename

        df_files.append(df)
    
    if all_good:
        print("All files have exact 5-minute intervals without missing records.")


    # Merge all patient files
    df_merged = pd.concat(df_files, ignore_index=True)
    df_merged.to_csv("mergedraw_file.csv", index=False)
    return df_merged


In [224]:
df = merge_rawfiles()

All files have exact 5-minute intervals without missing records.


<h3> Checking for 14 days record count for each patient </h3>

In [226]:
expected_records = 4032
# Count actual records per patient
record_count = df.groupby("patient_id").size().reset_index(name="Actual_Records")

# Add expected & difference columns
record_count["Expected_Records"] = expected_records
record_count["Missing_Records"] = record_count["Actual_Records"] - record_count["Expected_Records"]

print(record_count)

   patient_id  Actual_Records  Expected_Records  Missing_Records
0   HUPA0001P            4096              4032               64
1   HUPA0002P            3181              4032             -851
2   HUPA0003P            3770              4032             -262
3   HUPA0004P            3184              4032             -848
4   HUPA0005P            3858              4032             -174
5   HUPA0006P            2290              4032            -1742
6   HUPA0007P            3857              4032             -175
7   HUPA0009P            3812              4032             -220
8   HUPA0010P            2976              4032            -1056
9   HUPA0011P            3839              4032             -193
10  HUPA0014P            3829              4032             -203
11  HUPA0015P            3792              4032             -240
12  HUPA0016P            3835              4032             -197
13  HUPA0017P            3599              4032             -433
14  HUPA0018P            

In [227]:
def filter_14daysSample_files():
#  filter raw data file to reduce to 14 days sample
    df_files = []
    for file in files:

        filename = os.path.splitext(os.path.basename(file))[0]
        if not filename.startswith("HUPA"):
            continue
        df = pd.read_csv(file,sep=";") 

        # Uniform column header in all files
        df.columns = df.columns.str.strip().str.lower()

        # Removing duplicate rows - if more than one record has the same time for single patient, then it is considered as duplicate.
        df.drop_duplicates(subset=["time"], inplace= True)

        # Make sure time as Date time
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

        # check if patient has more than 15 days of data
        max_date = df['time'].max()
        min_date = df['time'].min()
        total_days = (max_date - min_date).days
        total_days = df['time'].dt.date.nunique()
        
        if total_days > 15:
            # filter only last 15 days
            cutoff = max_date - timedelta(days=15)
            df = df[df['time'] > cutoff]
    
        #Add patient_id since we are merging all files together
        df["patient_id"] = filename
        df_files.append(df)

    # Merge the patient data files
    df = pd.concat(df_files, ignore_index=True)

    # Saving it as Single Merged csv file
    df.to_csv("mergedraw_file.csv", index=False)
    return df

In [228]:
df = filter_14daysSample_files()

### Verying data by checking for column's data types,null values and Nan values

In [230]:
def verify_data():
    print("\033[1mDataFrame's Information:\033[0m\n")
    print(df.info())
    print("\033[0m\nNull Value Count:\033[0m\n",df.isnull().sum())
    print("\033[0m\nNan Values:\033[0m\n",df.isna().sum())
    print("\033[0m\nNumber of rows and cols:\033[0m\n",df.shape)
    print("\033[0m\nDescription of DataFrame :\033[0m\n")
    print(df.describe)

### Display Raw Merged File Information as it is

In [232]:
print("\033[1mRaw Merged Data Info:\033[0m\n")
verify_data()

[1mRaw Merged Data Info:[0m

[1mDataFrame's Information:[0m

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90539 entries, 0 to 90538
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   time                    90539 non-null  datetime64[ns]
 1   glucose                 90539 non-null  float64       
 2   calories                90539 non-null  float64       
 3   heart_rate              90539 non-null  float64       
 4   steps                   90539 non-null  float64       
 5   basal_rate              90539 non-null  float64       
 6   bolus_volume_delivered  90539 non-null  float64       
 7   carb_input              90539 non-null  float64       
 8   patient_id              90539 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 6.2+ MB
None
[0m
Null Value Count:[0m
 time                      0
glucose                   0
calorie

### Ensure the columns DataTypes

In [234]:
def ensure_col_dtype():

    # Collecting column names in one list
    cols_dtype = ['glucose', 'calories', 'heart_rate','steps','basal_rate','bolus_volume_delivered','carb_input']

    #Ensure all the columns in list are in numeric
    df[cols_dtype] = df[cols_dtype].apply(pd.to_numeric, errors='coerce')

    #Setting patient_id column as string
    df['patient_id'] = df['patient_id'].astype("string")

    #Setting time column as datetime
    df["time"] = pd.to_datetime(df["time"],errors="coerce")
    return df

### Standardize numeric column values 
#### Rounding the float value to 3 decimal numbers for clarity and usability

In [236]:
def standardize_numeric_cols():
    stdardize_cols = ['glucose', 'calories', 'heart_rate']
    df[stdardize_cols] = df[stdardize_cols].round(3)
    return df

<h4> Check for negative values in numeric columns </h4>
<font face="TimesNewRoman">To treat the negative values as error</font>

In [238]:
def check_negative_values():
    columns = ['glucose', 'calories', 'heart_rate', 'steps', 'basal_rate', 'bolus_volume_delivered', 'carb_input']
    col_negative_values = (df[columns] < 0).any()
    print(col_negative_values)

In [239]:
df = standardize_numeric_cols()
df = ensure_col_dtype()
print("\033[1mChecking for negative values in columns:\033[0m\n")
check_negative_values()

[1mChecking for negative values in columns:[0m

glucose                   False
calories                  False
heart_rate                False
steps                     False
basal_rate                False
bolus_volume_delivered     True
carb_input                False
dtype: bool


<h4>Setting up glucose range</h4>
Glucose range provides framework to evaluate a patient's blood sugar control

In [241]:
def classify_by_glucose_value(value):
    if pd.isna(value): 
        return "NA"
    if value < 40 or value > 500:
        return "Invalid"
    elif value < 70:
        return "Below Range"
    elif value > 180:
        return "Above Range"
    else:
        return "In Range"
        
def set_glucose_range(df):
    df["glucose_range_level"] = df["glucose"].apply(classify_by_glucose_value)

<h4>Setting up Calories Burned into categories for group by Analysis</h4>
<h4><font face="TimeNewRoman">Classifying the burned calories data as (0-4 calories burned)'Resting', (5-19 calories burned)'Light Activity', (20-34 calories burned)'Moderate Activity', (35-50 calories burned)'Intense activity', (50-59 calories burned)'Very Extreme/SPIKE' and more then 60 is considered as Error since this is 5 min interval data</font></h4>

In [243]:
def set_caloriesburned_categories(value):
    if value < 5:
        return "Resting"
    elif 5 <= value <20:
        return "Light Activity"
    elif 20 <= value <35:
        return "Moderate Activity"
    elif 35 <= value <50:
        return "Intense activity"
    elif 50 <= value <60:
        return "Very Extreme/SPIKE"
    elif value >60:
        return "ERROR"
def calories_categories(df):
    df['calories_categories'] = df['calories'].apply(set_caloriesburned_categories)

In [244]:
set_glucose_range(df)
calories_categories(df)

### Heart Rate column pre-processing

#### Reasoning: HR Range validation and categorizing helps for grouped analysis

In [246]:
def hr_category(hr):

    # To HR Range validation and categorizing
    if  (hr >= 40) & (hr <= 100): 
        return 'Normal'
         
    elif  (hr >= 101) & (hr <= 180):
      return 'Active'
    
    elif (hr >= 180) & (hr < 220):
       return 'High'

    else: 
        return 'Abnormal'

#### Adding derived column 'HR_Category' to the dataset for HR range categorization

In [248]:

# Adding 'HR_Category' column
df['HR_Category'] = df['heart_rate'].apply(hr_category)


### Steps column validation 

#### Reasoning: Validation of steps column range between 0 - 1500 to investigate extreme outliers and negative values.

In [250]:
df[(df['steps'] < 0) | (df['steps'] > 1500)]['steps'].sum()

0.0

### Carb_input validation

#### Reasoning: Validation of carb_input column to investigate extreme outliers and negative values. 

In [252]:
df[(df['carb_input'] < 0) | (df['carb_input'] > 150)]['carb_input'].sum()

0.0

### Basal_rate validation

#### Reasoning: Validation of basal_rate column to investigate extreme outliers and negative values.

In [254]:
print(df[df['basal_rate'] < 0])

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id, glucose_range_level, calories_categories, HR_Category]
Index: []


### Bolus_volume_delivered validation

#### Reasoning: Validation of bolus_volume_delivered column to investigate extreme outliers and negative values.

In [256]:
print(df[df['bolus_volume_delivered']<0].groupby(by='patient_id').value_counts())

patient_id  time                 glucose  calories  heart_rate  steps  basal_rate  bolus_volume_delivered  carb_input  glucose_range_level  calories_categories  HR_Category
HUPA0017P   2019-03-29 15:00:00  103.0    6.489     84.179      0.0    0.059       -1.0                    7.0         In Range             Light Activity       Normal         1
            2019-03-31 14:30:00  84.0     4.771     98.220      0.0    0.059       -1.0                    4.0         In Range             Resting              Normal         1
            2019-04-06 12:45:00  134.0    14.601    100.231     84.0   0.059       -3.0                    4.0         In Range             Light Activity       Abnormal       1
            2019-04-07 13:15:00  97.0     4.771     98.324      0.0    0.059       -1.0                    2.5         In Range             Resting              Normal         1
Name: count, dtype: int64


#### Reasoning: Treating all negative bolus_volume_delivered to 0 helps to maintain clean data 

In [258]:
df['bolus_volume_delivered'] = df['bolus_volume_delivered'].clip(lower=0)
print(df[df['bolus_volume_delivered'] < 0])

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id, glucose_range_level, calories_categories, HR_Category]
Index: []


<h4>Verification After setting Range validation columns</h4>

In [260]:
print("\033[1mAfter DataCleanup:\033[0m\n")
verify_data()

[1mAfter DataCleanup:[0m

[1mDataFrame's Information:[0m

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90539 entries, 0 to 90538
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   time                    90539 non-null  datetime64[ns]
 1   glucose                 90539 non-null  float64       
 2   calories                90539 non-null  float64       
 3   heart_rate              90539 non-null  float64       
 4   steps                   90539 non-null  float64       
 5   basal_rate              90539 non-null  float64       
 6   bolus_volume_delivered  90539 non-null  float64       
 7   carb_input              90539 non-null  float64       
 8   patient_id              90539 non-null  string        
 9   glucose_range_level     90539 non-null  object        
 10  calories_categories     90539 non-null  object        
 11  HR_Category             90539 non-null  obje

### Demographic dataset verification

#### To ensure the dataset has non null, non na and unique values

In [262]:
print("\033[1mShape:\033[0m ", demographic_df.shape)
print("\033[1mInfo:\033[0m ", demographic_df.info())
print("\033[1mSum of null:\033[0m\n", demographic_df.isnull().sum())
print("\033[1mSum of na:\033[0m\n", demographic_df.isna().sum())
print("\033[1mSum of duplicated:\033[0m\n", demographic_df.duplicated().sum())
print("\033[1mDescribe:\033[0m\n")
demographic_df.describe().T

[1mShape:[0m  (25, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Patient_ID                    25 non-null     object 
 1   Age                           25 non-null     int64  
 2   Gender                        25 non-null     object 
 3   Race                          25 non-null     object 
 4   Average Sleep Duration (hrs)  25 non-null     float64
 5   Sleep Quality (1-10)          25 non-null     float64
 6   % with Sleep Disturbances     25 non-null     int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.5+ KB
[1mInfo:[0m  None
[1mSum of null:[0m
 Patient_ID                      0
Age                             0
Gender                          0
Race                            0
Average Sleep Duration (hrs)    0
Sleep Quality (1-10)            0
% with Sleep Disturbances       0
d

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,25.0,46.68,15.544881,20.0,34.0,46.0,62.0,74.0
Average Sleep Duration (hrs),25.0,5.972,0.6699,5.0,5.4,5.9,6.6,7.2
Sleep Quality (1-10),25.0,5.952,1.342609,4.1,4.6,5.9,7.1,7.9
% with Sleep Disturbances,25.0,52.8,17.682383,30.0,40.0,50.0,70.0,80.0


### Demographic dataset cleanup

#### For clean and easy access renaming the column names, capitalizing the values.

In [264]:
# Renaming columns to lower and replacing space with undetscore
demographic_df.columns = demographic_df.columns.str.strip().str.lower().str.replace(' ', '_')

# Renaming long column names for simple column names
demographic_df.rename(columns={
    'average_sleep_duration_(hrs)': 'sleep_duration',
    'sleep_quality_(1-10)': 'sleep_quality',
    '%_with_sleep_disturbances': 'sleep_disturbance_percent'
}, inplace=True)

# checking Columns datatype
print(demographic_df.info())

# Checking for missing values
print("Missing values per column:")
print(demographic_df.isnull().sum())

# Converting numeric columns
numeric_cols = ['age', 'sleep_duration', 'sleep_quality', 'sleep_disturbance_percent']
for col in numeric_cols:
    demographic_df[col] = pd.to_numeric(demographic_df[col], errors='coerce')

# Normalizing categorical values
demographic_df['gender'] = demographic_df['gender'].str.capitalize()
demographic_df['race'] = demographic_df['race'].str.title()

# Dropping duplicates
demographic_df.drop_duplicates(subset='patient_id', inplace=True)

# Final check
print("\nCleaned Demographic DataFrame:")
demographic_df.head().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   patient_id                 25 non-null     object 
 1   age                        25 non-null     int64  
 2   gender                     25 non-null     object 
 3   race                       25 non-null     object 
 4   sleep_duration             25 non-null     float64
 5   sleep_quality              25 non-null     float64
 6   sleep_disturbance_percent  25 non-null     int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.5+ KB
None
Missing values per column:
patient_id                   0
age                          0
gender                       0
race                         0
sleep_duration               0
sleep_quality                0
sleep_disturbance_percent    0
dtype: int64

Cleaned Demographic DataFrame:


Unnamed: 0,0,1,2,3,4
patient_id,HUPA0001P,HUPA0002P,HUPA0003P,HUPA0004P,HUPA0005P
age,34,49,64,34,49
gender,Male,Male,Male,Female,Male
race,Other,Hispanic,Black,Native American,Native American
sleep_duration,6.3,6.6,5.3,5.2,5.8
sleep_quality,4.5,4.4,5.2,6.9,7.9
sleep_disturbance_percent,80,40,70,60,30


In [265]:
# Writing merged and demographic cleaned dataset into csv 
df.to_csv('merged_cleandata.csv', index=False)
demographic_df.to_csv("demographic_cleandata.csv", index=False)

In [266]:
# checking merged dataset
df

Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,patient_id,glucose_range_level,calories_categories,HR_Category
0,2020-01-17 00:00:00,40.000,15.043,96.372,8.0,0.035,0.0,1.0,HUPA0023P,Below Range,Light Activity,Normal
1,2020-01-17 00:05:00,41.333,8.316,91.395,0.0,0.035,0.0,0.0,HUPA0023P,Below Range,Light Activity,Normal
2,2020-01-17 00:10:00,42.667,7.583,85.992,0.0,0.035,0.0,0.0,HUPA0023P,Below Range,Light Activity,Normal
3,2020-01-17 00:15:00,44.000,7.338,82.434,0.0,0.035,0.0,0.0,HUPA0023P,Below Range,Light Activity,Normal
4,2020-01-17 00:20:00,50.000,7.583,78.823,0.0,0.035,0.0,0.0,HUPA0023P,Below Range,Light Activity,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...
90534,2019-07-13 18:20:00,70.000,13.588,85.124,61.0,0.152,0.0,0.0,HUPA0020P,In Range,Light Activity,Normal
90535,2019-07-13 18:25:00,80.000,6.578,81.886,0.0,0.152,0.0,0.0,HUPA0020P,In Range,Light Activity,Normal
90536,2019-07-13 18:30:00,90.000,6.902,84.046,0.0,0.152,0.0,0.0,HUPA0020P,In Range,Light Activity,Normal
90537,2019-07-13 18:35:00,108.667,6.470,82.110,0.0,0.152,0.0,0.0,HUPA0020P,In Range,Light Activity,Normal


In [267]:
# checking demographic dataset
demographic_df

Unnamed: 0,patient_id,age,gender,race,sleep_duration,sleep_quality,sleep_disturbance_percent
0,HUPA0001P,34,Male,Other,6.3,4.5,80
1,HUPA0002P,49,Male,Hispanic,6.6,4.4,40
2,HUPA0003P,64,Male,Black,5.3,5.2,70
3,HUPA0004P,34,Female,Native American,5.2,6.9,60
4,HUPA0005P,49,Male,Native American,5.8,7.9,30
5,HUPA0006P,35,Male,White,6.6,4.2,60
6,HUPA0007P,67,Male,Native American,7.1,6.0,80
7,HUPA0009P,65,Female,Other,6.6,4.6,40
8,HUPA0010P,22,Male,Asian,7.1,5.5,50
9,HUPA0011P,63,Female,Other,5.6,4.7,60
