# 1. Data set pre-processing

In [87]:
import glob
import os
import pandas as pd

In [88]:
# Get all csv files from folder
files = glob.glob(r"C:\Febi\NumpyNinja\Hackathon\Python_Launch Folder\Launch Folder\HUPA-UC Diabetes Dataset\*.csv")

demographic_df = pd.read_csv(r"C:\Febi\NumpyNinja\Hackathon\Python_Launch Folder\Launch Folder\HUPA-UC Diabetes Dataset\T1DM_patient_sleep_demographics_with_race.csv")

**Merging 25 patient data files as one file for cleanup and Analysis**
<h5>Adding patient_Id column as unique identifier based on filename</h5>

In [90]:
def merge_rawfiles():
# Merging raw data file as one file
    df_files = []
    for file in files:

        filename = os.path.basename(file)
        if not filename.startswith("HUPA"):
            continue
        df = pd.read_csv(file,sep=";") 

        # Uniform column header in all files
        df.columns = df.columns.str.strip().str.lower()
        
        #Add patient_id
        df["patient_id"] = filename
        df_files.append(df)

    # Merge the patient data files
    df = pd.concat(df_files, ignore_index=True)

    # Saving it as Single Merged file
    df.to_csv("mergedraw_file.csv", index=False)
    return df

<h4>standardize based on time</h4>
<h5><font="TimesNewRoman">Parsed datetime, removed duplicate rows based on time and set time as index</font></h5>

In [92]:
def standardize_basedontime(df):

    # Time Parser for datetime calculation
    if "time" in df.columns:
        df["time"] = pd.to_datetime(df["time"],errors="coerce")

    # Setting the time column as index
    df.set_index("time")
        
    # Removing duplicate rows
    df = df.drop_duplicates(subset=["time"])


### Heart Rate column pre-processing

#### Reasoning: HR Range validation and categorizing helps for grouped analysis

In [94]:
def hr_category(hr):

    # To HR Range validation and categorizing
    if  (hr >= 40) & (hr <= 100): 
        return 'Normal'
         
    elif  (hr >= 101) & (hr <= 180):
      return 'Active'
    
    elif (hr >= 180) & (hr < 220):
       return 'High'

    else: 
        return 'Abnormal'

### Steps column validation 

#### Reasoning: Validation of steps column range between 0 - 1500 to investigate extreme outliers and negative values.


In [96]:
df[(df['steps'] < 0) | (df['steps'] > 1500)]['steps'].sum()

0.0

### Carb_input validation

#### Reasoning: Validation of carb_input column to investigate extreme outliers and negative values. 

In [98]:
df[(df['carb_input'] < 0) | (df['carb_input'] > 150)]['carb_input'].sum()

0.0

### Basal_rate validation

#### Reasoning: Validation of basal_rate column to investigate extreme outliers and negative values.

In [100]:
print(df[df['basal_rate'] < 0])

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id]
Index: []


### Bolus_volume_delivered validation

#### Reasoning: Validation of bolus_volume_delivered column to investigate extreme outliers and negative values.

In [102]:
print(df[df['bolus_volume_delivered']<0].groupby(by='patient_id').value_counts())

Series([], Name: count, dtype: int64)


#### Reasoning: Treating all negative bolus_volume_delivered to 0 helps to maintain clean data 

In [104]:
df['bolus_volume_delivered'] = df['bolus_volume_delivered'].clip(lower=0)
print(df[df['bolus_volume_delivered'] < 0])

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id]
Index: []


In [105]:
df = merge_rawfiles()
standardize_basedontime(df)
df['HR_Category'] = df['heart_rate'].apply(hr_category)
df

Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,patient_id,HR_Category
0,2018-06-13 18:40:00,332.000000,6.35950,82.322835,34.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
1,2018-06-13 18:45:00,326.000000,7.72800,83.740157,0.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
2,2018-06-13 18:50:00,330.000000,4.74950,80.525180,0.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
3,2018-06-13 18:55:00,324.000000,6.35950,89.129032,20.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
4,2018-06-13 19:00:00,306.000000,5.15200,92.495652,0.0,0.075000,0.0,0.0,HUPA0001P.csv,Normal
...,...,...,...,...,...,...,...,...,...,...
309387,2022-05-18 11:55:00,109.333333,10.79280,104.171171,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Active
309388,2022-05-18 12:00:00,114.000000,9.80346,103.442623,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Active
309389,2022-05-18 12:05:00,118.666667,5.66622,95.542857,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Normal
309390,2022-05-18 12:10:00,123.333333,5.57628,91.381356,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Normal


### Demographic dataset verification

#### To ensure the dataset has non null, non na and unique values

In [107]:
print("\033[1mShape:\033[0m ", demographic_df.shape)
print("\033[1mInfo:\033[0m ", demographic_df.info())
print("\033[1mSum of null:\033[0m\n", demographic_df.isnull().sum())
print("\033[1mSum of na:\033[0m\n", demographic_df.isna().sum())
print("\033[1mSum of duplicated:\033[0m\n", demographic_df.duplicated().sum())
print("\033[1mDescribe:\033[0m\n")
demographic_df.describe().T

[1mShape:[0m  (25, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Patient_ID                    25 non-null     object 
 1   Age                           25 non-null     int64  
 2   Gender                        25 non-null     object 
 3   Race                          25 non-null     object 
 4   Average Sleep Duration (hrs)  25 non-null     float64
 5   Sleep Quality (1-10)          25 non-null     float64
 6   % with Sleep Disturbances     25 non-null     int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.5+ KB
[1mInfo:[0m  None
[1mSum of null:[0m
 Patient_ID                      0
Age                             0
Gender                          0
Race                            0
Average Sleep Duration (hrs)    0
Sleep Quality (1-10)            0
% with Sleep Disturbances       0
d

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,25.0,46.68,15.544881,20.0,34.0,46.0,62.0,74.0
Average Sleep Duration (hrs),25.0,5.972,0.6699,5.0,5.4,5.9,6.6,7.2
Sleep Quality (1-10),25.0,5.952,1.342609,4.1,4.6,5.9,7.1,7.9
% with Sleep Disturbances,25.0,52.8,17.682383,30.0,40.0,50.0,70.0,80.0


### Demographic dataset cleanup

#### For clean and easy access renaming the column names, capitalizing the values.

In [122]:
# Renaming columns to lower and replacing space with undetscore
demographic_df.columns = demographic_df.columns.str.strip().str.lower().str.replace(' ', '_')

# Renaming long column names for simple column names
demographic_df.rename(columns={
    'average_sleep_duration_(hrs)': 'sleep_duration',
    'sleep_quality_(1-10)': 'sleep_quality',
    '%_with_sleep_disturbances': 'sleep_disturbance_percent'
}, inplace=True)

# checking Columns datatype
print(demographic_df.info())

# Checking for missing values
print("Missing values per column:")
print(demographic_df.isnull().sum())

# Converting numeric columns
numeric_cols = ['age', 'sleep_duration', 'sleep_quality', 'sleep_disturbance_percent']
for col in numeric_cols:
    demographic_df[col] = pd.to_numeric(demographic_df[col], errors='coerce')

# Normalizing categorical values
demographic_df['gender'] = demographic_df['gender'].str.capitalize()
demographic_df['race'] = demographic_df['race'].str.title()

# Dropping duplicates
demographic_df.drop_duplicates(subset='patient_id', inplace=True)

# Final check
print("\nCleaned Demographic DataFrame:")
demographic_df.head().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   patient_id                 25 non-null     object 
 1   age                        25 non-null     int64  
 2   gender                     25 non-null     object 
 3   race                       25 non-null     object 
 4   sleep_duration             25 non-null     float64
 5   sleep_quality              25 non-null     float64
 6   sleep_disturbance_percent  25 non-null     int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.5+ KB
None
Missing values per column:
patient_id                   0
age                          0
gender                       0
race                         0
sleep_duration               0
sleep_quality                0
sleep_disturbance_percent    0
dtype: int64

Cleaned Demographic DataFrame:


Unnamed: 0,0,1,2,3,4
patient_id,HUPA0001P,HUPA0002P,HUPA0003P,HUPA0004P,HUPA0005P
age,34,49,64,34,49
gender,Male,Male,Male,Female,Male
race,Other,Hispanic,Black,Native American,Native American
sleep_duration,6.3,6.6,5.3,5.2,5.8
sleep_quality,4.5,4.4,5.2,6.9,7.9
sleep_disturbance_percent,80,40,70,60,30


In [124]:
# Writing merged and demographic cleaned dataset into csv 
df.to_csv('merged_cleandata.csv', index=False)
demographic_df.to_csv("demographic_cleandata.csv", index=False)

In [125]:
# checking merged dataset
df

Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,patient_id,HR_Category
0,2018-06-13 18:40:00,332.000000,6.35950,82.322835,34.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
1,2018-06-13 18:45:00,326.000000,7.72800,83.740157,0.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
2,2018-06-13 18:50:00,330.000000,4.74950,80.525180,0.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
3,2018-06-13 18:55:00,324.000000,6.35950,89.129032,20.0,0.091667,0.0,0.0,HUPA0001P.csv,Normal
4,2018-06-13 19:00:00,306.000000,5.15200,92.495652,0.0,0.075000,0.0,0.0,HUPA0001P.csv,Normal
...,...,...,...,...,...,...,...,...,...,...
309387,2022-05-18 11:55:00,109.333333,10.79280,104.171171,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Active
309388,2022-05-18 12:00:00,114.000000,9.80346,103.442623,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Active
309389,2022-05-18 12:05:00,118.666667,5.66622,95.542857,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Normal
309390,2022-05-18 12:10:00,123.333333,5.57628,91.381356,0.0,0.000000,0.0,0.0,HUPA0028P.csv,Normal


In [128]:
# checking demographic dataset
demographic_df


Unnamed: 0,patient_id,age,gender,race,sleep_duration,sleep_quality,sleep_disturbance_percent
0,HUPA0001P,34,Male,Other,6.3,4.5,80
1,HUPA0002P,49,Male,Hispanic,6.6,4.4,40
2,HUPA0003P,64,Male,Black,5.3,5.2,70
3,HUPA0004P,34,Female,Native American,5.2,6.9,60
4,HUPA0005P,49,Male,Native American,5.8,7.9,30
5,HUPA0006P,35,Male,White,6.6,4.2,60
6,HUPA0007P,67,Male,Native American,7.1,6.0,80
7,HUPA0009P,65,Female,Other,6.6,4.6,40
8,HUPA0010P,22,Male,Asian,7.1,5.5,50
9,HUPA0011P,63,Female,Other,5.6,4.7,60
