# 1. Data pre-preparation

In [22]:
import glob
import os
import pandas as pd

In [71]:
# Folder holding the per-patient HUPA CSV files and the demographics file.
# Defined once so the location only has to be changed in a single place.
DATA_DIR = r"C:\Febi\NumpyNinja\Hackathon\Python_Launch Folder\Launch Folder\HUPA-UC Diabetes Dataset"

# Get all csv files from folder. glob returns matches in arbitrary order, so
# sort them to keep the merged row order identical on every run.
files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))

# Patient demographics / sleep table stored alongside the patient files.
demographic_df = pd.read_csv(os.path.join(DATA_DIR, "T1DM_patient_sleep_demographics_with_race.csv"))

**Merging the 25 patient data files into one file for cleanup and analysis**
<h5>Adding a patient_id column as a unique identifier based on the filename</h5>

In [25]:
def merge_rawfiles(file_paths=None, output_path="mergedraw_file.csv"):
    """Merge the per-patient HUPA CSV files into one DataFrame and save it.

    Only paths whose base name starts with "HUPA" are read (";"-separated).
    Column headers are stripped and lower-cased so every file shares the same
    names, and a ``patient_id`` column holding the source file name
    (extension included, e.g. "HUPA0017P.csv") is added to each frame.

    Parameters
    ----------
    file_paths : iterable of str, optional
        Candidate CSV paths. When omitted, the notebook-level ``files`` list
        is used, which is what the original zero-argument call relied on.
    output_path : str, optional
        Destination of the merged CSV. Defaults to "mergedraw_file.csv".

    Returns
    -------
    pandas.DataFrame
        All patient rows concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the paths is a HUPA patient file.
    """
    if file_paths is None:
        file_paths = files

    patient_frames = []
    for path in file_paths:
        filename = os.path.basename(path)
        # Skip anything that is not a patient file (e.g. the demographics CSV).
        if not filename.startswith("HUPA"):
            continue

        patient_df = pd.read_csv(path, sep=";")

        # Uniform column header in all files
        patient_df.columns = patient_df.columns.str.strip().str.lower()

        # Add patient_id (the file name is the only per-patient identifier)
        patient_df["patient_id"] = filename
        patient_frames.append(patient_df)

    # pd.concat([]) fails with an unhelpful message; say what actually went wrong.
    if not patient_frames:
        raise ValueError("No patient files starting with 'HUPA' were found to merge")

    # Merge the patient data files
    merged = pd.concat(patient_frames, ignore_index=True)

    # Saving it as a single merged file
    merged.to_csv(output_path, index=False)
    return merged

<h4>Standardize based on time</h4>
<h5><font face="Times New Roman">Parse the time column as datetime, remove duplicate rows based on time, and set time as the index</font></h5>

In [27]:
def standardize_basedontime(df):
    """Parse the ``time`` column, drop duplicate readings and index by time.

    The ``time`` column of ``df`` is converted to datetime in place
    (unparseable values become NaT), so callers that ignore the return value
    still see parsed timestamps. De-duplication and indexing are done on a
    new frame, which is returned; ``df`` itself keeps all of its rows and
    its ``time`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column and, optionally, a ``patient_id`` column.

    Returns
    -------
    pandas.DataFrame
        De-duplicated copy of ``df`` with ``time`` as its index.

    Raises
    ------
    KeyError
        If ``df`` has no ``time`` column.
    """
    if "time" not in df.columns:
        raise KeyError("DataFrame has no 'time' column to standardize on")

    # Time parser for datetime calculation (mutates the caller's frame).
    df["time"] = pd.to_datetime(df["time"], errors="coerce")

    # Different patients can share a timestamp, so a reading is only a
    # duplicate when both the patient and the time match.
    dedup_keys = ["patient_id", "time"] if "patient_id" in df.columns else ["time"]

    # De-duplicate while "time" is still a column, then move it to the index;
    # set_index returns a new frame, so its result has to be kept.
    deduplicated = df.drop_duplicates(subset=dedup_keys)
    return deduplicated.set_index("time")


### Heart Rate column pre-processing

#### Reasoning: Validating the heart-rate range and assigning each reading to a category makes grouped analysis easier

In [38]:
def hr_category(hr):
    """Classify a heart-rate value into a named range.

    Ranges (continuous, so fractional readings such as 100.23 are covered):
      * 40 <= hr <= 100  -> 'Normal'
      * 100 < hr <= 180  -> 'Active'
      * 180 < hr < 220   -> 'High'
      * anything else    -> 'Abnormal' (includes NaN, for which every
        comparison is False)

    Parameters
    ----------
    hr : float
        A single heart-rate reading.

    Returns
    -------
    str
        One of 'Normal', 'Active', 'High', 'Abnormal'.
    """
    if 40 <= hr <= 100:
        return 'Normal'

    # Starts right above 100 so values between 100 and 101 are not lost.
    if 100 < hr <= 180:
        return 'Active'

    # 180 itself belongs to 'Active'; this range is strictly above it.
    if 180 < hr < 220:
        return 'High'

    return 'Abnormal'

### Steps column validation 

#### Reasoning: Validate that the steps column lies within the range 0–1500 to surface extreme outliers and negative values.


In [41]:
# Sum of every step count lying outside the 0-1500 range checked here
steps_out_of_range = (df['steps'] < 0) | (df['steps'] > 1500)
df.loc[steps_out_of_range, 'steps'].sum()

0.0

### Carb_input validation

#### Reasoning: Validation of carb_input column to investigate extreme outliers and negative values. 

In [44]:
# Sum of every carb_input value lying outside the 0-150 range checked here
carbs_out_of_range = (df['carb_input'] < 0) | (df['carb_input'] > 150)
df.loc[carbs_out_of_range, 'carb_input'].sum()

0.0

### Basal_rate validation

#### Reasoning: Validation of basal_rate column to investigate extreme outliers and negative values.

In [49]:
# Show any rows whose basal_rate is negative
negative_basal_rows = df.loc[df['basal_rate'] < 0]
print(negative_basal_rows)

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id]
Index: []


### Bolus_volume_delivered validation

#### Reasoning: Validation of bolus_volume_delivered column to investigate extreme outliers and negative values.

In [56]:
# Rows with a negative bolus volume, counted per patient
negative_bolus_rows = df.loc[df['bolus_volume_delivered'] < 0]
print(negative_bolus_rows.groupby(by='patient_id').value_counts())

patient_id     time                 glucose  calories  heart_rate  steps  basal_rate  bolus_volume_delivered  carb_input
HUPA0017P.csv  2019-03-29 15:00:00  103.0    6.48924   84.178571   0.0    0.059       -1.0                    7.0           1
               2019-03-31 14:30:00  84.0     4.77150   98.220272   0.0    0.059       -1.0                    4.0           1
               2019-04-06 12:45:00  134.0    14.60079  100.230769  84.0   0.059       -3.0                    4.0           1
               2019-04-07 13:15:00  97.0     4.77150   98.323864   0.0    0.059       -1.0                    2.5           1
Name: count, dtype: int64


#### Reasoning: Setting all negative bolus_volume_delivered values to 0 keeps the data clean

In [68]:
# Floor negative bolus volumes at 0, then confirm that no negative rows remain
bolus_volume = df['bolus_volume_delivered']
df['bolus_volume_delivered'] = bolus_volume.clip(lower=0)
print(df.loc[df['bolus_volume_delivered'] < 0])

Empty DataFrame
Columns: [time, glucose, calories, heart_rate, steps, basal_rate, bolus_volume_delivered, carb_input, patient_id]
Index: []


In [28]:
# Build the merged frame and run the time standardisation step on it
df = merge_rawfiles()
standardize_basedontime(df)

# Label every heart-rate reading with its category
heart_rate_labels = df['heart_rate'].apply(hr_category)
df['HR_Category'] = heart_rate_labels
df

### Demographic dataset verification

#### Verify that the dataset has no null/NA values and no duplicate rows

In [78]:
# Basic integrity checks on the demographics table.
print("\033[1mShape:\033[0m ", demographic_df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line; wrapping it in print() would also print "None".
print("\033[1mInfo:\033[0m")
demographic_df.info()

# isnull() and isna() are aliases in pandas; both are kept so the report
# layout stays the same.
print("\033[1mSum of null:\033[0m\n", demographic_df.isnull().sum())
print("\033[1mSum of na:\033[0m\n", demographic_df.isna().sum())
print("\033[1mSum of duplicated:\033[0m\n", demographic_df.duplicated().sum())
print("\033[1mDescribe:\033[0m\n")
demographic_df.describe().T

[1mShape:[0m  (25, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Patient_ID                    25 non-null     object 
 1   Age                           25 non-null     int64  
 2   Gender                        25 non-null     object 
 3   Race                          25 non-null     object 
 4   Average Sleep Duration (hrs)  25 non-null     float64
 5   Sleep Quality (1-10)          25 non-null     float64
 6   % with Sleep Disturbances     25 non-null     int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.5+ KB
[1mInfo:[0m  None
[1mSum of null:[0m
 Patient_ID                      0
Age                             0
Gender                          0
Race                            0
Average Sleep Duration (hrs)    0
Sleep Quality (1-10)            0
% with Sleep Disturbances       0
d

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,25.0,46.68,15.544881,20.0,34.0,46.0,62.0,74.0
Average Sleep Duration (hrs),25.0,5.972,0.6699,5.0,5.4,5.9,6.6,7.2
Sleep Quality (1-10),25.0,5.952,1.342609,4.1,4.6,5.9,7.1,7.9
% with Sleep Disturbances,25.0,52.8,17.682383,30.0,40.0,50.0,70.0,80.0
