# 1. Data preparation

In [22]:
import glob
import os
import pandas as pd

In [23]:
# Folder holding the per-patient CSV files. Set the HUPA_DATA_DIR environment
# variable to point somewhere else; the default is the original local path.
DATA_DIR = os.environ.get(
    "HUPA_DATA_DIR",
    r"C:\Febi\NumpyNinja\Hackathon\Python_Launch Folder\Launch Folder\HUPA-UC Diabetes Dataset",
)

# Get all csv files from the folder; sorted so the merge order is the same on every run
files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))


**Merging the 25 patient data files into one file for cleanup and analysis**
<h5>Adding a patient_id column as a unique identifier based on the filename</h5>

In [25]:
def merge_rawfiles(file_paths=None, output_path="mergedraw_file.csv"):
    """Merge the per-patient HUPA CSV files into a single DataFrame.

    Only files whose base name starts with "HUPA" are read. Each file is parsed
    with ";" as the separator, its column headers are stripped and lower-cased
    so every file shares the same header spelling, and a ``patient_id`` column
    holding the file's base name is added.

    Parameters
    ----------
    file_paths : iterable of str, optional
        Candidate CSV paths. When omitted, the notebook-level ``files`` list
        is used, which matches the original zero-argument call.
    output_path : str or None, optional
        Destination of the merged CSV (written without the index).
        Pass ``None`` to skip writing the file.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the candidate paths is a "HUPA*" file.
    """
    if file_paths is None:
        file_paths = files  # notebook-level list built by the glob cell

    patient_frames = []
    for path in file_paths:
        filename = os.path.basename(path)
        if not filename.startswith("HUPA"):
            continue  # ignore anything that is not a patient file

        patient_df = pd.read_csv(path, sep=";")

        # Uniform column header in all files
        patient_df.columns = patient_df.columns.str.strip().str.lower()

        # Add patient_id (the source file name identifies the patient)
        patient_df["patient_id"] = filename
        patient_frames.append(patient_df)

    if not patient_frames:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
        raise ValueError("No HUPA*.csv patient files found to merge")

    # Merge the patient data files
    merged = pd.concat(patient_frames, ignore_index=True)

    # Save it as a single merged file
    if output_path is not None:
        merged.to_csv(output_path, index=False)
    return merged

<h4>Standardize based on time</h4>
<h5><font face="Times New Roman">Parsed datetime, removed duplicate rows based on time, and set time as the index</font></h5>

In [27]:
def standardize_basedontime(df):
    """Parse the ``time`` column, drop duplicate readings and index by time.

    The ``time`` column of the frame passed in is converted to datetime in
    place (unparseable values become ``NaT``), so existing callers that ignore
    the return value still see parsed timestamps. Duplicates are removed and
    the index is set on a new frame, which is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column and, optionally, a ``patient_id`` column.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame indexed by ``time``.

    Raises
    ------
    KeyError
        If ``df`` has no ``time`` column.
    """
    if "time" not in df.columns:
        raise KeyError("standardize_basedontime expects a 'time' column")

    # Time parser for datetime calculation
    df["time"] = pd.to_datetime(df["time"], errors="coerce")

    # Removing duplicate rows. When patients are merged into one frame, two
    # patients can share a timestamp, so a duplicate is a repeated
    # (patient_id, time) pair rather than a repeated time alone.
    if "patient_id" in df.columns:
        duplicate_keys = ["patient_id", "time"]
    else:
        duplicate_keys = ["time"]
    deduplicated = df.drop_duplicates(subset=duplicate_keys)

    # Setting the time column as index. This comes last because set_index
    # returns a new frame and moves "time" out of the columns.
    return deduplicated.set_index("time")


### Heart Rate column pre-processing

#### Reasoning: Validating the heart-rate range and categorizing each reading makes grouped analysis possible

In [38]:
def hr_category(hr):
    """Classify a single heart-rate reading into a named range.

    Ranges are contiguous, so fractional readings are never left between two
    categories:

    * ``40 <= hr <= 100``  -> 'Normal'
    * ``100 < hr <= 180``  -> 'Active'
    * ``180 < hr < 220``   -> 'High'
    * anything else (including NaN) -> 'Abnormal'

    Parameters
    ----------
    hr : int or float
        One heart-rate value.

    Returns
    -------
    str
        'Normal', 'Active', 'High' or 'Abnormal'.
    """
    if 40 <= hr <= 100:
        return 'Normal'

    # Starts right after 100 so values such as 100.5 are not reported as 'Abnormal'
    if 100 < hr <= 180:
        return 'Active'

    # 180 itself is 'Active'; 220 and above is treated as out of range
    if 180 < hr < 220:
        return 'High'

    # Below 40, 220 or above, or NaN (every comparison with NaN is False)
    return 'Abnormal'

### Steps column validation 

#### Reasoning: Validate that the steps column stays within the range 0–1500, to investigate extreme outliers and negative values.


In [41]:
# Total of the step values outside the 0-1500 range; 0.0 when no row is out of range
steps_out_of_range = df['steps'].lt(0) | df['steps'].gt(1500)
df.loc[steps_out_of_range, 'steps'].sum()

0.0

### Carb_input validation

#### Reasoning: Validate that the carb_input column stays within the range 0–150, to investigate extreme outliers and negative values.

In [44]:
# Total of the carb_input values outside the 0-150 range; 0.0 when no row is out of range
carb_out_of_range = df['carb_input'].lt(0) | df['carb_input'].gt(150)
df.loc[carb_out_of_range, 'carb_input'].sum()

0.0

In [28]:
# Build the merged frame from the raw patient files
df = merge_rawfiles()

# Called for its in-place effect on df["time"]; the return value is not used here
standardize_basedontime(df)

# Label every heart-rate reading with its category
df['HR_Category'] = df['heart_rate'].map(hr_category)
df