In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import os

# Global plot styling. The 'science' and 'grid' styles come from the
# scienceplots package imported above; the rc overrides enlarge the fonts
# and set a wide default figure size.
plt.style.use(['science', 'grid'])
plt.rc('figure', titlesize = 33, figsize = (21, 7))
plt.rc('axes', titlesize = 27, labelsize = 21, titlepad = 21)
plt.rc('xtick', labelsize = 17)
plt.rc('ytick', labelsize = 17)

# Root folder of the CGMacros data. Set the CGMACROS_DIR environment variable
# to run the notebook on another machine; when it is unset the original local
# path is used, so existing behaviour is unchanged.
master = os.environ.get(
    'CGMACROS_DIR',
    '/Users/fomo/Documents/Kaizen/code/physionet/data/CG Macros',
)

# Glucose Preprocessing

In [17]:
# Preprocess the per-subject CGMacros CSV files: parse the timestamp,
# normalise the column names, drop columns that are not kept downstream and
# write one cleaned CSV per subject into <master>/preprocessed.
preprocessed_dir = os.path.join(master, "preprocessed")
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before the first write.
os.makedirs(preprocessed_dir, exist_ok=True)

count = 0

for i in range(1, 50):

    # subject ids are zero-padded to three digits (CGMacros-001 ... CGMacros-049)
    filestring = f"CGMacros-{i:03d}"

    try:
        temp = pd.read_csv(os.path.join(master, "raw", filestring, filestring + ".csv"))
    except FileNotFoundError:
        # some subject numbers have no file; report them and move on
        print(f"File {filestring} not found")
        continue

    # parse the timestamp while the column still has its raw name
    temp["Timestamp"] = pd.to_datetime(temp["Timestamp"])

    # snake_case headers without parentheses; regex=False keeps '(' and ')'
    # literal regardless of the installed pandas version
    temp.columns = (
        temp.columns
        .str.lower()
        .str.strip()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    # drop the image path and, if present, a leftover index column;
    # errors="ignore" keeps one file without these columns from aborting the run
    temp = temp.drop(columns=["image_path", "unnamed:_0"], errors="ignore")

    # save the preprocessed data to a new CSV file
    new_filestring = os.path.join(preprocessed_dir, f"{filestring}-preprocessed.csv")
    temp.to_csv(new_filestring, index=False)

    count += 1

print(f"Preprocessed {count} files")

File CGMacros-024 not found
File CGMacros-025 not found
File CGMacros-037 not found
File CGMacros-040 not found
Preprocessed 45 files


## Glucose EDA

In [18]:
# `temp` is whatever frame the preprocessing loop processed last.
# Only the final expression of a cell is displayed, so the shape is printed
# explicitly; previously it was computed and silently discarded.
print(temp.shape)

temp.columns

Index(['timestamp', 'libre_gl', 'dexcom_gl', 'hr', 'calories_activity', 'mets',
       'meal_type', 'calories', 'carbs', 'protein', 'fat', 'fiber',
       'amount_consumed'],
      dtype='object')

# Biomarker Preprocessing

In [3]:
# Load the biomarker table and derive the features saved to bio_edit.csv.
bio = pd.read_csv(os.path.join(master, "raw", "bio.csv"))

# Normalise the headers. The replacement order matters: the rename map below
# is keyed on the names produced by exactly this sequence. regex=False keeps
# '(', ')' and '.' literal regardless of the installed pandas version.
bio.columns = (
    bio.columns
    .str.strip()
    .str.lower()
    .str.replace('#', '', regex=False)
    .str.replace('__', '', regex=False)
    .str.replace('_glu', '', regex=False)
    .str.replace('contour', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('.', '', regex=False)
)
bio = bio.drop(columns = ["collection_time_pdl_lab"])
bio = bio.rename(columns = {
    "1__fingerstick_glu": "fingerstick_one",
    "2__fingerstick_glu": "fingerstick_two",
    "3__fingerstick_glu": "fingerstick_three",
    "time_t":"time_one",
    "time_t1":"time_two",
    "time_t2":"time_three",
    "fasting_glu_-_pdl_lab":"fasting_glu",
    "a1c_pdl_lab":"a1c",
    "self-identify":"ethnicity"
})

# cleaning the ethnicity column, then one-hot encoding it
# NOTE(review): strip() runs after spaces were turned into underscores, so any
# leading/trailing blank in the raw values survives as '_' - confirm the raw
# values carry no surrounding whitespace.
bio["ethnicity"] = (
    bio["ethnicity"]
    .str.lower()
    .str.replace('hispanic/', '', regex=False)
    .str.replace('black, ', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.strip()
)
bio = pd.get_dummies(bio, columns=["ethnicity"])

# calculating the average of the three fingerstick glucose readings
bio["fingerstick_avg"] = bio[["fingerstick_one", "fingerstick_two", "fingerstick_three"]].mean(axis = 1)

# encode gender as a boolean flag: True for 'M', False for every other value
bio["gender"] = bio["gender"] == 'M'

# create the 'diabetes' column from A1c using left-closed bins:
#   0: a1c < 5.7        (normal)
#   1: 5.7 <= a1c < 6.5 (prediabetes, i.e. 5.7-6.4 %)
#   2: a1c >= 6.5       (diabetes)
# The upper edge has to be 6.5: with right=False an edge of 6.4 would put an
# A1c of exactly 6.4 into the diabetes class.
bins = [0, 5.7, 6.5, float('inf')]
labels = [0, 1, 2]
bio['diabetes'] = pd.cut(bio['a1c'], bins=bins, labels=labels, right=False)

# save the data (the raw fingerstick readings and their times are not kept)
bio = bio.drop(columns = ["fingerstick_one", "fingerstick_two", "fingerstick_three", "time_one", "time_two", "time_three"])
bio.to_csv(os.path.join(master, "bio_edit.csv"), index = False)

# info() prints its report; head() goes last so the cell also displays it
bio.info()
bio.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   subject                     45 non-null     int64   
 1   age                         45 non-null     int64   
 2   gender                      45 non-null     bool    
 3   bmi                         45 non-null     float64 
 4   body_weight                 45 non-null     float64 
 5   height                      45 non-null     float64 
 6   a1c                         45 non-null     float64 
 7   fasting_glu                 45 non-null     int64   
 8   insulin                     45 non-null     float64 
 9   triglycerides               45 non-null     int64   
 10  cholesterol                 45 non-null     int64   
 11  hdl                         45 non-null     int64   
 12  non_hdl                     45 non-null     int64   
 13  ldl_cal               

# Gut Health Preprocessing

In [5]:
# Load the gut health test table, normalise its headers and encode every
# score column as an ordered categorical.
gut = pd.read_csv(os.path.join(master, "raw", "gut_health_test.csv"))

# snake_case headers without '#', parentheses or dots; regex=False keeps the
# punctuation literal regardless of the installed pandas version
gut.columns = (
    gut.columns
    .str.strip()
    .str.lower()
    .str.replace('#', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('.', '', regex=False)
)

# rows with any missing value are removed before the integer cast below
gut.dropna(inplace = True)

for col in gut.columns:
    if col == 'subject':
        # the subject id is a key, not a score
        continue
    # cast to int, then to an ordered categorical over 1 < 2 < 3;
    # values outside {1, 2, 3} become NaN
    gut[col] = pd.Categorical(gut[col].astype('int64'), ordered = True, categories = [1, 2, 3])

# index=False matches the other exports in this notebook and avoids writing
# the (non-contiguous after dropna) row index as an extra unnamed column
gut.to_csv(os.path.join(master, "gut_health_test_edit.csv"), index = False)

In [6]:
# Join the biomarker and gut-health tables on the subject id (inner join, the
# pd.merge default) and drop any row that still has a missing value.
bioxgut = pd.merge(bio, gut, on = 'subject').dropna()

# write next to the other derived tables, under the shared data root
bioxgut.to_csv(os.path.join(master, "bio_x_gut.csv"), index = False)

bioxgut.shape

(42, 43)

# Microbes Preprocessing

In [26]:
# Load the microbe table and store its indicator columns as booleans.
microbes = pd.read_csv(os.path.join(master, "raw", "microbes.csv"))

# astype returns a new object, so the result must be assigned back; the
# previous loop discarded it and left the frame unchanged.
# NOTE(review): assumes the id column is called 'subject', as in the bio and
# gut tables - confirm. It is excluded so the ids are not collapsed to True.
# NOTE(review): a missing value would be cast to True - confirm there are none.
flag_cols = [c for c in microbes.columns if str(c).strip().lower() != "subject"]
microbes = microbes.astype({c: "bool" for c in flag_cols})

# index=False matches the other exports and avoids an extra unnamed column
microbes.to_csv(os.path.join(master, "microbes_edit.csv"), index=False)