In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import os

# Base look: the 'science' style plus the 'grid' overlay (style names made
# available by the scienceplots import above).
plt.style.use(['science', 'grid'])

# Notebook-wide figure size and font sizes, layered on top of the style sheet.
# Each key is an rc group; each value holds that group's overrides.
rc_overrides = {
    'figure': {'titlesize': 33, 'figsize': (21, 7)},
    'axes': {'titlesize': 27, 'labelsize': 21, 'titlepad': 21},
    'xtick': {'labelsize': 17},
    'ytick': {'labelsize': 17},
}
for rc_group, rc_values in rc_overrides.items():
    plt.rc(rc_group, **rc_values)

In [31]:
# Load the raw table. The cell always starts again from the CSV, so re-running
# it does not compound any of the transformations below.
bio = pd.read_csv('data/bio.csv')

# Normalise the header names. The steps are order-sensitive: '__' and '_glu'
# are removed *before* spaces are turned into underscores, so they can only
# match underscores already present in the raw headers. Removing 'contour'
# from between two spaces is presumably what produces the double underscore in
# the "N__fingerstick_glu" rename keys further down (verify against the raw
# CSV header).
# regex=False keeps every pattern literal -- '(', ')' and '.' are regex
# metacharacters -- independent of the installed pandas version's default.
bio.columns = (
    bio.columns
    .str.strip()
    .str.lower()
    .str.replace('#', '', regex=False)
    .str.replace('__', '', regex=False)
    .str.replace('_glu', '', regex=False)
    .str.replace('contour', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('.', '', regex=False)
)

# Drop the collection-time column and give the remaining awkward names short,
# readable ones. Chained (no inplace=True) so the result is one explicit
# assignment.
bio = (
    bio
    .drop(columns=["collection_time_pdl_lab"])
    .rename(columns={
        "1__fingerstick_glu": "fingerstick_one",
        "2__fingerstick_glu": "fingerstick_two",
        "3__fingerstick_glu": "fingerstick_three",
        "time_t": "time_one",
        "time_t1": "time_two",
        "time_t2": "time_three",
        "fasting_glu_-_pdl_lab": "fasting_glu",
        "a1c_pdl_lab": "a1c",
        "self-identify": "ethnicity",
    })
)

# Canonicalise the ethnicity labels before one-hot encoding.
# Whitespace is stripped FIRST: if it ran after the ' ' -> '_' replacement,
# leading/trailing blanks would already have become underscores, the strip
# would be a no-op, and e.g. "White " and "White" would end up in two
# different dummy columns.
bio["ethnicity"] = (
    bio["ethnicity"]
    .str.strip()
    .str.lower()
    .str.replace('hispanic/', '', regex=False)
    .str.replace('black, ', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

# One indicator column per ethnicity label ("ethnicity_<label>").
bio = pd.get_dummies(bio, columns=["ethnicity"])

# Boolean flag: True where gender is 'M', False for every other value
# (including missing ones).
bio["gender"] = bio["gender"].eq('M')

# create the 'diabetes' column
# ADA A1c cut-offs (%): normal < 5.7, prediabetes 5.7-6.4, diabetes >= 6.5.
# With right=False the intervals are [0, 5.7), [5.7, 6.5), [6.5, inf), so the
# upper edge has to be 6.5 -- an edge of 6.4 would label an A1c of exactly 6.4
# as diabetes (class 2) instead of prediabetes (class 1).
bins = [0, 5.7, 6.5, float('inf')]
labels = [0, 1, 2]
bio['diabetes'] = pd.cut(bio['a1c'], bins=bins, labels=labels, right=False)

# save the data
bio.to_csv('data/bio_test.csv', index = False)

# Only the last expression of a cell is rendered, so print the schema first
# and leave the preview as the final expression; otherwise head() is discarded.
bio.info()
bio.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   subject                     45 non-null     int64   
 1   age                         45 non-null     int64   
 2   gender                      45 non-null     bool    
 3   bmi                         45 non-null     float64 
 4   body_weight                 45 non-null     float64 
 5   height                      45 non-null     float64 
 6   a1c                         45 non-null     float64 
 7   fasting_glu                 45 non-null     int64   
 8   insulin                     45 non-null     float64 
 9   triglycerides               45 non-null     int64   
 10  cholesterol                 45 non-null     int64   
 11  hdl                         45 non-null     int64   
 12  non_hdl                     45 non-null     int64   
 13  ldl_cal               

In [None]:
# Grid of pairwise plots over the columns of the `bio` frame.
sns.pairplot(data=bio)

In [20]:
# Load the raw table; the cell restarts from the CSV every time it is run.
gut = pd.read_csv('data/gut_health_test.csv')

# Normalise the header names: trim, lower-case, drop '#', '(', ')' and '.',
# and turn spaces into underscores. Patterns are matched literally.
gut.columns = (
    gut.columns
    .str.strip()
    .str.lower()
    .str.replace('#', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('.', '', regex=False)
)

# Every column except the subject id is stored as an ordered categorical with
# levels 1 < 2 < 3; any value outside those levels turns into NaN.
three_level = pd.CategoricalDtype(categories=[1, 2, 3], ordered=True)
answer_cols = [name for name in gut.columns if name != 'subject']

# Rows with any missing value are removed first so the integer cast succeeds;
# the integer cast precedes the categorical cast, as the values must be ints
# to match the integer categories.
gut = (
    gut
    .dropna()
    .astype(dict.fromkeys(answer_cols, 'int64'))
    .astype(dict.fromkeys(answer_cols, three_level))
)



(42, 23)

In [32]:
# Join the two tables on the shared 'subject' key (default inner join, so only
# subjects present in both survive) and discard any row that still has a
# missing value in any column.
bioxgut = bio.merge(gut, on='subject').dropna()

# Persist the combined table without the index column.
bioxgut.to_csv('data/bio_x_gut.csv', index=False)