In [None]:
import warnings

import pandas as pd
import pandarallel
import numpy as np

# Keep the notebook output quiet: drop every Python warning and switch off
# NumPy's reporting for all four floating-point error classes.
warnings.simplefilter("ignore")
np.seterr(divide="ignore", over="ignore", under="ignore", invalid="ignore")

# Start pandarallel with a fixed pool of 14 workers.
pandarallel.pandarallel.initialize(nb_workers=14)

### Demographic profiling of the data

In [None]:
# Load the KOGES baseline table, index it by participant id (RID), turn the
# sentinel codes 66666 / 77777 / 99999 into NaN and remove 'B_' from the
# column names.
# NOTE(review): str.replace removes every 'B_' occurrence in a column name,
# not only a leading prefix - confirm no column carries 'B_' in the middle.
raw_data = (
    pd.read_csv('data/KOGES_BASE.csv')
    .set_index('RID')
    .replace([66666, 77777, 99999], np.nan)
    .rename(columns=lambda name: name.replace('B_', ''))
)

# Disease flag columns; they are recoded to booleans below.
disease_features = ['HTN', 'DM', 'THY', 'LIP',
                    'LCA', 'GCA', 'HCCCA', 'COLCA', 'PACA', 'UTCA', 'BRCA', 'THYCA', 'PROCA', 'GALLCA']
# Clinical measurement columns that must be present for a row to be kept.
clinical_features = ['BMI', 'PRT16_U', 'GLU16_U', 'BLOOD16_U', 'HBA1C', 'GLU0',
                     'BUN', 'ALBUMIN', 'CREATINE', 'AST', 'T_BIL',
                     'ALT', 'TCHL', 'R_GTP', 'HDL', 'LDL',
                     'TG', 'WBC_B','RBC_B', 'HB', 'HCT', 'PLAT']

# Keep only rows with no missing value in SEX, the disease flags and the
# clinical measurements. dropna(subset=...) filters rows positionally, so the
# result stays correct even if RID labels repeat (re-selecting by the index of
# a dropna() result would duplicate such rows).
required_columns = ['SEX'] + disease_features + clinical_features
raw_data = raw_data.dropna(subset=required_columns).copy()

# Recode the disease columns: 1 -> False, 2 -> True. Plain column assignment
# swaps in the recoded columns with their own dtype instead of writing
# booleans into the existing numeric columns through `.loc`.
raw_data[disease_features] = raw_data[disease_features].replace([1, 2], [False, True])

# Derived flag: True where BMI is strictly greater than 30.
raw_data['OBESITY'] = raw_data['BMI'] > 30

In [None]:
# Demographics of Data

# Flags counted when a cohort additionally requires several co-occurring conditions.
COMORBIDITY_FEATURES = ['OBESITY', 'HTN', 'DM', 'THY', 'LIP']


def select_cohort(data, condition=None, min_conditions=1):
    """Return a copy of the rows of ``data`` that belong to one cohort.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the condition flag columns (for example ``raw_data``).
    condition : str or None
        ``None`` keeps every row. ``'NORMAL'`` keeps the rows where every
        column listed in the notebook-level ``disease_features`` is False.
        Any other value is taken as the name of a flag column ('OBESITY',
        'HTN', 'DM', 'THY', 'LIP', 'BRCA', 'LCA', 'THYCA', 'PROCA', 'UTCA',
        'PACA', 'GCA', 'GALLCA', 'COLCA', 'HCCCA') and keeps the rows where
        that flag is True.
    min_conditions : int
        When greater than 1, a row must also have at least this many True
        flags among ``COMORBIDITY_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows.

    Raises
    ------
    KeyError
        If ``condition`` (or a required flag column) is not a column of ``data``.
    """
    if condition is None:
        return data.copy()

    # Explicit comparison with True/False (not `~` or truthiness): the flag
    # columns are not guaranteed to have a pure bool dtype.
    if condition == 'NORMAL':
        mask = (data.loc[:, disease_features] == False).all(axis=1)
    else:
        mask = data.loc[:, condition] == True

    if min_conditions > 1:
        mask = mask & (data.loc[:, COMORBIDITY_FEATURES].sum(axis=1) >= min_conditions)

    return data.loc[mask, :].copy()


def _count(mask):
    """Number of True entries in a boolean Series, as a plain int."""
    return int(mask.sum())


def _mean_std(values):
    """Format a Series as 'm=<mean>, std=<std>' with two decimals each."""
    return f"m={values.mean():.2f}, std={values.std():.2f}"


def print_demographics(data):
    """Print the demographic and clinical summary of ``data`` to stdout.

    Coded columns are reported as counts per code; continuous columns as mean
    and standard deviation (pandas' default ``Series.std``); urine dipstick
    columns as counts of values equal to 0 ("negative") and different from 0
    ("positive"). The label strings are kept exactly as in earlier runs so
    that previously saved outputs remain comparable.
    """
    # (section heading, column, [(label, code), ...])
    coded_sections = [
        ('Gender', 'SEX', [('Man', 1), ('WOMAN', 2)]),
        ('Drinking status', 'DRINK',
         [('Non-Drinker', 1), ('Ex-Drinker', 2), ('Current-Drinker', 3)]),
        ('Smoking status', 'SMOKE',
         [('Non-Smoker', 1), ('Ex-Smoker', 2), ('Current-Smoker', 3)]),
    ]
    # (section heading, [(label, column), ...])
    continuous_sections = [
        ('Anthropometric measurements', [
            ('Height', 'HEIGHT'), ('Weight', 'WEIGHT'), ('BMI', 'BMI'),
            ('SBP', 'SBP'), ('DBP', 'DBP'),
        ]),
        ('Clinical Examination', [
            ('HbA1C', 'HBA1C'), ('Fast Glucose', 'GLU0'), ('Albumin', 'ALBUMIN'),
            ('BUN', 'BUN'), ('Creatine', 'CREATINE'), ('Total Bilirubin', 'T_BIL'),
            ('AST', 'AST'), ('ALT', 'ALT'), ('r-GTP', 'R_GTP'),
            ('Total Cholesterol', 'TCHL'), ('HDL-Cholesterol', 'HDL'),
            ('LDL-Cholesterol', 'LDL'), ('Triglyceride', 'TG'),
            ('Blood WBC', 'WBC_B'), ('Blood RBC', 'RBC_B'),
            ('Blood Hemoglobin', 'HB'), ('Blood Hematocrit', 'HCT'),
            ('Blood Platelet', 'PLAT'),
        ]),
    ]
    # Printed last, under the 'Clinical Examination' heading.
    urine_tests = [
        ('Urine Protein', 'PRT16_U'),
        ('Urine Glucose', 'GLU16_U'),
        ('Urine Blood', 'BLOOD16_U'),
    ]

    print(f"N: {data.shape[0]}")
    age = data.loc[:, 'AGE']
    # The AGE line historically uses 'm = ' (with spaces); kept as is.
    print(f"AGE: m = {age.mean():.2f}, std={age.std():.2f}")

    for heading, column, levels in coded_sections:
        print(heading)
        for label, code in levels:
            print(f"\t{label}: {_count(data.loc[:, column] == code)}")

    print(f"Regular Exercise: {_count(data.loc[:, 'EXER'] == 2)}")
    print(f"Total Energy Intake: {_mean_std(data.loc[:, 'SS01'])}")

    for heading, columns in continuous_sections:
        print(heading)
        for label, column in columns:
            print(f"\t{label}: {_mean_std(data.loc[:, column])}")

    for label, column in urine_tests:
        result = data.loc[:, column]
        print(f"\t{label}: negative: {_count(result == 0)}, positive: {_count(result != 0)}")


# Cohort profiled in this run: OBESITY is True and at least two of
# COMORBIDITY_FEATURES are True. Change the arguments to profile another
# cohort (see the select_cohort docstring for the accepted values).
demo_data = select_cohort(raw_data, 'OBESITY', min_conditions=2)

print_demographics(demo_data)

N: 1241
AGE: m = 55.34, std=8.16
Gender
	Man: 407
	WOMAN: 834
Drinking status
	Non-Drinker: 651
	Ex-Drinker: 84
	Current-Drinker: 504
Smoking status
	Non-Smoker: 871
	Ex-Smoker: 215
	Current-Smoker: 150
Regular Exercise: 499
Total Energy Intake: m=1762.78, std=568.43
Anthropometric measurements
	Height: m=159.28, std=9.22
	Weight: m=81.15, std=10.08
	BMI: m=31.91, std=1.94
	SBP: m=133.18, std=15.36
	DBP: m=82.55, std=10.19
Clinical Examination
	HbA1C: m=6.30, std=1.07
	Fast Glucose: m=108.67, std=30.37
	Albumin: m=4.59, std=0.27
	BUN: m=15.20, std=4.44
	Creatine: m=0.84, std=0.35
	Total Bilirubin: m=0.68, std=0.26
	AST: m=29.10, std=17.60
	ALT: m=33.10, std=25.94
	r-GTP: m=45.82, std=46.17
	Total Cholesterol: m=198.33, std=40.25
	HDL-Cholesterol: m=47.79, std=11.17
	LDL-Cholesterol: m=115.00, std=37.69
	Triglyceride: m=177.69, std=113.78
	Blood WBC: m=6.75, std=1.82
	Blood RBC: m=4.64, std=0.45
	Blood Hemoglobin: m=14.14, std=1.48
	Blood Hematocrit: m=42.26, std=3.96
	Blood Platelet: m