In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
import sys
sys.path.append('../Utilities')
import utils
import udfs

Loading Dataset

In [3]:
# Location of the raw Excel workbook, relative to the notebook's working directory.
# NOTE: despite its name, `data_dir` holds a file path, not a directory.
data_dir = "../Data/raw_data/NTCMRC_all.xlsx"
# Read the workbook into the raw frame; the next cell works on a copy of `df`.
df = pd.read_excel(data_dir)

In [4]:
# Type coercion and first derived variables (FLI, ultrasound status, risk flags).

# Work on a copy so the raw frame `df` stays untouched.
df1 = df.copy()

# The export marks missing cells with the literal string '\N'; turn them into NaN.
df1 = df1.replace('\\N', np.nan)

# Ultrasound fatty-liver grade column (0: normal, 1: mild, 2: moderate, 3: severe).
fatty_liver_col = '脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)'

# Columns parsed as floating-point numbers.
columns_to_convert1 = [
    'BMI', 'Triglyceride_y', 'gamgt', 'waist_y', 'mst', 'egfrn', 'Estimated_GFR_x',
    'Alb_Cre_ratio', 'HOMA_IR', 'HS_CRP', 'LDL_C_direct', 'LDL_C_HDL_C',
    'Adiponectin', 'Leptin', 'Uric_Acid', 'Insulin', 'ALT_GPT',
]

# Columns parsed as nullable integers (Int64 keeps missing values as <NA>).
columns_to_convert2 = [fatty_liver_col, 'smoke', 'smoke_q', 'sex', 'w', 'coffee', 'betel']

# Anything that cannot be parsed as a number becomes NaN (errors='coerce').
for float_col in columns_to_convert1:
    df1[float_col] = pd.to_numeric(df1[float_col], errors='coerce')

for int_col in columns_to_convert2:
    parsed = pd.to_numeric(df1[int_col], errors='coerce')
    df1[int_col] = parsed.astype('Int64')

# FLI = exp(z) / (1 + exp(z)) * 100, with the linear predictor z computed once.
df2 = df1.copy()
fli_linear = (
    0.953 * np.log(df2['Triglyceride_y'])
    + 0.139 * df2['BMI']
    + 0.718 * np.log(df2['gamgt'])
    + 0.053 * df2['waist_y']
    - 15.745
)
fli_odds = np.exp(fli_linear)
df2['FLI'] = fli_odds / (1 + fli_odds) * 100

# FL_echo mirrors the ultrasound grade; fl_status is derived row by row in utils.
df2['FL_echo'] = df2[fatty_liver_col].replace('<NA>', np.nan)
df2['fl_status'] = df2.apply(utils.derive_fl_status, axis=1)

# Binary risk flags used for the MAFLD risk-factor count.
# A missing measurement compares False and therefore yields 0.
df2['homa_ir_check'] = (df2['HOMA_IR'] >= 2.5).astype('int64')
df2['hs_crp_check'] = (df2['HS_CRP'] > 2).astype('int64')

# Number of risk factors present per row.
risk_factor_cols = ['w', 'hyper', 'HDL', 'fg', 'trig', 'homa_ir_check', 'hs_crp_check']
df2['mst_total'] = df2[risk_factor_cols].sum(axis=1)

In [5]:
# Derive target vars
df3 = utils.derive_MAFLD_with_multi_label(utils.derive_MAFLD(df2))
df4 = utils.derive_CKD(df3)

In [6]:
# Add a '<name>_num' companion column for each listed column, produced by
# utils.extract_numeric_value applied value by value.
columns_to_extract = ['HBsAg_x', 'Anti_HCV_x']
df5 = df4.copy()
for source_col in columns_to_extract:
    df5[f'{source_col}_num'] = df5[source_col].apply(utils.extract_numeric_value)

# df6 is the frame the survey-processing cell modifies; df5 stays as a checkpoint.
df6 = df5.copy()

In [7]:
# Survey data processing: blank out answer codes that fall outside each question's
# codebook, derive parental-history flags, and bucket questionnaire scores.


def _keep_allowed(allowed, na_passthrough=False):
    """Build a per-value cleaner for ``Series.apply``.

    The returned callable gives a value back unchanged when it is one of
    ``allowed`` and returns ``np.nan`` otherwise.

    Parameters
    ----------
    allowed : list
        Valid answer codes for the column.
    na_passthrough : bool, default False
        When True, missing values are detected with ``pd.isna`` first and returned
        as they are. This is needed for columns that may hold ``pd.NA``, because
        ``pd.NA in [...]`` raises instead of evaluating to False.

    Returns
    -------
    callable
        Function of one value suitable for ``Series.apply``.
    """
    if na_passthrough:
        return lambda x: x if pd.isna(x) or x in allowed else np.nan
    return lambda x: x if x in allowed else np.nan


def _bucket_children(x):
    """Keep 0-3 as-is, collapse 4-8 into the label 'above 3', blank anything else."""
    if x in [0, 1, 2, 3]:
        return x
    return 'above 3' if x in [4, 5, 6, 7, 8] else np.nan


_binary_codes = [1, 0]            # 0: no, 1: yes
_psqi_codes = [0, 1, 2, 3]        # PSQI component scores
_relative_conditions = ['cardio', 'hyper', 'dys', 'ap', 'ami', 'lipid', 'hf', 'dm', 'Thyroid']

# --- Basic information ---
# diet: 1 omnivore, 2 fully vegetarian, 3 vegetarian in the morning, 4 other
df6['diet'] = df6['diet'].apply(_keep_allowed([1.0, 2.0, 3.0, 4.0]))
# job_c: 0 office work, 1 physical labour
df6['job_c'] = df6['job_c'].apply(_keep_allowed(_binary_codes))
# mstatus: 1 unmarried, 2 married, 3 divorced or separated, 4 widowed
df6['mstatus'] = df6['mstatus'].apply(_keep_allowed([1, 2, 3, 4]))
# children: number of children
df6['children'] = df6['children'].apply(_bucket_children)
# fam_self: lives alone; care_self: cares for self; eco_self: own income; insurance: insured
for basic_col in ['fam_self', 'care_self', 'eco_self', 'insurance']:
    df6[basic_col] = df6[basic_col].apply(_keep_allowed(_binary_codes))

# --- Lifestyle ---
# These columns were cast to nullable Int64 earlier, so missing values are passed through.
lifestyle_codes = {
    'smoke': [0, 1],       # ever smoked: 0 no, 1 yes
    'smoke_q': [0, 1, 2],  # ever quit: 0 still smoking, 1 quit for N years, 2 quit for N years but smoking again
    'coffee': [1, 0],      # coffee habit: 0 no, 1 yes
    'betel': [1, 0],       # ever chewed betel nut: 0 no, 1 yes
}
for lifestyle_col, codes in lifestyle_codes.items():
    df6[lifestyle_col] = df6[lifestyle_col].apply(_keep_allowed(codes, na_passthrough=True))

# --- Self-reported conditions (0: no, 1: yes) ---
self_reported_cols = [
    'hypertension',    # high blood pressure
    'Dysrhythmia',     # arrhythmia
    'ap',              # angina pectoris
    'ami',             # myocardial infarction
    'Hyperlipidemia',  # hyperlipidaemia
    'HF',              # heart failure
    'endocrine',       # endocrine disease
    'Thyroid',         # thyroid disease
    'gastritis',       # gastritis
    'hepatitis_o',     # other liver disease, cirrhosis, etc.
    'FLD',             # fatty liver
    'fibrosis',        # liver fibrosis
    'Cirrhosis',       # liver cirrhosis
    'Polyposis',       # colorectal polyps
    'ibs',             # irritable bowel syndrome
]
for disease_col in self_reported_cols:
    df6[disease_col] = df6[disease_col].apply(_keep_allowed(_binary_codes))

# --- Parents' history: one row-wise rule per condition, defined in udfs ---
parent_history_rules = {
    'fm_cardio': udfs.determine_fm_cardio,
    'fm_hyper': udfs.determine_fm_hyper,
    'fm_dys': udfs.determine_fm_dys,
    'fm_ap': udfs.determine_fm_ap,
    'fm_ami': udfs.determine_fm_ami,
    'fm_lipid': udfs.determine_fm_lipid,
    'fm_hf': udfs.determine_fm_hf,
    'fm_dm': udfs.determine_fm_dm,
    'fm_Thyroid': udfs.determine_fm_Thyroid,
}
for parent_col, rule in parent_history_rules.items():
    df6[parent_col] = df6.apply(rule, axis=1)

# --- Siblings' (bs_) and children's (chi_) history (0: no, 1: yes) ---
# Conditions: cardiovascular disease, hypertension, arrhythmia, angina, myocardial
# infarction, hyperlipidaemia, heart failure, diabetes, thyroid disease.
for relative_prefix in ['bs_', 'chi_']:
    for condition in _relative_conditions:
        relative_col = relative_prefix + condition
        df6[relative_col] = df6[relative_col].apply(_keep_allowed(_binary_codes))

# --- Questionnaire scoring ---
# RAND-36 subscale scores, bucketed in place by udfs.categorize_rand36_score.
for subscale in ['PF', 'RP', 'RE', 'EF', 'EWB', 'SF', 'BP', 'GH']:
    rand36_col = 'RAND36_' + subscale
    df6[rand36_col] = df6[rand36_col].apply(udfs.categorize_rand36_score)

# Anxiety / depression levels are written to new columns; HAD is bucketed in place.
df6['Anxiety_Level'] = df6['Anxiety'].apply(udfs.categorize_anxiety_or_depression)
df6['Depression_Level'] = df6['Depresscore'].apply(udfs.categorize_anxiety_or_depression)
df6['HAD'] = df6['HAD'].apply(udfs.categorize_HAD_total)

# PSQI component scores: only 0-3 are valid; missing values are passed through.
psqi_cols = [
    'c1_Sleep_Quality', 'c2_Latency', 'c3_Duration', 'c4_Efficiency',
    'c5_Disturbance', 'c6_Use_Medicatin', 'c7_Daytime_dysfunction',
]
for psqi_col in psqi_cols:
    df6[psqi_col] = df6[psqi_col].apply(_keep_allowed(_psqi_codes, na_passthrough=True))

df6['MNA'] = df6['MNA'].apply(udfs.categorize_mna)
df6['AUDIT'] = df6['AUDIT'].apply(udfs.categorize_AUDIT)

In [10]:
# Candidate survey features, grouped the same way as the survey-processing cell.
_relative_suffixes = ['cardio', 'hyper', 'dys', 'ap', 'ami', 'lipid', 'hf', 'dm', 'Thyroid']
survey_features = (
    ['diet', 'job_c', 'mstatus', 'children', 'fam_self', 'care_self', 'eco_self', 'insurance']
    + ['smoke', 'smoke_q', 'coffee', 'betel']
    + ['hypertension', 'Dysrhythmia', 'ap', 'ami', 'Hyperlipidemia', 'HF', 'endocrine', 'Thyroid',
       'gastritis', 'hepatitis_o', 'FLD', 'fibrosis', 'Cirrhosis', 'Polyposis', 'ibs']
    + ['fm_' + suffix for suffix in _relative_suffixes]
    + ['bs_' + suffix for suffix in _relative_suffixes]
    + ['chi_' + suffix for suffix in _relative_suffixes]
    + ['RAND36_' + scale for scale in ['PF', 'RP', 'RE', 'EF', 'EWB', 'SF', 'BP', 'GH']]
    + ['Anxiety_Level', 'Depression_Level', 'HAD']
    + ['c1_Sleep_Quality', 'c2_Latency', 'c3_Duration', 'c4_Efficiency',
       'c5_Disturbance', 'c6_Use_Medicatin', 'c7_Daytime_dysfunction']
    + ['MNA', 'AUDIT']
)

print("number of survey_features",len(survey_features))

# Percentage of missing values per survey feature, largest first.
df6_survey = df6[survey_features]
survey_null_ratios = df6_survey.isna().mean().mul(100).sort_values(ascending=False)

# Keep only the features with at most this percentage of missing values.
max_null_pct = 20
filtered_survey_data = survey_null_ratios.loc[lambda pct: pct <= max_null_pct]

# Report how many features survive the filter, and which ones.
remaining_features_count = len(filtered_survey_data)
print("Number of remaining features:", remaining_features_count)

survey_features_filtered = filtered_survey_data.index.tolist()
print(survey_features_filtered)

number of survey_features 74
Number of remaining features: 23
['AUDIT', 'c5_Disturbance', 'c3_Duration', 'c2_Latency', 'c7_Daytime_dysfunction', 'c6_Use_Medicatin', 'c1_Sleep_Quality', 'betel', 'coffee', 'smoke', 'insurance', 'eco_self', 'care_self', 'HF', 'fam_self', 'RAND36_SF', 'RAND36_PF', 'RAND36_EWB', 'RAND36_BP', 'RAND36_EF', 'RAND36_GH', 'RAND36_RP', 'RAND36_RE']


In [11]:
# Remove patients with ambiguous visit records, then build the windowed samples.

# Number of distinct patients before filtering.
unique_CMRC_id_count = df6['CMRC_id'].nunique()
print("unique CMRC_id number:", unique_CMRC_id_count)

# Latest visit year per patient, and how many patients have each year as their latest.
# This now reads df6, the same frame as the rest of the cell, instead of the
# earlier checkpoint df5.
max_year_come_by_patient = df6.groupby(['CMRC_id'])['year_come'].max().reset_index()
count_by_year_come = max_year_come_by_patient.groupby('year_come')['CMRC_id'].count().reset_index()

# A patient with two or more rows in the same year_come is dropped entirely.
patients_to_remove = (
    df6.groupby(['CMRC_id', 'year_come'])
    .filter(lambda visits: len(visits) >= 2)['CMRC_id']
    .unique()
)
df6_filtered = df6[~df6['CMRC_id'].isin(patients_to_remove)]

# Apply the sliding window with one input step and one target step. The result
# carries 't1_'- and 't2_'-prefixed columns, as the following cells read them.
df7 = utils.sliding_window_multi_label_data(df6_filtered, input_window_size=1, target_window_size=1)

unique CMRC_id number: 7986


In [27]:
# Select the columns used by the conventional machine-learning models.
other_columns = [
    "sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT",
    "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C",
    "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid",
    "HOMA_IR", "Adiponectin",
    "Leptin", "TotalVitaminD",
    "BMI", "DM_determine", "w", "hyper",
    "fg", "HDL", "trig", "sarcf", "ms2",
    "HBV_", "HCV_", "MAFLD", "CKD",
    'HBsAg_x_num', 'Anti_HCV_x_num',
    'MAFLD_0', 'MAFLD_Obesity', 'MAFLD_MD', 'MAFLD_Diabetes',
    'year_come',
]
columns = other_columns + survey_features_filtered

# Input-window columns carry the 't1_' prefix in df7.
prefixes = ["t1_"]
renamed_columns = utils.add_prefix(columns, prefixes)

df8 = df7[renamed_columns].copy()

# Carry over the four t2 target labels and the patient id, appended in this order.
carried_columns = ['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_MD', 't2_MAFLD_Diabetes', 't1_CMRC_id']
for carried in carried_columns:
    df8.loc[:, carried] = df7[carried]

In [28]:
# Keep samples whose t1 MAFLD_0 flag is 1 and whose t2 MAFLD_0 label is not -1.
# NOTE(review): -1 presumably marks an unavailable t2 label - confirm in utils.
t1_flag_set = df8['t1_MAFLD_0'] == 1
t2_label_usable = df8['t2_MAFLD_0'] != -1
filtered_df1 = df8[t1_flag_set & t2_label_usable]

# Reduce to the record(s) chosen by utils.select_record for each patient.
# The displayed result has a 'null_ratio' column that df8 lacks, so the helper presumably adds it.
per_patient = filtered_df1.groupby('t1_CMRC_id')
unique_records = per_patient.apply(utils.select_record).reset_index(drop=True)
unique_records.head()

Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t1_RAND36_EF,t1_RAND36_GH,t1_RAND36_RP,t1_RAND36_RE,t2_MAFLD_0,t2_MAFLD_Obesity,t2_MAFLD_MD,t2_MAFLD_Diabetes,t1_CMRC_id,null_ratio
0,0,62,85.0,103.0,192.0,60.7,18.0,22.0,17.0,4.56,...,1.0,1.0,0.0,1.0,1,0,0,0,A02013O51207P5052,0.042254
1,0,62,64.0,90.0,72.0,80.0,22.0,17.0,9.0,3.9,...,3.0,4.0,4.0,4.0,1,0,0,0,A02014I70719D9059,0.028169
2,0,35,71.0,89.0,54.0,71.3,38.0,33.0,52.0,5.47,...,1.0,3.0,4.0,4.0,1,0,0,0,A02014P40118F7096,0.098592
3,1,74,91.0,87.0,122.0,51.2,20.0,16.0,17.0,11.55,...,4.0,4.0,4.0,4.0,1,0,0,0,A02014V30517K5009,0.042254
4,0,61,88.0,107.0,207.0,43.0,21.0,33.0,18.0,10.2,...,4.0,4.0,4.0,4.0,1,0,0,0,A02014Z40719Z0013,0.014085


In [29]:
# Columns removed before modelling: the t1 MAFLD labels, the patient identifier
# and the 'null_ratio' helper column.
cols_to_drop_only_MAFLD = [
    't1_MAFLD', 't1_MAFLD_0',
    't1_MAFLD_Obesity', 't1_MAFLD_MD', 't1_MAFLD_Diabetes',
    't1_CMRC_id', 'null_ratio',
]

df9_processed = unique_records.drop(columns=cols_to_drop_only_MAFLD)

In [30]:
# Modelling preparation: scale and impute the numeric features, one-hot encode
# the categorical ones, and assemble the design matrix and target frame.

target_columns = ['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_Diabetes', 't2_MAFLD_MD']

# Split the feature columns into categorical and numeric groups.
features = df9_processed.columns.drop(target_columns)
other_categorical_features = ['t1_sex', 't1_w', 't1_DM_determine', 't1_CKD']
prefixes = ["t1_"]

# All survey features are treated as categorical variables.
survey_categorical_features = utils.add_prefix(survey_features_filtered, prefixes)
categorical_features = other_categorical_features + survey_categorical_features

# Continuous variables: standardise, then fill the remaining gaps with the median.
# NOTE(review): the scaler and imputer are fitted on all rows here, before the
# train/test split done in a later cell, so test-set statistics leak into the
# training features. Fitting them on the training split only would avoid that.
numeric_features = features.drop(categorical_features)
X_numeric = df9_processed[numeric_features]
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

# Discrete variables: one-hot encode the string form of every category.
# Missing values are masked before encoding, so a missing answer becomes an
# all-zero row for that variable (get_dummies ignores NaN by default). This
# replaces dropping every dummy column whose name contains 'nan', which would
# also delete a legitimate column with "nan" in its name and would miss missing
# markers that stringify differently (pd.NA -> '<NA>', None -> 'None').
X_categorical = df9_processed[categorical_features]
X_categorical_str = X_categorical.astype(str).where(X_categorical.notna())
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# Target variables as integers.
y = df9_processed[target_columns].astype(int)

# Put both parts on a fresh 0..n-1 index and concatenate them column-wise.
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_categorical_encoded = X_categorical_encoded.reset_index(drop=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

# Features and targets must describe the same number of samples.
assert len(X_combined) == len(y), "feature matrix and targets differ in length"

print('X_combined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_combined shape:  (4408, 130)
y shape:  (4408, 4)


In [31]:
# Collapse the multi-label target into one label-powerset string per sample,
# so the task can be treated as a multiclass problem.

def _active_labels(row):
    """Return the names of the columns equal to 1 in this row, joined by commas."""
    return ','.join(row.index[row == 1])


y_lp = y.apply(_active_labels, axis=1)

# Show how many distinct label combinations occur and how often.
print("Number of unique classes in y_train_lp:", y_lp.nunique())
print(y_lp.value_counts())

Number of unique classes in y_train_lp: 5
t2_MAFLD_0                            3667
t2_MAFLD_Obesity                       537
t2_MAFLD_Obesity,t2_MAFLD_Diabetes     157
t2_MAFLD_MD                             34
t2_MAFLD_Diabetes,t2_MAFLD_MD           13
dtype: int64


In [20]:
# Build the label-powerset target frame with the project helper.
y_new = utils.process_label_powerset(y, y_lp)

# Features and targets must share the same index before they are split together.
assert X_combined.index.equals(y_new.index)

In [21]:
# Multiclass sanity check: every row of y_new should sum to exactly 1.
row_sums = y_new.sum(axis=1)
all_single_label = row_sums.eq(1).all()

print(f"All data points belong to exactly one class: {all_single_label}")

All data points belong to exactly one class: True


In [23]:
# Turn the one-hot target frame into a 1-d series of integer class ids.
y_new = y_new.reset_index(drop=True)

# Class id = position of the label's column in y_new.
class_mapping = dict(zip(y_new.columns, range(len(y_new.columns))))

# For each row take the column holding the maximum, then translate it to its id.
winning_label = y_new.idxmax(axis=1)
y_new_1d = winning_label.map(class_mapping).astype('int64')

In [24]:
# Class distribution of the integer-encoded target (number of samples per class id).
y_new_1d.value_counts()

0    3667
2     537
3     157
1      34
4      13
dtype: int64

In [32]:
# Guard before splitting: X_combined and y_new_1d must have identical indexes,
# since train_test_split pairs the rows of both positionally.
assert(X_combined.index.equals(y_new_1d.index))

In [33]:
# Stratified train / validation / test split.
# 20% is held out for test, then 20% of the remainder for validation, which
# gives roughly 64% / 16% / 20% of the samples.
split_options = dict(test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_new_1d, stratify=y_new_1d, **split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

# Report the size of every part.
split_parts = [
    ("X_train", X_train), ("y_train", y_train),
    ("X_val", X_val), ("y_val", y_val),
    ("X_test", X_test), ("y_test", y_test),
]
for part_name, part in split_parts:
    print(f"Size of {part_name}:", part.shape)

Size of X_train: (2820, 130)
Size of y_train: (2820,)
Size of X_val: (706, 130)
Size of y_val: (706,)
Size of X_test: (882, 130)
Size of y_test: (882,)


In [36]:
# Save the train / validation / test sets for later use
# (contrastive learning, benchmarks, etc.).
import os

DATA_DIR = "../Data/train_test_data/"
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(DATA_DIR, exist_ok=True)

feature_columns = X_train.columns.tolist()

# Each saved frame holds the feature columns followed by a 'target' column.
# assign() returns a new frame and aligns the target on the index, so the
# X_* frames themselves are left unchanged.
train_df = X_train[feature_columns].assign(target=y_train)
val_df = X_val[feature_columns].assign(target=y_val)
test_df = X_test[feature_columns].assign(target=y_test)

for file_name, split_df in [("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df)]:
    split_df.to_csv(os.path.join(DATA_DIR, file_name), index=False)