In [1]:
#Imports 
import pandas as pd
import numpy as np

In [2]:
#read dataset as df
df = pd.read_excel("../data/NTCMRC_all.xlsx")
df.head(5)

Unnamed: 0,sid,P_Number,birth,data_exam,CMRC_id,sex,age,data_year,year_come,height_y,...,"腎結節 ( yes=1, no=0)","左腎 (無:0, 單一:1,多發:2).3",公分.23,"右腎 (無:0, 單一:1,多發:2).3",公分.24,脾臟 (0:正常;1:脾臟腫大；2脾臟切除),spleen long axis (cm),spleen short axis (cm),脾面積(大於20CM=脾腫大),other
0,M11A011,M11A011,391127,20221122,R8M080Y5005M5,0,71,2022,2,145.2,...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,20221119122,P111782,410428,20221119,E42018P21120K8045,0,70,2022,3,153.5,...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,20221119121,P111781,401120,20221119,H62016G10710R5042,0,70,2022,5,147.5,...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,20221119120,P111780,430325,20221119,U32016A80710B9013,0,68,2022,4,148.0,...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,20221119119,P111779,390220,20221119,A72018S91120Z2060,1,72,2022,3,157.0,...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [4]:
import sys
sys.path.append("../")
from utilities import utils

In [5]:
import os
print(os.getcwd())

/Users/sizhechen/Desktop/git/fatty-liver-project/src/features


In [7]:
# Create a copy of df as df1
df1 = df.copy()

# Replace '\\N' with NaN
df1 = df1.replace('\\N', np.nan)

# Specify the columns to be converted to FLOAT
columns_to_convert1 = ['BMI', 'Triglyceride_y', 'gamgt', 'waist_y', 'mst', 'egfrn', 'Estimated_GFR_x', 'Alb_Cre_ratio', 'HOMA_IR', 'HS_CRP', \
                       'LDL_C_direct', 'LDL_C_HDL_C', 'Adiponectin', 'Leptin', 'Uric_Acid','Insulin', 'ALT_GPT']

# Specify the columns to be converted to INT
columns_to_convert2 = ['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)', 'smoke', 'smoke_q', \
                       'sex', 'w', 'coffee', 'betel']

# Convert the specified columns to float and fill missing/unconvertible values with NaN
for column in columns_to_convert1:
    df1[column] = pd.to_numeric(df1[column], errors='coerce')

# Convert the specified columns to int and fill missing/unconvertible values with NaN
for column in columns_to_convert2:
    df1[column] = pd.to_numeric(df1[column], errors='coerce').astype(pd.Int64Dtype())

# Calculate FLI using the formula and defined as df2
df2 = df1.copy()
df2['FLI'] = (np.exp(0.953 * np.log(df2['Triglyceride_y']) + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) \
     + 0.053 * df2['waist_y'] - 15.745)) / (1 + np.exp(0.953 * np.log(df2['Triglyceride_y']) \
    + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) + 0.053 * df2['waist_y'] - 15.745)) * 100

# Derive FL_echo based on ultrasound results column
df2['FL_echo'] = df2['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)']
df2['FL_echo'] = df2['FL_echo'].replace('<NA>', np.nan)
df2['fl_status'] = df2.apply(utils.derive_fl_status, axis=1)

#Derive homa_ir_check, hs_crp_check, and mst_total to determine MAFLD risk factors
df2['homa_ir_check'] = df2['HOMA_IR'].apply(lambda x: 1 if x >= 2.5 else 0)
df2['hs_crp_check'] = df2['HS_CRP'].apply(lambda x: 1 if x > 2 else 0)
df2['mst_total'] = df2[['w', 'hyper', 'HDL', 'fg', 'trig', 'homa_ir_check', 'hs_crp_check']].sum(axis=1)


df3 = utils.derive_CKD(utils.derive_MAFLD(df2))

# Derive FL_group_list, 对CMRC_id分组并计算每个病人的FL_Check的唯一值
grouped = df2.groupby('CMRC_id')['fl_status'].unique()
df2['fl_group_list'] = df2.groupby('CMRC_id')['fl_status'].transform(lambda x: [x.unique().tolist()] * len(x))

#assign patient valid value
df4 = utils.assign_patient_fl_validity(df3)

# Deal with values with both string and num values: HBsAg_x, Anti_HCV_x, (values eg. 陰性    0.351)
# extract numeric values for these cols and rename that as  *_num
columns_to_extract = ['HBsAg_x', 'Anti_HCV_x']
df5 = df4.copy()
for column in columns_to_extract:
    new_column_name = column + '_num'
    df5[new_column_name] = df5[column].apply(utils.extract_numeric_value)

# Sliding window implementation, ie. use previous 2 years record to predict the 3 year MAFLD status
df6 = utils.sliding_window_data(df5, input_window_size=2, target_window_size=1)

# Filter available data that can be applied to models
df7 = df6[(df6['t3_MAFLD'] != -1)]

# Drop ID relevant cols in the dataset
columns_to_drop = ['CMRC_id', 't1_CMRC_id', 't2_CMRC_id', 't1_sid', 't2_sid', 't1_P_Number','t2_P_Number']
df8 = df7.drop(columns=columns_to_drop)

#Select key columns for conventional machine learning models
columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", "HBsAg_x", "Anti_HCV_x", "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", "smoke", "smoke_q", "coffee", "betel", "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD", \
           'HBsAg_x_num', 'Anti_HCV_x_num']
prefixes = ["t1_", "t2_"]
renamed_columns = utils.add_prefix(columns, prefixes)

df9 = df8[renamed_columns]
df9['t3_MAFLD'] = df8['t3_MAFLD']

# drop these cols as those been derived for numeric cols, remain alias *_num,
cols_to_drop_only_MAFLD = ['t1_HBsAg_x', 't2_HBsAg_x', 't1_Anti_HCV_x', 't2_Anti_HCV_x', 't1_MAFLD', 't2_MAFLD']
cols_to_drop_fli_related = ['t1_HBsAg_x', 't2_HBsAg_x', 't1_Anti_HCV_x', 't2_Anti_HCV_x', 't1_MAFLD', 't2_MAFLD', \
                            't1_Triglyceride_y', 't1_BMI', 't1_gamgt', 't1_waist_y', 't1_gamgt', 't1_w', \
                            't2_Triglyceride_y', 't2_BMI', 't2_gamgt', 't2_waist_y', 't2_gamgt', 't2_w']
df9_a = df9.drop(cols_to_drop_only_MAFLD, axis=1)

#FLI related cols: Triglyceride_y, BMI, gamgt, waist_y, gamgt
df9_b = df9.drop(cols_to_drop_fli_related, axis=1)

df9_a.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['t3_MAFLD'] = df8['t3_MAFLD']


Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t2_sarcf,t2_ms2,t2_MNA,t2_AUDIT,t2_HBV_,t2_HCV_,t2_CKD,t2_HBsAg_x_num,t2_Anti_HCV_x_num,t3_MAFLD
0,0,59,82.0,101.0,264.0,56.4,21.0,19.0,15.0,9.18,...,,1,,1.0,,,1,,,0
1,0,60,83.0,104.0,255.0,55.8,15.0,18.0,14.0,5.86,...,,1,,0.0,0.0,0.0,2,0.48,0.03,0
2,0,62,85.0,103.0,192.0,60.7,18.0,22.0,17.0,4.56,...,,1,,0.0,,,2,,,1
3,0,63,84.0,111.0,164.0,83.0,27.0,30.0,16.0,6.3,...,,1,,0.0,0.0,0.0,2,0.33,0.036,0
4,0,58,61.0,99.0,75.0,60.4,23.0,17.0,10.0,3.75,...,,0,,1.0,,,2,,,0


In [8]:
# Modeling - baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, precision_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [9]:
# start modeling preparation
# split categorical and numerical variables,

df9_a.drop('t2_sex', axis=1, inplace=True)
features = df9_a.columns.drop(['t3_MAFLD'])

categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD', \
                        't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df9_a.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df9_a[categorical_features]
X_numeric = df9_a[numeric_features]
y = df9_a['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 114)
y shape:  (9658,)


In [11]:
# train test split, seed=2023
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Baseline Logistic model(all important factors) AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Baseline Random Forest(all important factors) AUC score: ", auc_rf)

Baseline Logistic model(all important factors) AUC score:  0.8495972230855735
Baseline Random Forest(all important factors) AUC score:  0.8443908617250324


Analysis 1

In [12]:
#Analysis 1: The impact of interval years
# Start with df9_a, dataset contains BMI, etc variables
# Check t1->t3
def select_columns(df, prefix, additional_column=None):
    selected_columns = [col for col in df.columns if col.startswith(prefix)]
    if additional_column is not None:
        selected_columns.append(additional_column)
    return df[selected_columns]

df10_a_t1 = select_columns(df9_a, 't1_', 't3_MAFLD')
df10_a_t1.head(1)

Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t1_sarcf,t1_ms2,t1_MNA,t1_AUDIT,t1_HBV_,t1_HCV_,t1_CKD,t1_HBsAg_x_num,t1_Anti_HCV_x_num,t3_MAFLD
0,0,59,82.0,101.0,264.0,56.4,21.0,19.0,15.0,9.18,...,,1,,,0.0,0.0,2,0.44,0.04,0


In [13]:
# start modeling preparation
# split categorical and numerical variables,
features = df10_a_t1.columns.drop(['t3_MAFLD'])

categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD']
                        # 't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df10_a_t1.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df10_a_t1[categorical_features]
X_numeric = df10_a_t1[numeric_features]
y = df10_a_t1['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 58)
y shape:  (9658,)


In [14]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)
print("Start train test split with Random Seed = 2023")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors) for t1 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 2023
Logistic model(all important factors) for t1 predict t3 AUC score:  0.8412372357438729
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8345329991692992


In [15]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=713)
print("Start train test split with Random Seed = 713")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors) for t1 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 713
Logistic model(all important factors) for t1 predict t3 AUC score:  0.8411811108607987
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8286175196324223


In [16]:
# Check t2 ->t3
df10_a_t2 = select_columns(df9_a, 't2_', 't3_MAFLD')
df10_a_t2.head(1)

Unnamed: 0,t2_age,t2_waist_y,t2_Glucose_AC_y,t2_Triglyceride_y,t2_HDL_C_y,t2_AST_GOT,t2_ALT_GPT,t2_gamgt,t2_Insulin,t2_T_Cholesterol,...,t2_sarcf,t2_ms2,t2_MNA,t2_AUDIT,t2_HBV_,t2_HCV_,t2_CKD,t2_HBsAg_x_num,t2_Anti_HCV_x_num,t3_MAFLD
0,60,83.0,104.0,255.0,55.8,15.0,18.0,14.0,5.86,312.0,...,,1,,1.0,,,1,,,0


In [17]:
# start modeling preparation for t2
# split categorical and numerical variables,
features = df10_a_t2.columns.drop(['t3_MAFLD'])

categorical_features = ['t2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df10_a_t2.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df10_a_t2[categorical_features]
X_numeric = df10_a_t2[numeric_features]
y = df10_a_t2['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 56)
y shape:  (9658,)


In [18]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)
print("Start train test split with Random Seed = 2023")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)

print("Logistic model(all important factors) for t2 predict t3 AUC score: ",auc_log)
print("Random Forest(all important factors) for t2 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 2023
Logistic model(all important factors) for t2 predict t3 AUC score:  0.8437356958391491
Random Forest(all important factors) for t2 predict t3 AUC score:  0.8342264309811876


In [39]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=713)
print("Start train test split with Random Seed = 713")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors for t2 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 713
Logistic model(all important factors for t2 predict t3 AUC score:  0.8475780823862711
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8398109521425979


Analysis 2

In [135]:
# Analysis 2: the impact of survey data
# Feature selection - added survey data then modeling

columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", \
        #   "HBsAg_x", "Anti_HCV_x", \
          "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", \
          #  "smoke", "smoke_q", "coffee", "betel",
           "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD", \
           'HBsAg_x_num', 'Anti_HCV_x_num']

survey_data =['smoke', 's_smoke', 'smoke_t', 'smoke_q', 'smoke_qt1', 'smoke_qt2', 'smoke_f', 'smoke_n', 'smoke_second', \
           'tea', 'tea_c', 'tea_1', 'tea_t', 'tea_q', 'tea_f', 'tea_v', 'coffee', 'coffee_c', 'coffee_t', 'coffee_q', 'coffee_f', 'coffee_v', \
           'betel', 's_betel', 'betel_clt1', 'betel_clt2', 'betel_clt3', 'betel_clt4', 'betel_clt5', 'betel_clt6', 'betel_o', 'betel_t', 'betel_q', 'betel_f', 'betel_n', \
           'activity_t', 'carryh', 'sport', 'sport_1', 'sport_d', 'sport_t', 'cardio', 'hypertension', 'Dysrhythmia', 'ap', 'ami', 'Hyperlipidemia', 'HF', 'f_cardio', \
           'f_hyper', 'f_dys', 'f_ap', 'f_ami', 'f_lipid', 'f_hf', 'm_cardio', 'm_hyper', 'm_dys', 'm_ap', 'm_ami', 'm_lipid', 'm_hf', 'bs_cardio', 'bs_hyper', 'bs_hyper_1', \
           'bs_dys', 'bs_dys_1', 'bs_ap', 'bs_ap1', 'bs_ami', 'bs_ami_1', 'bs_lipid', 'bs_lipid_1', 'bs_hf', 'bs_hf_1', \
           'chi_cardio', 'chi_hyper', 'chi_dys', 'chi_ap', 'chi_ami', 'chi_lipid', 'chi_hf', \
           'endocrine', 'diabetes', 'Thyroid', 'f_dm', 'f_Thyroid', 'm_dm', 'm_Thyroid', 'bs_dm', 'bs_dm_1', 'bs_Thyroid', 'bs_Thyroid_1', \
           'chi_dm', 'chi_Thyroid', 'pepticulcer', 'gastritis', 'hbv', 'hcv', 'hepatitis_o', 'FLD', 'fibrosis', 'Cirrhosis', 'Polyposis', 'ibs', \
           'f_pud', 'f_hbv', 'f_hcv', 'f_liver', 'f_fld', 'f_f', 'f_lc', 'f_polyp', 'f_ibs', 'm_pud', 'm_hbv', 'm_hcv', 'm_liver', 'm_fld', 'm_f', 'm_lc', 'm_polyp', 'm_ibs', \
           'bs_pud', 'bs_pud_1', 'bs_hbv', 'bs_hbv_1', 'bs_hcv', 'bs_hcv_1', 'bs_liver', 'bs_fld', 'bs_fld_1', 'bs_f', 'bs_f_1', 'bs_lc', 'bs_lc_1', \
           'bs_polyp', 'bs_polyp_1', 'bs_ibs', 'bs_ibs_1', 'chi_pud', 'chi_hbv', 'chi_hcv', 'chi_liver', 'chi_fld', 'chi_f', 'chi_lc', 'chi_polyp', 'chi_ibs', \
           'respiratory', 'tb', 'Asthma', 'apnea', 'copd', 'f_resp', 'f_Asthma', 'f_apnea', 'f_copd', 'm_resp', 'm_Asthma', 'm_apnea', 'm_copd', \
           'bs_resp', 'bs_Asthma', 'bs_Asthma_1', 'bs_apnea', 'bs_apnea_1', 'bs_copd', 'bs_copd_1', 'chi_resp', 'chi_Asthma', 'chi_apnea', 'chi_copd', \
           'anemia', 'Hemophilia', 'f_anemia', 'f_h', 'm_anemia', 'm_h', 'bs_anemia', 'bs_anemia_1', 'bs_h', 'bs_h_1', 'chi_anemia', 'chi_h', \
           'stroke', 'brainbleed', 'head_injury', 'epilepsy', 'Parkinsons', 'dementia', 'Huntingtons', 'f_stroke', 'f_hi', 'f_brainbleed', 'f_epilep', 'f_pd', 'f_demen', 'f_hd', \
           'm_stroke', 'm_hi', 'm_brainbleed', 'm_epilep', 'm_pd', 'm_demen', 'm_hd', 'bs_stroke', 'bs_stroke_1', 'bs_hi', 'bs_hi_1', 'bs_brainbleed', 'bs_brainbleed_1', 'bs_epilep', \
           'bs_epilep_1', 'bs_pd', 'bs_pd_1', 'bs_demen', 'bs_demen_1', 'bs_hd', 'bs_hd_1', 'chi_stroke', 'chi_hi', 'chi_brainbleed', 'chi_epilep', 'chi_pd', 'chi_demen', 'chi_hd', \
           'insomnia', 'Depression', 'mdd_dx', 'mental', 'f_mdd', 'f_mdd_dx', 'f_mental', 'm_mdd', 'm_mdd_dx', 'm_mental', 'bs_mdd', 'bs_mdd_1', 'bs_mdd_dx', 'bs_mental', 'chi_mdd', \
           'chi_mdd_dx', 'chi_mental', 're_1', 'ckd', 'dialysis', 'gout', 'stone', 'bpn', 'f_re', 'f_ckd', 'f_dialysis', 'f_stone', 'f_bpn', 'm_re', 'm_ckd', 'm_dialysis', 'm_stone', \
           'm_bpn', 'bs_re', 'bs_ckd', 'bs_ckd_1', 'bs_dialysis', 'bs_dialysis_1', 'bs_stone', 'bs_stone_1', 'bs_bpn', 'bs_bpn_1', 'chi_re', 'chi_ckd', 'chi_dialysis', 'chi_stone', 'chi_bpn', \
           'cataract', 'glau', 'retina', 'sicca', 'f_cata', 'f_glau', 'f_retina', 'f_sicca', 'm_cata', 'm_glau', 'm_retina', 'm_sicca', 'bs_cata', 'bs_cata_1', 'bs_glau', 'bs_glau_1', 'bs_retina', \
           'bs_retina_1', 'bs_sicca', 'bs_sicca_1', 'chi_cata', 'chi_glau', 'chi_retina', 'chi_sicca', 'cancer', 'cancer_1', 'cantime', 'cancer_mi', 'lungcan', 'breastcna', 'colican', 'gastrican', \
           'livercan', 'cancer_o', 'f_can', 'f_can_1', 'f_canmi', 'f_lung', 'f_breast', 'f_coli', 'f_gastric', 'f_liver_c', 'f_can_o', 'm_can', 'm_can_1', 'm_canmi', 'm_lung', 'm_breast', 'm_coli', \
           'm_gastric', 'm_liver_c', 'm_can_o', 'bs_can', 'bs_can_1', 'bs_canmi', 'bs_lung', 'bs_lung_1', 'bs_breast', 'bs_breast_1', 'bs_coli', 'bs_coli_1', 'bs_gastric', 'bs_gastric_1', 'bs_liver_c', \
           'bs_liver_c_1', 'bs_can_o', 'chi_can', 'chi_can_1', 'chi_canmi', 'chi_lung', 'chi_breast', 'chi_coli', 'chi_gastric', 'chi_liver_c', 'chi_can_o', 'ndisease', 'fn', 'mn', 'bsn', 'chin', 'UNKNOWN', \
           'funknown', 'munknown', 'bsunknown', 'chiunknown', 'autoimmu', 'dis_o', 'f_dis_o', 'f_disable', 'm_dis_o', 'm_disable', 'bs_dis_o', 'bs_disable', 'chi_dis_o', 'chi_disable', 'Syr_drug', 'Hypnotic', \
           'drug', 'drug_w', 'drug_yr', 'drug_diag', 'drug_cont', 'hyper_drug', 'hyper_drugn', 'hyper_druga', 'hyper_confir', 'hyper_cont', 'dm_drug', 'dm_w', 'dm_yr', 'dm_diag', 'dm_cont', 'hor_drug', 'hor_w', \
           'hor_yr', 'hor_diag', 'hor_diag1', 'hor_cont', 'lipiddrug', 'lipidw', 'lipidyr', 'lipiddiag', 'lipiddiag1', 'lipidcont', 'supply', 'gasdrug', 'cmed', 'hf_none', 'hf_vit', 'hf_wgl', 'hf_foil', 'hf_ca', \
           'hf_p', 'hf_p1', 'hf_p2', 'hf_chic', 'hf_clam', 'hf_e', 'hf_lutein', 'hf_collagen', 'hf_other', 'hf_fmed', 'hf_unknown', 'hf_1', \
           'plastic_1', 'plastic_2', 'plastic_3', 'plastic_4', 'plastic_5', 'plastic_6', 'plastic_7', 'BI_1', 'BI_2', 'BI_3', \
           'SF36_1', 'SF36_2', 'sf36_3_a', 'sf36_3_b', 'sf36_3_c', 'sf36_3_d', 'sf36_3_e', 'sf36_3_f', 'sf36_3_g', 'sf36_3_h', 'sf36_3_i',\
           'sf36_3_j', 'sf36_4_a', 'sf36_4_b', 'sf36_4_c', 'sf36_4_d', 'sf36_5_a', 'sf36_5_b', 'sf36_5_c', 'sf36_6', 'sf36_7', 'sf36_8', 'sf36_9_a', 'sf36_9_b', \
           'sf36_9_c', 'sf36_9_d', 'sf36_9_e', 'sf36_9_f', 'sf36_9_g', 'sf36_9_h', 'sf36_9_i', 'sf36_10', 'sf36_11_a', 'sf36_11_b', 'sf36_11_c', 'sf36_11_d', \
           'check5', 'beda', 'bedb', 'bedt', 'waketa', 'waketb', 'sleepta', 'sleeptb', 'sleep1', 'sleep2', 'sleep3', 'sleep4', 'sleep5', 'sleep6', 'sleep7', 'sleep8', 'sleep9', 'sleep10', 'sleep10a', \
           'sleep11', 'sleep11v3', 'sleep12', 'sleep12v3', 'sleep13', 'sleep13v3', 'sleep14', 'sleep14v3', 'sleep15', 'check6', 'hads_1', 'hads_2', 'hads_3', 'hads_4', 'hads_5', 'hads_6', 'hads_7', 'hads_8', \
           'hads_9', 'hads_10', 'hads_11', 'hads_12', 'hads_13', 'hads_14', 'check7', 'uls8_1', 'uls8_2', 'uls8_3', 'uls8_4', 'uls8_5', 'uls8_6', 'uls8_7', 'uls8_8', 'check8', 'ad8_1', 'ad8_2', 'ad8_3', 'ad8_4', \
           'ad8_5', 'ad8_6', 'ad8_7', 'ad8_8', 'check9', 'ecog12a', 'ecog12a1', 'ecog12b1', 'ecog12b2', 'ecog12b3', 'ecog12b4', 'ecog12b5', 'ecog12b6', 'ecog12b7', 'ecog12b8', 'ecog12b9', 'ecog12b10', 'ecog12b11', 'ecog12b12', \
           'check10', 'mna_1', 'mna_2', 'mna_3', 'mna_4', 'mna_5', 'mna_6', 'mna_7', 'mna_8', 'mna_9', 'mna_10', 'mna_11', 'mna_12', 'mna_13', 'mna_14', 'mna_15', 'check11', 'sarcf_1', 'sarcf_2', 'sarcf_3', 'sarcf_4', 'sarcf_5', \
           'check12', 'd2_drink', 'd2_drinkt', 'd2_drinkav', 'd2_drinkday', 'd2_drinkcat', 'd2_drinknote', 'd2_drinkvol', 'past2_drinkav', 'past2_drinkday', 'past2_drink', 'past2_drinkcat', 'past2_drinknote', 'past2_drinkvol', 'past1_drink', \
           'd2_drinks', 'drink_k1', 'drink_k2', 'drink_k3', 'drink_k4', 'drink_k5', 'drink_k6', 'drink_k7', 'drink_k7_o', 'drink_k7_1', 'drink_k7_2', 'drink_k7_3', 'drink_k7_4', 'drink_k7_5', 'drink_k7_6', 'drink_t', 'drink_q', 'drink_qt1', 'drink_qt2', \
           'drink_red', 'B_1_1', 'B_1_2', 'B_1_3', 'B_1_4', 'B_1_5', 'B_1_6', 'B_1_7', 'B_1_8', 'B_1_9', 'B_1_10', 'B_2_1', 'B_2_2', 'B_2_3', 'B_2_4', 'check13', \
           'life1', 'life2', 'life3', 'life4', 'life5', 'life6', 'life7', 'life8', 'life9', 'life10', 'life11', 'life12', 'life13', 'life14', 'life15', 'life16', 'life17', 'life18', 'life19', 'life20', 'life21', 'life22', 'life23', 'life24', \
           'depres_11', 'depres_12', 'depres_13', 'depres_14', 'depres_15', 'depres_16', 'depres_17', 'depres_18', 'depres_19', 'depres_110', 'depres_111', 'depres_112', 'depres_113', 'depres_114', 'depres_115', 'depres_116', 'depres_117', 'depres_118', \
           'health_1_old', 'health_2_old', 'health_3_old', 'health_4_old', 'health_5_old', 'health_6_old', 'health_7_old', 'health_8_old', 'health_9_old', 'health_10_old', \
           'health_1', 'health_2', 'health_3', 'health_4', 'health_5', 'health_6', 'health_7', 'health_8', 'health_9', 'health_10'
           ]

selected_columns = columns + survey_data
prefixes = ["t1_", "t2_"]
renamed_columns = utils.add_prefix(selected_columns, prefixes)

df_addSurvey_1 = df8[renamed_columns]
df_addSurvey_1['t3_MAFLD'] = df8['t3_MAFLD']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_addSurvey_1['t3_MAFLD'] = df8['t3_MAFLD']


In [136]:
# Remove high missing value columns
missing_threshold = 0.8
def remove_columns_with_high_missing_values(df, threshold):
    """
    Remove columns from a DataFrame that have missing values exceeding the specified threshold.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - threshold (float): The threshold for missing values. Columns with missing values exceeding this threshold will be removed.

    Returns:
    - cleaned_df (DataFrame): The cleaned DataFrame with columns removed.
    """
    total_missing = df.isnull().sum()  # 计算每列的缺失值数量
    total_rows = df.shape[0]  # 数据集的总行数
    columns_to_remove = total_missing[total_missing / total_rows > threshold].index  # 找到超过阈值的列名
    cleaned_df = df.drop(columns=columns_to_remove)  # 删除指定列
    print("columns to remove with high missing values: ", columns_to_remove)

    return cleaned_df

df_addSurvey_2 = remove_columns_with_high_missing_values(df_addSurvey_1,missing_threshold)

columns to remove with high missing values:  Index(['t1_sarcf', 't1_MNA', 't1_smoke_qt1', 't1_smoke_qt2', 't1_smoke_f',
       't1_tea_1', 't1_betel_o', 't1_betel_q', 't1_sport_1', 't1_cardio',
       ...
       't2_health_1_old', 't2_health_2_old', 't2_health_3_old',
       't2_health_4_old', 't2_health_5_old', 't2_health_6_old',
       't2_health_7_old', 't2_health_8_old', 't2_health_9_old',
       't2_health_10_old'],
      dtype='object', length=830)


In [137]:
num_columns = len(df_addSurvey_2.columns)
print("Number of columns in df_addSurvey_2:", num_columns)

Number of columns in df_addSurvey_2: 599


In [138]:
def remove_columns_with_high_unique_values(df, threshold):
    """
    Remove columns from a DataFramse that have unique values exceeding the specified threshold.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - threshold (int): The threshold for unique values. Columns with unique values exceeding this threshold will be removed.

    Returns:
    - cleaned_df (DataFrame): The cleaned DataFrame with columns removed.
    """
    categorical_columns = df.select_dtypes(include='object').columns  # 获取分类字段列
    columns_to_remove = []
    for column in categorical_columns:
        unique_values = df[column].nunique()  # 计算唯一值数量
        if unique_values > threshold:
            columns_to_remove.append(column)
    cleaned_df = df.drop(columns=columns_to_remove)  # 删除指定列

    return cleaned_df

threshold = 6
df_addSurvey_3 = remove_columns_with_high_unique_values(df_addSurvey_2, threshold)

In [139]:
num_columns = len(df_addSurvey_3.columns)
print("Number of columns in df_addSurvey_3:", num_columns)

Number of columns in df_addSurvey_3: 552


In [143]:
# Extra redundant cols needs to be dropped
# drop these cols as those been derived for numeric cols, remain alias *_num,
cols_to_drop_only_MAFLD = ['t2_sex', 't1_MAFLD', 't2_MAFLD']
cols_to_drop_fli_related = ['t2_sex', 't1_MAFLD', 't2_MAFLD', \
                            't1_Triglyceride_y', 't1_BMI', 't1_gamgt', 't1_waist_y', 't1_gamgt', 't1_w', \
                            't2_Triglyceride_y', 't2_BMI', 't2_gamgt', 't2_waist_y', 't2_gamgt', 't2_w']
df_addSurvey_4 = df_addSurvey_3.drop(cols_to_drop_only_MAFLD, axis=1)

# #FLI related cols: Triglyceride_y, BMI, gamgt, waist_y, gamgt
# df_addSurvey_4b = df_addSurvey_3.drop(cols_to_drop_fli_related, axis=1)


num_columns = len(df_addSurvey_4.columns)
print("Number of columns in df_addSurvey_4:", num_columns)

Number of columns in df_addSurvey_4: 549


In [145]:
prefixes = ["t1_", "t2_"]
all_survey_data_renamed_cols = utils.add_prefix(survey_data, prefixes)

#get final_survey_data_included_cols for modeling to define categorical variables
final_survey_data_included_cols = [col for col in df_addSurvey_4.columns if col in all_survey_data_renamed_cols]

In [146]:
#checks 
't1_smoke' in final_survey_data_included_cols

True

In [147]:
#Modeling
# start modeling preparation
# split categorical and numerical variables,
features = df_addSurvey_4.columns.drop(['t3_MAFLD'])

# categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD', \
#                         't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']

def generate_column_names(prefix, column_names):
    new_column_names = []
    for p in prefix:
        new_column_names.extend([p + col for col in column_names])
    return new_column_names

prefix = ['t1_', 't2_']
# categorical_column_names = ['sex', 'w', 'smoke', 'smoke_q', 'coffee', 'betel', 'DM_determine', 'CKD']
categorical_column_names = ['w', 'DM_determine', 'CKD']
categorical_column_names_non_survey = generate_column_names(prefix, categorical_column_names)
categorical_features = categorical_column_names_non_survey + final_survey_data_included_cols + ['t1_sex']
# selected_categorical_column_names = categorical_column_names + final_survey_data_included_cols
# categorical_features = generate_column_names(prefix, selected_categorical_column_names)

numeric_features = df_addSurvey_4.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df_addSurvey_4[categorical_features]
X_numeric = df_addSurvey_4[numeric_features]
y = df_addSurvey_4['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 4825)
y shape:  (9658,)


In [57]:
# train test split, seed=2023
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important features) with Survey data AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) with Survey data AUC score: ", auc_rf)

Logistic model(all important features) with Survey data AUC score:  0.7953911640549505
Random Forest(all important factors) with Survey data AUC score:  0.8395641787080623


Analysis 3

In [148]:
#check unique values
# Analysis 3: Impact of additional features that included in other papers but not in our model
#  adding features: bloodtype, bloodtype_rh, Eosinophil, height_y, weight_y, MCV, Leukocyte, SP_Gravity, systolic_y, diastolic_y, Total_Bilirubin, Total_Protein, Glucose,Platelets, Albumin, Bilirubin, hypertension,  Hematocrit, WBCb
# Select key columns for conventional machine learning models

columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", "HBsAg_x_num", "Anti_HCV_x_num", "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", "smoke", "smoke_q", "coffee", "betel", "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD"]

adding_features = ['bloodtype', 'bloodtype_rh', 'Eosinophil', 'height_y', 'weight_y', 'MCV', 'Leukocyte', 'SP_Gravity', \
                   'systolic_y', 'diastolic_y', 'Total_Bilirubin', 'Total_Protein', 'Glucose', 'Platelets', 'Albumin', \
                   'Bilirubin', 'hypertension',  'Hematocrit', 'WBCb']

selected_features = columns + adding_features
prefixes = ["t1_", "t2_"]
renamed_columns = utils.add_prefix(selected_features, prefixes)

df_addFeatures_1 = df8[renamed_columns]
df_addFeatures_1['t3_MAFLD'] = df8['t3_MAFLD']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_addFeatures_1['t3_MAFLD'] = df8['t3_MAFLD']


In [149]:
# Checks
# addtional Categorical features
df_addFeatures_1['t1_Leukocyte'].value_counts(dropna=False)

t1_Leukocyte
Negative    7002
Trace       1186
1+           725
2+           403
3+           191
NaN          110
陰性            36
NEGATIVE       3
NONE           2
Name: count, dtype: int64

In [150]:
unique_values = df_addFeatures_1['t1_Leukocyte'].unique()
print("Unique Values:\n", unique_values)


Unique Values:
 ['Negative' 'Trace' '1+' '2+' nan '3+' '陰性' 'NEGATIVE' 'NONE']


In [151]:
df_addFeatures_1['t1_Bilirubin'].value_counts(dropna=False)

t1_Bilirubin
Negative    9256
NEGATIVE     234
NaN          110
陰性            53
1+ (0.5)       2
NONE           2
3+ (4.0)       1
Name: count, dtype: int64

In [152]:
df_addFeatures_1['t1_Glucose'].value_counts()

t1_Glucose
Negative      9259
3+ (1000)       95
Trace(100)      75
陰性              50
2+ (500)        33
1+ (250)        29
NEGATIVE         3
NONE             2
3+(1000)         1
1+(250)          1
Name: count, dtype: int64

In [162]:
# Combine Categorical Values
df_addFeatures_2 = df_addFeatures_1.copy()

df_addFeatures_2['t1_Leukocyte'] = df_addFeatures_2['t1_Leukocyte'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})
df_addFeatures_2['t2_Leukocyte'] = df_addFeatures_2['t2_Leukocyte'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})

df_addFeatures_2['t1_Bilirubin'] = df_addFeatures_2['t1_Bilirubin'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})
df_addFeatures_2['t2_Bilirubin'] = df_addFeatures_2['t2_Bilirubin'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})

df_addFeatures_2['t1_Glucose'] = df_addFeatures_2['t1_Bilirubin'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})
df_addFeatures_2['t2_Glucose'] = df_addFeatures_2['t2_Bilirubin'].replace({'NONE': np.nan, '陰性': 'Negative', 'NEGATIVE': 'Negative'})


In [163]:
# Output checks
unique_values = df_addFeatures_2['t1_Leukocyte'].unique()
print("Unique Values:\n", unique_values)

Unique Values:
 ['Negative' 'Trace' '1+' '2+' nan '3+']


In [111]:
# added numerical value checks

def analyze_column(df, column_name):
    non_numeric_values = df[column_name][~df[column_name].apply(lambda x: isinstance(x, (int, float)))]
    unique_non_numeric_values = non_numeric_values.unique()
    non_numeric_ratio = len(non_numeric_values) / len(df[column_name])
    null_ratio = df[column_name].isnull().mean()
    
    output = f"Column: {column_name}\n"
    output += "Non-numeric Values:\n" + str(unique_non_numeric_values) + "\n"
    output += "Non-numeric Ratio: " + str(non_numeric_ratio) + "\n"
    output += "Null Ratio: " + str(null_ratio) + "\n"
    output += "\n"
    
    with open("numeric_check_output.txt", "a") as file:
        file.write(output)

columns_to_analyze = ['Eosinophil', 'height_y', 'weight_y', 'MCV', 'SP_Gravity', 'systolic_y', 'diastolic_y', 'Hematocrit', 'WBCb', 'Total_Bilirubin','Total_Protein', 'Platelets', 'Albumin']

prefixes = ["t1_", "t2_"]
renamed_columns_to_analyze = utils.add_prefix(columns_to_analyze, prefixes)

for column in renamed_columns_to_analyze:
    analyze_column(df_addFeatures_1, column)

In [112]:
# additional summary for numeric features
non_numeric_gt_1 = [column for column in renamed_columns_to_analyze if len(df_addFeatures_1[column][~df_addFeatures_1[column].apply(lambda x: isinstance(x, (int, float)))].unique()) > 1]
non_numeric_ratio_gt_01 = [column for column in renamed_columns_to_analyze if (len(df_addFeatures_1[column][~df_addFeatures_1[column].apply(lambda x: isinstance(x, (int, float)))]) / len(df_addFeatures_1[column])) > 0.1]
null_ratio_gt_01 = [column for column in renamed_columns_to_analyze if df_addFeatures_1[column].isnull().mean() > 0.1]

output_additional = f"non_numerical values 的值大于1的字段有：{non_numeric_gt_1}\n"
output_additional += f"Non-numeric Ratio 大于0.1的字段有：{non_numeric_ratio_gt_01}\n"
output_additional += f"Null Ratio 大于0.1的字段有：{null_ratio_gt_01}\n"

with open("numeric_check_output.txt", "a") as file:
    file.write(output_additional)

In [113]:
#Column: t1_Eosinophil Null Ratio: 0.4176848208738869 - needs check null value imputation
#Column: t1_SP_Gravity Non-numeric Values: ['>=1.030' '<=1.005' 'NONE'] - needs double check how to impute these values
# non_numerical values 的值大于1的字段有：['t1_SP_Gravity', 't1_Total_Bilirubin', 't2_SP_Gravity', 't2_Total_Bilirubin', 't2_Total_Protein']
# Non-numeric Ratio 大于0.1的字段有：[]
# Null Ratio 大于0.1的字段有：['t1_Eosinophil', 't2_Eosinophil']

In [155]:
df_addFeatures_3 = df_addFeatures_2.drop('t2_sex', axis=1)

num_columns = len(df_addFeatures_3.columns)
print("Number of columns in df_addFeatures_3:", num_columns)

Number of columns in df_addFeatures_3: 126


In [164]:
for column in df_addFeatures_3.columns:
    try:
        df_addFeatures_3[column] = pd.to_numeric(df_addFeatures_3[column], errors='raise')
    except ValueError:
        print(f"Error converting values in column: {column}")

Error converting values in column: t1_Leukocyte
Error converting values in column: t1_SP_Gravity
Error converting values in column: t1_Total_Bilirubin
Error converting values in column: t1_Glucose
Error converting values in column: t1_Bilirubin
Error converting values in column: t2_Leukocyte
Error converting values in column: t2_SP_Gravity
Error converting values in column: t2_Total_Bilirubin
Error converting values in column: t2_Total_Protein
Error converting values in column: t2_Glucose
Error converting values in column: t2_Bilirubin


In [168]:
df_addFeatures_4 = df_addFeatures_3.copy()

columns_to_convert = ['t1_SP_Gravity', 't1_Total_Bilirubin', 't2_SP_Gravity', 't2_Total_Bilirubin', 't2_Total_Protein']

for column in columns_to_convert:
    df_addFeatures_4[column] = pd.to_numeric(df_addFeatures_4[column], errors='coerce')

In [170]:
print("Data types of columns to check:")
columns_to_check = ['t1_Leukocyte', 't1_SP_Gravity', 't1_Total_Bilirubin', 't1_Glucose', 't1_Bilirubin',
                    't2_Leukocyte', 't2_SP_Gravity', 't2_Total_Bilirubin', 't2_Total_Protein', 't2_Glucose',
                    't2_Bilirubin']
for column in columns_to_check:
    print(column, ":", df_addFeatures_4[column].dtype)

Data types of columns to check:
t1_Leukocyte : object
t1_SP_Gravity : float64
t1_Total_Bilirubin : float64
t1_Glucose : object
t1_Bilirubin : object
t2_Leukocyte : object
t2_SP_Gravity : float64
t2_Total_Bilirubin : float64
t2_Total_Protein : float64
t2_Glucose : object
t2_Bilirubin : object


In [172]:
#Modeling
# start modeling preparation
# split categorical and numerical variables,
features = df_addFeatures_4.columns.drop(['t3_MAFLD'])

# categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD', \
#                         't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']

prefix = ['t1_', 't2_']
categorical_column_names = ['w', 'smoke', 'smoke_q', 'coffee', 'betel', 'DM_determine', 'CKD', \
                            'bloodtype', 'bloodtype_rh', 'Leukocyte', 'Bilirubin', 'Glucose', 'hypertension']
categorical_features = generate_column_names(prefix, categorical_column_names) + ['t1_sex']


numeric_features = df_addFeatures_4.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df_addFeatures_4[categorical_features]
X_numeric = df_addFeatures_4[numeric_features]
y = df_addFeatures_4['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 214)
y shape:  (9658,)


In [174]:
# train test split, seed=2023
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important features) with more features added data AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) with Survey data AUC score: ", auc_rf)


Logistic model(all important features) with more features added data AUC score:  0.8659901446097683
Random Forest(all important factors) with Survey data AUC score:  0.8539513390107312
