<a href="https://colab.research.google.com/github/JasmineAdvanture/fatty-liver-project/blob/BDS-13-CKD/MAFLD_basicAnalysis_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()

In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, precision_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [4]:
#read dataset as df
df = pd.read_excel("NTCMRC_all.xlsx")

# Create a copy of df as df1
df1 = df.copy()

# Replace '\\N' with NaN
df1 = df1.replace('\\N', np.nan)

# Specify the columns to be converted to FLOAT
columns_to_convert1 = ['BMI', 'Triglyceride_y', 'gamgt', 'waist_y', 'mst', 'egfrn', 'Estimated_GFR_x', 'Alb_Cre_ratio', 'HOMA_IR', 'HS_CRP', \
                       'LDL_C_direct', 'LDL_C_HDL_C', 'Adiponectin', 'Leptin', 'Uric_Acid','Insulin', 'ALT_GPT']

# Specify the columns to be converted to INT
columns_to_convert2 = ['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)', 'smoke', 'smoke_q', \
                       'sex', 'w', 'coffee', 'betel']

# Convert the specified columns to float and fill missing/unconvertible values with NaN
for column in columns_to_convert1:
    df1[column] = pd.to_numeric(df1[column], errors='coerce')

# Convert the specified columns to int and fill missing/unconvertible values with NaN
for column in columns_to_convert2:
    df1[column] = pd.to_numeric(df1[column], errors='coerce').astype(pd.Int64Dtype())


In [5]:
# Calculate FLI using the formula and defined as df2
df2 = df1.copy()
df2['FLI'] = (np.exp(0.953 * np.log(df2['Triglyceride_y']) + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) \
     + 0.053 * df2['waist_y'] - 15.745)) / (1 + np.exp(0.953 * np.log(df2['Triglyceride_y']) \
    + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) + 0.053 * df2['waist_y'] - 15.745)) * 100

# Derive FL_echo based on ultrasound results column
df2['FL_echo'] = df2['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)']
df2['FL_echo'] = df2['FL_echo'].replace('<NA>', np.nan)

#Derive fl_status column based on fl_echo and FLI(when available): 1 for positive; 0 for negative; -1 for unavailable
def derive_fl_status(row):
    # Derive FL_Check column to infer the status by echo or FLI
    liver_status = row['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)']
    fli_value = row['FLI']

    if pd.isna(liver_status) and pd.isna(fli_value):
        return -1
    elif pd.notna(liver_status) and liver_status != 0:
        return 1
    elif pd.notna(fli_value) and fli_value >= 60:
        return 1
    else:
        return 0

df2['fl_status'] = df2.apply(derive_fl_status, axis=1)

#Derive homa_ir_check, hs_crp_check, and mst_total to determine MAFLD risk factors
df2['homa_ir_check'] = df2['HOMA_IR'].apply(lambda x: 1 if x >= 2.5 else 0)
df2['hs_crp_check'] = df2['HS_CRP'].apply(lambda x: 1 if x > 2 else 0)
df2['mst_total'] = df2[['w', 'hyper', 'HDL', 'fg', 'trig', 'homa_ir_check', 'hs_crp_check']].sum(axis=1)

#Derive target variables and named as df3
def derive_MAFLD(df):
    df['MAFLD'] = 0  # Initialize MAFLD field as 0

    # Condition 1: fl_status = -1
    df.loc[df['fl_status'] == -1, 'MAFLD'] = -1

    # Condition 2: fl_check = 0
    df.loc[df['fl_status'] == 0, 'MAFLD'] = 0

    # Condition 3: fl_check = 1
    # Subcondition 1: BMI >= 23
    df.loc[(df['fl_status'] == 1) & (df['BMI'] >= 23), 'MAFLD'] = 1

    # Subcondition 2: BMI < 23 and mst >= 2
    df.loc[(df['fl_status'] == 1) & (df['BMI'] < 23) & (df['mst_total'] >= 2), 'MAFLD'] = 1

    # Subcondition 3: DM_determine = 1
    df.loc[(df['fl_status'] == 1) & (df['DM_determine'] == 1), 'MAFLD'] = 1

    return df

def derive_CKD(df):
    # Initialize CKD field as -1
    df['CKD'] = -1

    # Condition 1: egfrn >= 60 and Alb_Cre_ratio < 3
    df.loc[(df['Estimated_GFR_x'] >= 60) & (df['Alb_Cre_ratio'] < 3), 'CKD'] = 1

    # Condition 2: egfrn >= 60 and 3 <= Alb_Cre_ratio <= 30 or 45 <= egfrn < 60 and Alb_Cre_ratio < 3
    df.loc[((df['Estimated_GFR_x'] >= 60) & (df['Alb_Cre_ratio'].between(3, 30))) |
           ((df['Estimated_GFR_x'].between(45, 60)) & (df['Alb_Cre_ratio'] < 3)), 'CKD'] = 2

    # Condition 3: egfrn >= 60 and Alb_Cre_ratio > 30 or egfrn < 60 and Alb_Cre_ratio >= 0
    df.loc[((df['Estimated_GFR_x'] >= 60) & (df['Alb_Cre_ratio'] > 30)) |
           ((df['Estimated_GFR_x'] < 60) & (df['Alb_Cre_ratio'] >= 0)), 'CKD'] = 3

    # Set CKD as 0 for cases where egfrn and Alb_Cre_ratio are not empty and CKD is still -1
    df.loc[(df['Estimated_GFR_x'].notnull()) & (df['Alb_Cre_ratio'].notnull()) & (df['CKD'] == -1), 'CKD'] = 0

    return df

df3 = derive_CKD(derive_MAFLD(df2))

# Derive FL_group_list, 对CMRC_id分组并计算每个病人的FL_Check的唯一值
grouped = df2.groupby('CMRC_id')['fl_status'].unique()
df2['fl_group_list'] = df2.groupby('CMRC_id')['fl_status'].transform(lambda x: [x.unique().tolist()] * len(x))

# Derive patient_fl_validity according to FL_group_list conditions
def assign_patient_fl_validity(df):
    df['patient_fl_validity'] = -1  # 初始化所有记录为第三组 (-1)

    def get_patient_valid(group_list):
        if isinstance(group_list, list):
            if -1 in group_list and len(group_list) == 1:
                return "unavailable"  # 第三组 (-1)
            elif -1 in group_list:
                return "partial"  # 第二组 (0)
            else:
                return "completed"  # 第一组 (1)
        else:
            return "others"  # 第三组 (-1)

    df['patient_fl_validity'] = df['fl_group_list'].apply(get_patient_valid)

    return df


#assign patient valid value
df4 = assign_patient_fl_validity(df3)

In [7]:
# Deal with values with both string and num values: HBsAg_x, Anti_HCV_x, (values eg. 陰性    0.351)
# This function will extract the numeric values
import re
def extract_numeric_value(value):
    pattern = r'\d+(\.\d+)?'  # 正则表达式模式，匹配一个或多个数字（包括小数点）
    match = re.search(pattern, str(value))
    if match:
        return float(match.group())
    else:
        return None

# extract numeric values for these cols and rename that as  *_num
columns_to_extract = ['HBsAg_x', 'Anti_HCV_x']

df5 = df4.copy()
for column in columns_to_extract:
    new_column_name = column + '_num'
    df5[new_column_name] = df5[column].apply(extract_numeric_value)


In [8]:
def sliding_window_data(df, input_window_size, target_window_size):
    transformed_data = []
    group_counter = {}

    df_sorted = df.sort_values(['CMRC_id', 'year_come'])

    for patient_id, group in df_sorted.groupby('CMRC_id'):
        if len(group) < input_window_size + target_window_size:
            continue


        group_counter.setdefault(patient_id, 0)
        group_counter[patient_id] += 1
        group_alias = f'{patient_id}_group{group_counter[patient_id]}'

        for i in range(len(group) - input_window_size - target_window_size + 1):
            input_data = group[i:i+input_window_size]
            target_data = group[i+input_window_size:i+input_window_size+target_window_size]

            # Flatten input_data and repeat target_data
            input_features_t1 = input_data.iloc[0, :].values.flatten()
            input_features_t2 = input_data.iloc[1, :].values.flatten()
            t3_MAFLD = target_data['MAFLD'].values

            new_row = [group_alias] + list(input_features_t1) + list(input_features_t2) + list(t3_MAFLD)

            transformed_data.append(new_row)

    columns_list = ['CMRC_id'] + [f't1_{col}' for col in input_data.columns] + [f't2_{col}' for col in input_data.columns] + [f't3_MAFLD']
    transformed_df = pd.DataFrame(transformed_data, columns=columns_list)
    return transformed_df


# Sliding window implementation, ie. use previous 2 years record to predict the 3 year MAFLD status
df6 = sliding_window_data(df5, input_window_size=2, target_window_size=1)


In [9]:
total_records = df6.shape[0]
print("Total records in df6:", total_records)

Total records in df6: 9724


In [29]:
# Filter available data that can be applied to models
df7 = df6[(df6['t3_MAFLD'] != -1)]

# Drop ID relevant cols in the dataset
columns_to_drop = ['CMRC_id', 't1_CMRC_id', 't2_CMRC_id', 't1_sid', 't2_sid', 't1_P_Number','t2_P_Number']
df8 = df7.drop(columns=columns_to_drop)

In [36]:
#checks
df8[['t3_MAFLD']].value_counts(dropna=False).reset_index()


Unnamed: 0,t3_MAFLD,0
0,0,7614
1,1,2044


In [37]:
#Select key columns for conventional machine learning models

# This function is for adding prefix for cols, the cols should be a list of column names that needs to add prefix(such as "t1_" in this project)
def add_prefix(cols, prefixes):
# Note the prefixes should be a LIST, eg. prefixes = ["t1_", "t2_"]
    renamed_columns = []
    for prefix in prefixes:
        renamed_columns.extend([prefix + column for column in cols])
    return renamed_columns

columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", "HBsAg_x", "Anti_HCV_x", "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", "smoke", "smoke_q", "coffee", "betel", "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD", \
           'HBsAg_x_num', 'Anti_HCV_x_num']
prefixes = ["t1_", "t2_"]
renamed_columns = add_prefix(columns, prefixes)

df9 = df8[renamed_columns]
df9['t3_MAFLD'] = df8['t3_MAFLD']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['t3_MAFLD'] = df8['t3_MAFLD']


In [38]:
# drop these cols as those been derived for numeric cols, remain alias *_num,
cols_to_drop_only_MAFLD = ['t1_HBsAg_x', 't2_HBsAg_x', 't1_Anti_HCV_x', 't2_Anti_HCV_x', 't1_MAFLD', 't2_MAFLD']
cols_to_drop_fli_related = ['t1_HBsAg_x', 't2_HBsAg_x', 't1_Anti_HCV_x', 't2_Anti_HCV_x', 't1_MAFLD', 't2_MAFLD', \
                            't1_Triglyceride_y', 't1_BMI', 't1_gamgt', 't1_waist_y', 't1_gamgt', 't1_w', \
                            't2_Triglyceride_y', 't2_BMI', 't2_gamgt', 't2_waist_y', 't2_gamgt', 't2_w']
df9_a = df9.drop(cols_to_drop_only_MAFLD, axis=1)

#FLI related cols: Triglyceride_y, BMI, gamgt, waist_y, gamgt
df9_b = df9.drop(cols_to_drop_fli_related, axis=1)

In [39]:
df9_a.head(1)

Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t2_sarcf,t2_ms2,t2_MNA,t2_AUDIT,t2_HBV_,t2_HCV_,t2_CKD,t2_HBsAg_x_num,t2_Anti_HCV_x_num,t3_MAFLD
0,0,59,82.0,101.0,264.0,56.4,21.0,19.0,15.0,9.18,...,,1,,1.0,,,1,,,0


In [111]:
df9_a['t2_sex'].value_counts()

0    6473
1    3185
Name: t2_sex, dtype: int64

Baseline Model

In [114]:
# start modeling preparation
# split categorical and numerical variables,

df9_a.drop('t2_sex', axis=1, inplace=True)
features = df9_a.columns.drop(['t3_MAFLD'])

categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD', \
                        't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df9_a.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df9_a[categorical_features]
X_numeric = df9_a[numeric_features]
y = df9_a['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 114)
y shape:  (9658,)


In [116]:
# train test split, seed=2023
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Baseline Logistic model(all important factors) AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Baseline Random Forest(all important factors) AUC score: ", auc_rf)

Baseline Logistic model(all important factors) AUC score:  0.8495866274108692
Baseline Random Forest(all important factors) AUC score:  0.8443908617250324


Analysis 1

In [66]:
#Analysis 1: The impact of interval years
# Start with df9_a, dataset contains BMI, etc variables
# Check t1->t3
def select_columns(df, prefix, additional_column=None):
    selected_columns = [col for col in df.columns if col.startswith(prefix)]
    if additional_column is not None:
        selected_columns.append(additional_column)
    return df[selected_columns]

df10_a_t1 = select_columns(df9_a, 't1_', 't3_MAFLD')
df10_a_t1.head(1)

Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t1_sarcf,t1_ms2,t1_MNA,t1_AUDIT,t1_HBV_,t1_HCV_,t1_CKD,t1_HBsAg_x_num,t1_Anti_HCV_x_num,t3_MAFLD
0,0,59,82.0,101.0,264.0,56.4,21.0,19.0,15.0,9.18,...,,1,,,0.0,0.0,2,0.44,0.04,0


In [59]:
# start modeling preparation
# split categorical and numerical variables,
features = df10_a_t1.columns.drop(['t3_MAFLD'])

categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD']
                        # 't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df10_a_t1.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df10_a_t1[categorical_features]
X_numeric = df10_a_t1[numeric_features]
y = df10_a_t1['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 58)
y shape:  (9658,)


In [60]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)
print("Start train test split with Random Seed = 2023")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors) for t1 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 2023
Logistic model(all important factors) for t1 predict t3 AUC score:  0.8412379421221865
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8345329991692992


In [61]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=713)
print("Start train test split with Random Seed = 713")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors) for t1 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 713
Logistic model(all important factors) for t1 predict t3 AUC score:  0.8411861814984023
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8286175196324223


In [62]:
# Check t2 ->t3
df10_a_t2 = select_columns(df9_a, 't2_', 't3_MAFLD')
df10_a_t2.head(1)

Unnamed: 0,t2_sex,t2_age,t2_waist_y,t2_Glucose_AC_y,t2_Triglyceride_y,t2_HDL_C_y,t2_AST_GOT,t2_ALT_GPT,t2_gamgt,t2_Insulin,...,t2_sarcf,t2_ms2,t2_MNA,t2_AUDIT,t2_HBV_,t2_HCV_,t2_CKD,t2_HBsAg_x_num,t2_Anti_HCV_x_num,t3_MAFLD
0,0,60,83.0,104.0,255.0,55.8,15.0,18.0,14.0,5.86,...,,1,,1.0,,,1,,,0


In [63]:
# start modeling preparation for t2
# split categorical and numerical variables,
features = df10_a_t2.columns.drop(['t3_MAFLD'])

categorical_features = ['t2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']
numeric_features = df10_a_t2.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df10_a_t2[categorical_features]
X_numeric = df10_a_t2[numeric_features]
y = df10_a_t2['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 57)
y shape:  (9658,)


In [64]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)
print("Start train test split with Random Seed = 2023")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)

print("Logistic model(all important factors) for t2 predict t3 AUC score: ",auc_log)
print("Random Forest(all important factors) for t2 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 2023
Logistic model(all important factors) for t2 predict t3 AUC score:  0.8436544623330827
Random Forest(all important factors) for t2 predict t3 AUC score:  0.831706073158189


In [65]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=713)
print("Start train test split with Random Seed = 713")

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important factors for t2 predict t3 AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) for t1 predict t3 AUC score: ", auc_rf)

Start train test split with Random Seed = 713
Logistic model(all important factors for t2 predict t3 AUC score:  0.8478446530488657
Random Forest(all important factors) for t1 predict t3 AUC score:  0.8360619400114597


Analysis 2

In [78]:
# Analysis 2: the impact of survey data
# Feature selection - added survey data then modeling

columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", "HBsAg_x", "Anti_HCV_x", "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", \
          #  "smoke", "smoke_q", "coffee", "betel",
           "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD", \
           'HBsAg_x_num', 'Anti_HCV_x_num']

survey_data =['smoke', 's_smoke', 'smoke_t', 'smoke_q', 'smoke_qt1', 'smoke_qt2', 'smoke_f', 'smoke_n', 'smoke_second', \
           'tea', 'tea_c', 'tea_1', 'tea_t', 'tea_q', 'tea_f', 'tea_v', 'coffee', 'coffee_c', 'coffee_t', 'coffee_q', 'coffee_f', 'coffee_v', \
           'betel', 's_betel', 'betel_clt1', 'betel_clt2', 'betel_clt3', 'betel_clt4', 'betel_clt5', 'betel_clt6', 'betel_o', 'betel_t', 'betel_q', 'betel_f', 'betel_n', \
           'activity_t', 'carryh', 'sport', 'sport_1', 'sport_d', 'sport_t', 'cardio', 'hypertension', 'Dysrhythmia', 'ap', 'ami', 'Hyperlipidemia', 'HF', 'f_cardio', \
           'f_hyper', 'f_dys', 'f_ap', 'f_ami', 'f_lipid', 'f_hf', 'm_cardio', 'm_hyper', 'm_dys', 'm_ap', 'm_ami', 'm_lipid', 'm_hf', 'bs_cardio', 'bs_hyper', 'bs_hyper_1', \
           'bs_dys', 'bs_dys_1', 'bs_ap', 'bs_ap1', 'bs_ami', 'bs_ami_1', 'bs_lipid', 'bs_lipid_1', 'bs_hf', 'bs_hf_1', \
           'chi_cardio', 'chi_hyper', 'chi_dys', 'chi_ap', 'chi_ami', 'chi_lipid', 'chi_hf', \
           'endocrine', 'diabetes', 'Thyroid', 'f_dm', 'f_Thyroid', 'm_dm', 'm_Thyroid', 'bs_dm', 'bs_dm_1', 'bs_Thyroid', 'bs_Thyroid_1', \
           'chi_dm', 'chi_Thyroid', 'pepticulcer', 'gastritis', 'hbv', 'hcv', 'hepatitis_o', 'FLD', 'fibrosis', 'Cirrhosis', 'Polyposis', 'ibs', \
           'f_pud', 'f_hbv', 'f_hcv', 'f_liver', 'f_fld', 'f_f', 'f_lc', 'f_polyp', 'f_ibs', 'm_pud', 'm_hbv', 'm_hcv', 'm_liver', 'm_fld', 'm_f', 'm_lc', 'm_polyp', 'm_ibs', \
           'bs_pud', 'bs_pud_1', 'bs_hbv', 'bs_hbv_1', 'bs_hcv', 'bs_hcv_1', 'bs_liver', 'bs_fld', 'bs_fld_1', 'bs_f', 'bs_f_1', 'bs_lc', 'bs_lc_1', \
           'bs_polyp', 'bs_polyp_1', 'bs_ibs', 'bs_ibs_1', 'chi_pud', 'chi_hbv', 'chi_hcv', 'chi_liver', 'chi_fld', 'chi_f', 'chi_lc', 'chi_polyp', 'chi_ibs', \
           'respiratory', 'tb', 'Asthma', 'apnea', 'copd', 'f_resp', 'f_Asthma', 'f_apnea', 'f_copd', 'm_resp', 'm_Asthma', 'm_apnea', 'm_copd', \
           'bs_resp', 'bs_Asthma', 'bs_Asthma_1', 'bs_apnea', 'bs_apnea_1', 'bs_copd', 'bs_copd_1', 'chi_resp', 'chi_Asthma', 'chi_apnea', 'chi_copd', \
           'anemia', 'Hemophilia', 'f_anemia', 'f_h', 'm_anemia', 'm_h', 'bs_anemia', 'bs_anemia_1', 'bs_h', 'bs_h_1', 'chi_anemia', 'chi_h', \
           'stroke', 'brainbleed', 'head_injury', 'epilepsy', 'Parkinsons', 'dementia', 'Huntingtons', 'f_stroke', 'f_hi', 'f_brainbleed', 'f_epilep', 'f_pd', 'f_demen', 'f_hd', \
           'm_stroke', 'm_hi', 'm_brainbleed', 'm_epilep', 'm_pd', 'm_demen', 'm_hd', 'bs_stroke', 'bs_stroke_1', 'bs_hi', 'bs_hi_1', 'bs_brainbleed', 'bs_brainbleed_1', 'bs_epilep', \
           'bs_epilep_1', 'bs_pd', 'bs_pd_1', 'bs_demen', 'bs_demen_1', 'bs_hd', 'bs_hd_1', 'chi_stroke', 'chi_hi', 'chi_brainbleed', 'chi_epilep', 'chi_pd', 'chi_demen', 'chi_hd', \
           'insomnia', 'Depression', 'mdd_dx', 'mental', 'f_mdd', 'f_mdd_dx', 'f_mental', 'm_mdd', 'm_mdd_dx', 'm_mental', 'bs_mdd', 'bs_mdd_1', 'bs_mdd_dx', 'bs_mental', 'chi_mdd', \
           'chi_mdd_dx', 'chi_mental', 're_1', 'ckd', 'dialysis', 'gout', 'stone', 'bpn', 'f_re', 'f_ckd', 'f_dialysis', 'f_stone', 'f_bpn', 'm_re', 'm_ckd', 'm_dialysis', 'm_stone', \
           'm_bpn', 'bs_re', 'bs_ckd', 'bs_ckd_1', 'bs_dialysis', 'bs_dialysis_1', 'bs_stone', 'bs_stone_1', 'bs_bpn', 'bs_bpn_1', 'chi_re', 'chi_ckd', 'chi_dialysis', 'chi_stone', 'chi_bpn', \
           'cataract', 'glau', 'retina', 'sicca', 'f_cata', 'f_glau', 'f_retina', 'f_sicca', 'm_cata', 'm_glau', 'm_retina', 'm_sicca', 'bs_cata', 'bs_cata_1', 'bs_glau', 'bs_glau_1', 'bs_retina', \
           'bs_retina_1', 'bs_sicca', 'bs_sicca_1', 'chi_cata', 'chi_glau', 'chi_retina', 'chi_sicca', 'cancer', 'cancer_1', 'cantime', 'cancer_mi', 'lungcan', 'breastcna', 'colican', 'gastrican', \
           'livercan', 'cancer_o', 'f_can', 'f_can_1', 'f_canmi', 'f_lung', 'f_breast', 'f_coli', 'f_gastric', 'f_liver_c', 'f_can_o', 'm_can', 'm_can_1', 'm_canmi', 'm_lung', 'm_breast', 'm_coli', \
           'm_gastric', 'm_liver_c', 'm_can_o', 'bs_can', 'bs_can_1', 'bs_canmi', 'bs_lung', 'bs_lung_1', 'bs_breast', 'bs_breast_1', 'bs_coli', 'bs_coli_1', 'bs_gastric', 'bs_gastric_1', 'bs_liver_c', \
           'bs_liver_c_1', 'bs_can_o', 'chi_can', 'chi_can_1', 'chi_canmi', 'chi_lung', 'chi_breast', 'chi_coli', 'chi_gastric', 'chi_liver_c', 'chi_can_o', 'ndisease', 'fn', 'mn', 'bsn', 'chin', 'UNKNOWN', \
           'funknown', 'munknown', 'bsunknown', 'chiunknown', 'autoimmu', 'dis_o', 'f_dis_o', 'f_disable', 'm_dis_o', 'm_disable', 'bs_dis_o', 'bs_disable', 'chi_dis_o', 'chi_disable', 'Syr_drug', 'Hypnotic', \
           'drug', 'drug_w', 'drug_yr', 'drug_diag', 'drug_cont', 'hyper_drug', 'hyper_drugn', 'hyper_druga', 'hyper_confir', 'hyper_cont', 'dm_drug', 'dm_w', 'dm_yr', 'dm_diag', 'dm_cont', 'hor_drug', 'hor_w', \
           'hor_yr', 'hor_diag', 'hor_diag1', 'hor_cont', 'lipiddrug', 'lipidw', 'lipidyr', 'lipiddiag', 'lipiddiag1', 'lipidcont', 'supply', 'gasdrug', 'cmed', 'hf_none', 'hf_vit', 'hf_wgl', 'hf_foil', 'hf_ca', \
           'hf_p', 'hf_p1', 'hf_p2', 'hf_chic', 'hf_clam', 'hf_e', 'hf_lutein', 'hf_collagen', 'hf_other', 'hf_fmed', 'hf_unknown', 'hf_1', \
           'plastic_1', 'plastic_2', 'plastic_3', 'plastic_4', 'plastic_5', 'plastic_6', 'plastic_7', 'BI_1', 'BI_2', 'BI_3', \
           'SF36_1', 'SF36_2', 'sf36_3_a', 'sf36_3_b', 'sf36_3_c', 'sf36_3_d', 'sf36_3_e', 'sf36_3_f', 'sf36_3_g', 'sf36_3_h', 'sf36_3_i',\
           'sf36_3_j', 'sf36_4_a', 'sf36_4_b', 'sf36_4_c', 'sf36_4_d', 'sf36_5_a', 'sf36_5_b', 'sf36_5_c', 'sf36_6', 'sf36_7', 'sf36_8', 'sf36_9_a', 'sf36_9_b', \
           'sf36_9_c', 'sf36_9_d', 'sf36_9_e', 'sf36_9_f', 'sf36_9_g', 'sf36_9_h', 'sf36_9_i', 'sf36_10', 'sf36_11_a', 'sf36_11_b', 'sf36_11_c', 'sf36_11_d', \
           'check5', 'beda', 'bedb', 'bedt', 'waketa', 'waketb', 'sleepta', 'sleeptb', 'sleep1', 'sleep2', 'sleep3', 'sleep4', 'sleep5', 'sleep6', 'sleep7', 'sleep8', 'sleep9', 'sleep10', 'sleep10a', \
           'sleep11', 'sleep11v3', 'sleep12', 'sleep12v3', 'sleep13', 'sleep13v3', 'sleep14', 'sleep14v3', 'sleep15', 'check6', 'hads_1', 'hads_2', 'hads_3', 'hads_4', 'hads_5', 'hads_6', 'hads_7', 'hads_8', \
           'hads_9', 'hads_10', 'hads_11', 'hads_12', 'hads_13', 'hads_14', 'check7', 'uls8_1', 'uls8_2', 'uls8_3', 'uls8_4', 'uls8_5', 'uls8_6', 'uls8_7', 'uls8_8', 'check8', 'ad8_1', 'ad8_2', 'ad8_3', 'ad8_4', \
           'ad8_5', 'ad8_6', 'ad8_7', 'ad8_8', 'check9', 'ecog12a', 'ecog12a1', 'ecog12b1', 'ecog12b2', 'ecog12b3', 'ecog12b4', 'ecog12b5', 'ecog12b6', 'ecog12b7', 'ecog12b8', 'ecog12b9', 'ecog12b10', 'ecog12b11', 'ecog12b12', \
           'check10', 'mna_1', 'mna_2', 'mna_3', 'mna_4', 'mna_5', 'mna_6', 'mna_7', 'mna_8', 'mna_9', 'mna_10', 'mna_11', 'mna_12', 'mna_13', 'mna_14', 'mna_15', 'check11', 'sarcf_1', 'sarcf_2', 'sarcf_3', 'sarcf_4', 'sarcf_5', \
           'check12', 'd2_drink', 'd2_drinkt', 'd2_drinkav', 'd2_drinkday', 'd2_drinkcat', 'd2_drinknote', 'd2_drinkvol', 'past2_drinkav', 'past2_drinkday', 'past2_drink', 'past2_drinkcat', 'past2_drinknote', 'past2_drinkvol', 'past1_drink', \
           'd2_drinks', 'drink_k1', 'drink_k2', 'drink_k3', 'drink_k4', 'drink_k5', 'drink_k6', 'drink_k7', 'drink_k7_o', 'drink_k7_1', 'drink_k7_2', 'drink_k7_3', 'drink_k7_4', 'drink_k7_5', 'drink_k7_6', 'drink_t', 'drink_q', 'drink_qt1', 'drink_qt2', \
           'drink_red', 'B_1_1', 'B_1_2', 'B_1_3', 'B_1_4', 'B_1_5', 'B_1_6', 'B_1_7', 'B_1_8', 'B_1_9', 'B_1_10', 'B_2_1', 'B_2_2', 'B_2_3', 'B_2_4', 'check13', \
           'life1', 'life2', 'life3', 'life4', 'life5', 'life6', 'life7', 'life8', 'life9', 'life10', 'life11', 'life12', 'life13', 'life14', 'life15', 'life16', 'life17', 'life18', 'life19', 'life20', 'life21', 'life22', 'life23', 'life24', \
           'depres_11', 'depres_12', 'depres_13', 'depres_14', 'depres_15', 'depres_16', 'depres_17', 'depres_18', 'depres_19', 'depres_110', 'depres_111', 'depres_112', 'depres_113', 'depres_114', 'depres_115', 'depres_116', 'depres_117', 'depres_118', \
           'health_1_old', 'health_2_old', 'health_3_old', 'health_4_old', 'health_5_old', 'health_6_old', 'health_7_old', 'health_8_old', 'health_9_old', 'health_10_old', \
           'health_1', 'health_2', 'health_3', 'health_4', 'health_5', 'health_6', 'health_7', 'health_8', 'health_9', 'health_10'
           ]

selected_columns = columns + survey_data
prefixes = ["t1_", "t2_"]
renamed_columns = add_prefix(selected_columns, prefixes)

df_addSurvey_1 = df8[renamed_columns]
df_addSurvey_1['t3_MAFLD'] = df8['t3_MAFLD']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_addSurvey_1['t3_MAFLD'] = df8['t3_MAFLD']


In [79]:
num_columns = len(df_addSurvey_1.columns)
print("Number of columns in df_addSurvey_1:", num_columns)

Number of columns in df_addSurvey_1: 1433


In [80]:
# Remove high missing value columns
def remove_columns_with_high_missing_values(df, threshold):
    """
    Remove columns from a DataFrame that have missing values exceeding the specified threshold.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - threshold (float): The threshold for missing values. Columns with missing values exceeding this threshold will be removed.

    Returns:
    - cleaned_df (DataFrame): The cleaned DataFrame with columns removed.
    """
    total_missing = df.isnull().sum()  # 计算每列的缺失值数量
    total_rows = df.shape[0]  # 数据集的总行数
    columns_to_remove = total_missing[total_missing / total_rows > threshold].index  # 找到超过阈值的列名
    cleaned_df = df.drop(columns=columns_to_remove)  # 删除指定列
    print("columns to remove with high missing values: ", columns_to_remove)

    return cleaned_df

df_addSurvey_2 = remove_columns_with_high_missing_values(df_addSurvey_1,0.8)

columns to remove with high missing values:  Index(['t1_sarcf', 't1_MNA', 't1_smoke_qt1', 't1_smoke_qt2', 't1_smoke_f',
       't1_tea_1', 't1_betel_o', 't1_betel_q', 't1_sport_1', 't1_cardio',
       ...
       't2_health_1_old', 't2_health_2_old', 't2_health_3_old',
       't2_health_4_old', 't2_health_5_old', 't2_health_6_old',
       't2_health_7_old', 't2_health_8_old', 't2_health_9_old',
       't2_health_10_old'],
      dtype='object', length=830)


In [92]:
df_addSurvey_2['t2_sex'].value_counts()

0    6473
1    3185
Name: t2_sex, dtype: int64

In [93]:
num_columns = len(df_addSurvey_2.columns)
print("Number of columns in df_addSurvey_2:", num_columns)

Number of columns in df_addSurvey_2: 603


In [94]:
def remove_columns_with_high_unique_values(df, threshold):
    """
    Remove columns from a DataFrame that have unique values exceeding the specified threshold.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - threshold (int): The threshold for unique values. Columns with unique values exceeding this threshold will be removed.

    Returns:
    - cleaned_df (DataFrame): The cleaned DataFrame with columns removed.
    """
    categorical_columns = df.select_dtypes(include='object').columns  # 获取分类字段列
    columns_to_remove = []
    for column in categorical_columns:
        unique_values = df[column].nunique()  # 计算唯一值数量
        if unique_values > threshold:
            columns_to_remove.append(column)
    cleaned_df = df.drop(columns=columns_to_remove)  # 删除指定列

    return cleaned_df

threshold = 6
df_addSurvey_3 = remove_columns_with_high_unique_values(df_addSurvey_2, threshold)

In [95]:
num_columns = len(df_addSurvey_3.columns)
print("Number of columns in df_addSurvey_3:", num_columns)

Number of columns in df_addSurvey_3: 552


In [96]:
df_addSurvey_3['t2_sex'].value_counts()

0    6473
1    3185
Name: t2_sex, dtype: int64

In [97]:
df_addSurvey_3.head()

Unnamed: 0,t1_sex,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,...,t2_health_2,t2_health_3,t2_health_4,t2_health_5,t2_health_6,t2_health_7,t2_health_8,t2_health_9,t2_health_10,t3_MAFLD
0,0,59,82.0,101.0,264.0,56.4,21.0,19.0,15.0,9.18,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,0,60,83.0,104.0,255.0,55.8,15.0,18.0,14.0,5.86,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,0,62,85.0,103.0,192.0,60.7,18.0,22.0,17.0,4.56,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
3,0,63,84.0,111.0,164.0,83.0,27.0,30.0,16.0,6.3,...,0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0
4,0,58,61.0,99.0,75.0,60.4,23.0,17.0,10.0,3.75,...,1,1.0,1.0,1.0,1.0,1.0,1.0,1,1,0


In [98]:
df_addSurvey_4 = df_addSurvey_3.drop('t2_sex', axis=1)

num_columns = len(df_addSurvey_4.columns)
print("Number of columns in df_addSurvey_4:", num_columns)

Number of columns in df_addSurvey_4: 551


In [100]:
prefixes = ["t1_", "t2_"]
all_survey_data_renamed_cols = add_prefix(survey_data, prefixes)

#get final_survey_data_included_cols for modeling to define categorical variables
final_survey_data_included_cols = [col for col in df_addSurvey_4.columns if col in all_survey_data_renamed_cols]

In [102]:
't1_smoke' in final_survey_data_included_cols

True

In [105]:
't1_betel' in final_survey_data_included_cols

True

In [109]:
#Modeling
# start modeling preparation
# split categorical and numerical variables,
features = df_addSurvey_4.columns.drop(['t3_MAFLD'])

# categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD', \
#                         't2_w', 't2_smoke', 't2_smoke_q', 't2_coffee', 't2_betel', 't2_DM_determine', 't2_CKD']

def generate_column_names(prefix, column_names):
    new_column_names = []
    for p in prefix:
        new_column_names.extend([p + col for col in column_names])
    return new_column_names

prefix = ['t1_', 't2_']
# categorical_column_names = ['sex', 'w', 'smoke', 'smoke_q', 'coffee', 'betel', 'DM_determine', 'CKD']
categorical_column_names = ['w', 'DM_determine', 'CKD']
categorical_column_names_non_survey = generate_column_names(prefix, categorical_column_names)
categorical_features = categorical_column_names_non_survey + final_survey_data_included_cols + ['t1_sex']
# selected_categorical_column_names = categorical_column_names + final_survey_data_included_cols
# categorical_features = generate_column_names(prefix, selected_categorical_column_names)

numeric_features = df_addSurvey_4.columns.drop(categorical_features).drop(['t3_MAFLD'])
X_categorical = df_addSurvey_4[categorical_features]
X_numeric = df_addSurvey_4[numeric_features]
y = df_addSurvey_4['t3_MAFLD']

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

print('X_conbined shape: ',X_combined.shape)
print('y shape: ',y.shape)

X_conbined shape:  (9658, 4827)
y shape:  (9658,)


In [117]:
X_combined.head(3)

Unnamed: 0,t1_age,t1_waist_y,t1_Glucose_AC_y,t1_Triglyceride_y,t1_HDL_C_y,t1_AST_GOT,t1_ALT_GPT,t1_gamgt,t1_Insulin,t1_T_Cholesterol,...,t2_coffee_<NA>,t2_betel_0,t2_betel_1,t2_betel_<NA>,t2_DM_determine_0,t2_DM_determine_1,t2_CKD_-1,t2_CKD_1,t2_CKD_2,t2_CKD_3
0,0.112647,0.129441,-0.027004,0.793641,-0.134864,-0.3052,-0.352104,-0.397066,0.071699,2.723389,...,0,1,0,0,1,0,0,1,0,0
1,0.199871,0.228791,0.091083,0.743757,-0.173223,-0.858277,-0.408268,-0.443176,-0.346993,2.64838,...,0,1,0,0,1,0,0,0,1,0
2,0.37432,0.42749,0.051721,0.394564,0.14004,-0.581738,-0.183612,-0.304845,-0.510939,0.173079,...,0,1,0,0,1,0,0,0,1,0


In [110]:
# train test split, seed=2023
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=2023)

from sklearn.metrics import roc_auc_score, roc_curve
#Log model
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train, y_train)

# Evaluation
predictions_log = logmodel.predict(X_test)
probabilities_log = logmodel.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class

# Calculate AUC
auc_log = roc_auc_score(y_test, probabilities_log)
print("Logistic model(all important features) with Survey data AUC score: ",auc_log)

# Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

predictions_rf = rf_model.predict(X_test)
accuracy_rf = rf_model.score(X_test, y_test)

# Calculate AUC
probabilities_rf = rf_model.predict_proba(X_test)[:, 1]  # Use probabilities of the positive class
auc_rf = roc_auc_score(y_test, probabilities_rf)
print("Random Forest(all important factors) with Survey data AUC score: ", auc_rf)

Logistic model(all important features) with Survey data AUC score:  0.7953819811368734
Random Forest(all important factors) with Survey data AUC score:  0.8395641787080623


Analysis 3

In [None]:
#check unique values
