Feature importance

In [2]:
import joblib 

loaded_model = joblib.load("saved_models/XGBoost_diabetes.pkl")
feature_names = joblib.load("saved_models/diabetes_feature_names.pkl")

importances = loaded_model.feature_importances_
feature_importance = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

for feature, importance in feature_importance[:50]:
    print(f"{feature}: {importance:.4f}")


LBXGH: 0.0471
RXQ033: 0.0188
AUQ410A: 0.0177
DR1HELP: 0.0177
DRD370NQ: 0.0160
RXQ050: 0.0142
DR1TALCO: 0.0132
BAXPF42: 0.0104
DR1TPHOS: 0.0098
MCQ160E: 0.0097
BAQ391B: 0.0093
SLQ320b2000: 0.0093
DRD350H: 0.0093
SMQ725: 0.0090
DR1TCARB: 0.0087
PHDSESNZ: 0.0087
BPQ150: 0.0077
INDFMMPI: 0.0077
DPQ100: 0.0076
MCQ160B: 0.0076
LBXDHE: 0.0075
DRD370AQ: 0.0072
LUARXNC: 0.0072
DR1TVB12: 0.0070
LBXTC: 0.0069
DRD350JQ: 0.0069
DR1TPOTA: 0.0067
BAARFC22: 0.0067
PAD790Qcombined5397605346934028e79b: 0.0067
BAQ491: 0.0066
PHACOFMN: 0.0066
BAQ321B: 0.0066
SMQ681: 0.0064
BPXOPLS2: 0.0064
PHAGUMMN: 0.0060
DRD370DQ: 0.0060
MCQ035: 0.0060
SMD650: 0.0059
LBXHGB: 0.0058
OHQ845: 0.0058
BMXLEG: 0.0056
BPQ101D: 0.0056
KIQ022: 0.0055
BPQ080: 0.0055
WTDRD1: 0.0054
DR1TVC: 0.0054
PHACOFHR: 0.0052
LBXLUH: 0.0052
IMQ060: 0.0051
BAQ321C: 0.0051


In [3]:
# Ensure target is aligned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import re
from sklearn.impute import KNNImputer, SimpleImputer
from dataset_processing import get_columns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import joblib

# Load dataset
df = pd.read_csv('NHANES_SAMPLE.csv')

# Define target column
target_column, remove_list = get_columns('diab')

# Ensure target column exists
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset!")

# Handle missing target values
df = df.dropna(subset=[target_column])

# Convert target column to binary classification (1 = diabetes, else 0)
df[target_column] = df[target_column].apply(lambda x: 1 if x == 1 else 0)

# Convert all byte-string values to normal strings
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

# Select all feature columns (excluding target)
feature_columns = [col for col in df.columns if col != target_column]
feature_columns = [col for col in df.columns if col not in remove_list]

# Define activity columns to combine
activity_columns = [("PAD790Q", "PAD790U"), ("PAD810Q", "PAD810U")]
# Combine activity columns
for freq_col, unit_col in activity_columns:
    if freq_col in df.columns and unit_col in df.columns:
        combined_col = f"{freq_col}_combined"
        df[combined_col] = df[freq_col].astype(str) + "_" + df[unit_col].astype(str)
        df.drop([freq_col, unit_col], axis=1, inplace=True)
        feature_columns.remove(freq_col)
        feature_columns.remove(unit_col)
        feature_columns.append(combined_col)


df =df[feature_columns]

# Separate categorical and numerical columns
categorical_cols = df[feature_columns].select_dtypes(include=['object']).columns.tolist()
numerical_cols = [col for col in feature_columns if col not in categorical_cols]

# One-hot encode categorical features
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Handle missing values using KNN imputer
knn_imputer = KNNImputer(n_neighbors=3) 
# df.fillna(df.median(), inplace=True)
df = df.dropna(axis=1, how='all')
df[:] = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns, index=df.index)

df.columns = [col.replace("_", "") for col in df.columns]
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X = df.drop(columns=[target_column])

y = df[target_column]

X_corr = X.copy()
X_corr[target_column] = y

# Compute correlation with target
correlations = X_corr.corr(numeric_only=True)[target_column].abs()

# Select top-k features based on correlation
top_k = 50
top_corr_features = correlations.sort_values(ascending=False).head(top_k).index.tolist()

# Remove the target itself from the list
top_corr_features = [f for f in top_corr_features if f != target_column]

print(f"Top {top_k} correlation-based features:", top_corr_features)


  df[:] = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns, index=df.index)


Top 50 correlation-based features: ['LBXGH', 'LBXGLU', 'LBDGLUSI', 'RXQ050', 'HUQ010', 'BPQ020', 'FNQ440', 'BMXWAIST', 'RXQ033', 'RIDAGEYR', 'RXQ510', 'LBXTC', 'LBDTCSI', 'OHQ845', 'LUXCAPM', 'FNQ490', 'BPQ101D', 'DMDEDUC2', 'FNDADI', 'LBDHDD', 'LBDHDDSI', 'PAD810Qcombined5397605346934028e79b', 'LUAPNMEbM', 'FNDAEDI', 'LBXSF6SI', 'BMXBMI', 'LBXDHE', 'LBDDHESI', 'BPQ080', 'LBXRDW', 'OCD150', 'AUQ054', 'RHQ031', 'PAD790Qcombined5397605346934028e79b', 'LUXSMED', 'KIQ481', 'BPXOPLS1', 'BMXWT', 'DRQSDIET', 'DRD370M', 'BAXTC41', 'LBDINSI', 'LBXIN', 'AUQ101', 'BMXARMC', 'LUAPNMEbXL', 'BAQ321C', 'MCQ160A', 'OHQ630']


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Lasso (L1) Logistic Regression
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, C=0.01, random_state=42)
lasso.fit(X_scaled, y)

# Extract non-zero coefficients
lasso_selected_features = X.columns[(lasso.coef_ != 0).ravel()].tolist()

print(f"LASSO selected {len(lasso_selected_features)} features:", lasso_selected_features)


LASSO selected 9 features: ['LBXGH', 'LBXTC', 'LBDTCSI', 'BPQ020', 'BPQ080', 'HUQ010', 'FNQ440', 'RXQ033', 'RXQ050']
