In [1]:
import pandas as pd
import os

# Set working directory
data_dir = r"E:\Project + App"

# File names
files = {
    "DEMO": "P_DEMO.XPT",
    "GHB": "P_GHB.XPT",
    "ALB_CR": "P_ALB_CR.XPT",
    "MCQ": "P_MCQ.XPT",
    "SMQ": "P_SMQ.XPT",
    "PAQ": "P_PAQ.XPT",
    "GLU": "P_GLU.XPT"
}

# Load all files into a dictionary of DataFrames
data = {}
for key, fname in files.items():
    fpath = os.path.join(data_dir, fname)
    data[key] = pd.read_sas(fpath)

print("✅ All datasets loaded!")

✅ All datasets loaded!


In [2]:
for key, df in data.items():
    print(f"\n🔹 {key}: {df.shape[0]} rows, {df.shape[1]} columns")
    print(df.columns[:10])  # Show first 10 columns


🔹 DEMO: 15560 rows, 29 columns
Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'DMDBORN4'],
      dtype='object')

🔹 GHB: 10409 rows, 2 columns
Index(['SEQN', 'LBXGH'], dtype='object')

🔹 ALB_CR: 13027 rows, 8 columns
Index(['SEQN', 'URXUMA', 'URXUMS', 'URDUMALC', 'URXUCR', 'URXCRS', 'URDUCRLC',
       'URDACT'],
      dtype='object')

🔹 MCQ: 14986 rows, 63 columns
Index(['SEQN', 'MCQ010', 'MCQ025', 'MCQ035', 'MCQ040', 'MCQ050', 'AGQ030',
       'MCQ053', 'MCQ080', 'MCQ092'],
      dtype='object')

🔹 SMQ: 11137 rows, 16 columns
Index(['SEQN', 'SMQ020', 'SMD030', 'SMQ040', 'SMQ050Q', 'SMQ050U', 'SMD057',
       'SMQ078', 'SMD641', 'SMD650'],
      dtype='object')

🔹 PAQ: 9693 rows, 17 columns
Index(['SEQN', 'PAQ605', 'PAQ610', 'PAD615', 'PAQ620', 'PAQ625', 'PAD630',
       'PAQ635', 'PAQ640', 'PAD645'],
      dtype='object')

🔹 GLU: 5090 rows, 4 columns
Index(['SEQN', 'WTSAFPRP', 'LBXGLU', 'LBDGLUSI'], dtype='object'

In [3]:
# Start with DEMO as the base
df = data['DEMO']

# Merge each of the other files onto df
merge_keys = ['GHB', 'ALB_CR', 'MCQ', 'SMQ', 'PAQ', 'GLU']

for key in merge_keys:
    df = pd.merge(df, data[key], on='SEQN', how='inner')

print(f"✅ Merged dataset shape: {df.shape}")

✅ Merged dataset shape: (4438, 133)


In [4]:
selected_columns = [
    'SEQN',
    # Demographics
    'RIAGENDR', 'RIDAGEYR', 'RIDRETH1',

    # GHB
    'LBXGH',   # Glycohemoglobin %

    # ALB/CR
    'URXUMA', 'URXUCR',  # Albumin, Creatinine in urine

    # MCQ: medical condition Qs
    'MCQ010',  # Ever told you had arthritis
    'MCQ160A', # Ever told you had kidney disease

    # SMQ: smoking
    'SMQ020',  # Smoked at least 100 cigs
    'SMD030',  # Smoke now?

    # PAQ: physical activity
    'PAQ605',  # Moderate activity?
    'PAQ620',  # Vigorous activity?

    # GLU
    'LBXGLU',  # Blood glucose (mg/dL)
]

# Filter the merged dataset to only keep these
df = df[selected_columns]

print(f"✅ Final selected shape: {df.shape}")
df.head()

✅ Final selected shape: (4438, 14)


Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,LBXGH,URXUMA,URXUCR,MCQ010,MCQ160A,SMQ020,SMD030,PAQ605,PAQ620,LBXGLU
0,109271.0,1.0,49.0,3.0,5.6,2.4,32.0,1.0,1.0,1.0,18.0,2.0,1.0,103.0
1,109274.0,1.0,68.0,5.0,5.7,12.8,120.0,2.0,1.0,2.0,,1.0,1.0,154.0
2,109282.0,1.0,76.0,3.0,5.5,16.0,192.0,2.0,2.0,1.0,18.0,2.0,2.0,95.0
3,109286.0,2.0,33.0,5.0,5.7,6.8,74.0,2.0,2.0,2.0,,2.0,2.0,92.0
4,109290.0,2.0,68.0,4.0,8.4,22.4,272.0,2.0,1.0,2.0,,2.0,1.0,106.0


In [5]:
df = df.dropna()
print(f"✅ After dropping missing: {df.shape}")

✅ After dropping missing: (1689, 14)


In [6]:
# Create binary target: 1 = Yes (has CKD), 0 = No
df['CKD'] = df['MCQ160A'].apply(lambda x: 1 if x == 1 else 0)

# Drop the original column as we’ve used it to make the label
df = df.drop(columns=['MCQ160A'])

print(df['CKD'].value_counts())

CKD
0    1068
1     621
Name: count, dtype: int64


In [7]:
# Step 1: Load ALB_CR data (if not already done)
alb_cr = pd.read_csv('E:/Project + App/ALB_CR.csv')  # or read_sas if in .XPT

# Step 2: Calculate ACR
alb_cr['ACR'] = (alb_cr['URXUMA'] / alb_cr['URXUCR']) * 88.4

# Step 3: Define CKD based on ACR
alb_cr['CKD'] = (alb_cr['ACR'] >= 30).astype(int)

# Step 4: Check CKD distribution
print(alb_cr['CKD'].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: 'E:/Project + App/ALB_CR.csv'

In [None]:
import pandas as pd

file_path = 'E:/Project + App/ALB_CR_J.XPT'
alb_cr = pd.read_sas(file_path)

alb_cr.head()

In [None]:
alb_cr['ACR'] = (alb_cr['URXUMA'] / alb_cr['URXUCR']) * 88.4

In [None]:
alb_cr['CKD_lab'] = alb_cr['ACR'].apply(lambda x: 1 if x >= 30 else 0)

In [None]:
alb_cr['CKD_lab'].value_counts()

In [None]:
# Step 1: Keep only SEQN and CKD_lab from alb_cr
ckd_label_df = alb_cr[['SEQN', 'CKD_lab']]

# Step 2: Merge on SEQN
final_df = pd.merge(merged_data, ckd_label_df, on='SEQN', how='inner')

# Step 3: Drop any remaining missing values
final_df = final_df.dropna()

# Step 4: Check result
print("Final shape after merge:", final_df.shape)
print(final_df['CKD_lab'].value_counts())

In [None]:
# Load the demographic file if not already loaded
import pandas as pd
demo_df = pd.read_sas("E:/Project + App/P_DEMO.XPT")  # adjust if file name differs

# Keep only relevant features
demo_df = demo_df[['SEQN', 'RIDAGEYR', 'RIAGENDR', 'RIDRETH3', 'DMDEDUC2', 'INDFMPIR']]
demo_df = demo_df.rename(columns={
    'RIDAGEYR': 'Age',
    'RIAGENDR': 'Gender',
    'RIDRETH3': 'Race',
    'DMDEDUC2': 'Education',
    'INDFMPIR': 'IncomeRatio'
})

In [None]:
# Use your existing CKD-labeled dataframe
ckd_label_df = alb_cr[['SEQN', 'CKD_lab']]

# Merge demographics with CKD labels
merged_data = pd.merge(demo_df, ckd_label_df, on='SEQN', how='inner')

# Drop missing values
merged_data = merged_data.dropna()

# Check size and class balance
print("Merged data shape:", merged_data.shape)
print(merged_data['CKD_lab'].value_counts())

In [None]:
# Check how many IDs are common
common_ids = set(demo_df['SEQN']).intersection(set(alb_cr['SEQN']))
print("Number of common SEQN IDs:", len(common_ids))

# Optionally view a few
print("Sample common IDs:", list(common_ids)[:10])

In [None]:
demo_df['SEQN'].head()
demo_df['SEQN'].min(), demo_df['SEQN'].max()

In [None]:
alb_cr['SEQN'].head()
alb_cr['SEQN'].min(), alb_cr['SEQN'].max()

In [None]:
import pandas as pd

alb_cr = pd.read_sas(r"E:\Project + App\ALB_CR_J.XPT")

# Quick preview
print(alb_cr[['SEQN', 'URXUMA', 'URXUCR']].head())
print("SEQN range:", alb_cr['SEQN'].min(), "-", alb_cr['SEQN'].max())

In [None]:
# Step 1: Calculate ACR and Create CKD_lab label
alb_cr = alb_cr.copy()
alb_cr['ACR'] = (alb_cr['URXUMA'] / alb_cr['URXUCR']) * 100

# Step 2: Apply CKD_lab label
alb_cr['CKD_lab'] = alb_cr['ACR'].apply(lambda x: 1 if x >= 30 else 0)

# Step 3: Filter relevant columns
ckd_label_df = alb_cr[['SEQN', 'ACR', 'CKD_lab']].dropna()

# Step 4: Quick summary
print("CKD_lab value counts:")
print(ckd_label_df['CKD_lab'].value_counts())

In [None]:
print(f"Features SEQN range: {final_features_df['SEQN'].min()} - {final_features_df['SEQN'].max()}")


In [None]:
# Step 1: Merge demographic and lab data (example only)
merged_features_df = pd.merge(demo_df, lab_df, on='SEQN', how='inner')

# Add lifestyle or other datasets the same way
merged_features_df = pd.merge(merged_features_df, diet_df, on='SEQN', how='inner')  # Optional

# Assign to final_features_df for consistency
final_features_df = merged_features_df

# Sanity check SEQN range
print(f"Features SEQN range: {final_features_df['SEQN'].min()} - {final_features_df['SEQN'].max()}")


In [None]:
alb_cr = pd.read_sas('ALB_CR_J.XPT')

In [None]:
alb_cr = pd.read_sas(r'‪E:\Project + App\ALB_CR_J.xpt')

In [None]:
file_path = r'E:\Project + App\ALB_CR_J.xpt'
print(os.path.exists(file_path))  # Should return True

In [None]:
# Re-type the path manually or use raw string without hidden characters
alb_cr = pd.read_sas(r'E:\Project + App\ALB_CR_J.xpt')

In [None]:
# Merge albumin-creatinine data with CKD label dataset
merged_df = pd.merge(CKD_lab_df, alb_cr, on='SEQN', how='inner')

# Check result
print(f"Merged shape: {merged_df.shape}")
print(merged_df.head())

In [None]:
creatinine_df = pd.read_sas(r"E:\Project + App\BIOPRO_J.xpt")  # adjust path
print(creatinine_df.columns.tolist())

In [None]:
import pandas as pd

# Load biochemistry and demographics
biopro_df = pd.read_sas(r"E:\Project + App\BIOPRO_J.xpt")
demo_df = pd.read_sas(r"E:\Project + App\P_DEMO.xpt")

In [None]:
print("Biochemistry Profile columns:")
print(biopro_df.columns.tolist())

print("\nDemographic columns:")
print(demo_df.columns.tolist())

In [None]:
# Merge on SEQN
df = pd.merge(biopro_df, demo_df, on='SEQN', how='inner')

# Rename for clarity
df = df.rename(columns={
    'LBXSCR': 'Creatinine',
    'LBXSGL': 'Glucose',
    'LBXSCH': 'Cholesterol',
    'LBXSUA': 'UricAcid',
    'LBXSCK': 'CreatineKinase',
    'RIDAGEYR': 'Age',
    'RIAGENDR': 'Sex',
    'RIDRETH3': 'Race'
})

# Create CKD label (custom threshold based on sex)
def classify_ckd(row):
    if row['Sex'] == 1:  # Male
        return 1 if row['Creatinine'] > 1.2 else 0
    elif row['Sex'] == 2:  # Female
        return 1 if row['Creatinine'] > 1.0 else 0
    else:
        return np.nan

biopro_df['CKD_lab'] = biopro_df.apply(classify_ckd, axis=1)

# Drop rows with missing values in key features
df_model = df[['Age', 'Sex', 'Race', 'Creatinine', 'Glucose', 'Cholesterol', 'UricAcid', 'CreatineKinase', 'CKD_lab']].dropna()

# Check target distribution
print(df_model['CKD_lab'].value_counts())

In [None]:
# Assuming 'Creatinine' column exists and is numeric
def classify_ckd(row):
    if pd.notnull(row['Creatinine']):
        return 1 if row['Creatinine'] > 1.0 else 0
    else:
        return np.nan

# Apply function
df['CKD_lab'] = df.apply(classify_ckd, axis=1)

In [None]:
print(biopro_df.columns)

In [None]:
merged_df = biopro_df.merge(demo_df[['SEQN', 'SEXN']], on='SEQN', how='left')

In [None]:
demo_df.columns

In [None]:
merged_df = biopro_df.merge(demo_df[['SEQN', 'RIAGENDR']], on='SEQN', how='left')


In [None]:
def classify_ckd(row):
    if row['RIAGENDR'] == 1:  # Male
        return 1 if row['LBXSCR'] > 1.2 else 0
    elif row['RIAGENDR'] == 2:  # Female
        return 1 if row['LBXSCR'] > 1.0 else 0
    else:
        return np.nan

merged_df['CKD_lab'] = merged_df.apply(classify_ckd, axis=1)

In [None]:
import numpy as np

In [None]:
print(merged_df['CKD_lab'].value_counts(dropna=False))

In [None]:
print('LBXSCR' in merged_df.columns)

In [None]:
print("Total rows:", len(merged_df))
print("Missing LBXSCR:", merged_df['LBXSCR'].isna().sum())
print("Missing RIAGENDR:", merged_df['RIAGENDR'].isna().sum())

In [None]:
# Step 1: Check data types
print("biopro_df SEQN type:", biopro_df['SEQN'].dtype)
print("demo_df SEQN type:", demo_df['SEQN'].dtype)

# Step 2: Standardize SEQN to int (if needed)
biopro_df['SEQN'] = biopro_df['SEQN'].astype(int)
demo_df['SEQN'] = demo_df['SEQN'].astype(int)

# Step 3: Merge again
merged_df = pd.merge(biopro_df, demo_df, on='SEQN', how='left')  # or 'inner'

# Step 4: Confirm merge
print("Merged rows:", len(merged_df))
print("Missing RIAGENDR after merge:", merged_df['RIAGENDR'].isna().sum())

In [None]:
# How many SEQN values in common?
common_ids = set(biopro_df['SEQN']).intersection(set(demo_df['SEQN']))
print("Number of matching SEQN values:", len(common_ids))

# Also check how many total unique SEQNs in each
print("Unique SEQN in biopro_df:", biopro_df['SEQN'].nunique())
print("Unique SEQN in demo_df:", demo_df['SEQN'].nunique())

In [None]:
# Convert SEQN to integer in both dataframes
biopro_df['SEQN'] = biopro_df['SEQN'].astype(int)
demo_df['SEQN'] = demo_df['SEQN'].astype(int)

# Re-check overlap
matching_seqn = set(biopro_df['SEQN']).intersection(set(demo_df['SEQN']))
print(f"Number of matching SEQN values after fix: {len(matching_seqn)}")

# Now try merging
merged_df = pd.merge(biopro_df, demo_df, on='SEQN', how='left')

# Validate
print("Merged rows:", len(merged_df))
print("Missing RIAGENDR after merge:", merged_df['RIAGENDR'].isna().sum())

In [None]:
import pandas as pd

# Load SAS transport files (.xpt)
biopro_df = pd.read_sas(r'E:\Project + App\BIOPRO_J.xpt', format='xport')
demo_df = pd.read_sas(r'E:\Project + App\P_DEMO.xpt', format='xport')

# Check the first few rows
print(biopro_df.head())
print(demo_df.head())

In [None]:
biopro_df['SEQN'] = biopro_df['SEQN'].astype(int)
demo_df['SEQN'] = demo_df['SEQN'].astype(int)

In [None]:
matching_ids = set(biopro_df['SEQN']) & set(demo_df['SEQN'])
print(f"Number of matching SEQN values: {len(matching_ids)}")

In [None]:
demo_df = pd.read_sas(r"E:\Project + App\BIOPRO_J.xpt")  # Load demographics for Cycle J

In [None]:
# Ensure SEQN is integer
biopro_df['SEQN'] = biopro_df['SEQN'].astype(int)
demo_df['SEQN'] = demo_df['SEQN'].astype(int)

# Merge
merged_df = pd.merge(biopro_df, demo_df, on='SEQN', how='inner')
print("Merged rows:", merged_df.shape[0])

In [None]:
# Step 1: Feature list
selected_cols = [
    'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'INDFMPIR', 'DMDEDUC2',
    'LBXSCR', 'LBXGH', 'LBDGLUSI', 'LBDHDL', 'LBDLDL',
    'LBXSATSI', 'LBXSKSI', 'LBXSBU',
    'CKD_lab'
]

# Step 2: Subset and drop missing values
data = merged_df[selected_cols].dropna()

# Step 3: Encode target as binary
data['CKD_lab'] = data['CKD_lab'].astype(int)

# Step 4: Encode categoricals
data['RIAGENDR'] = data['RIAGENDR'].replace({1: 'Male', 2: 'Female'})
data['RIDRETH1'] = data['RIDRETH1'].astype(str)
data['DMDEDUC2'] = data['DMDEDUC2'].astype(str)

# One-hot encode categorical vars
data_encoded = pd.get_dummies(data, columns=['RIAGENDR', 'RIDRETH1', 'DMDEDUC2'], drop_first=True)

# Step 5: Create X and y
X = data_encoded.drop(columns=['CKD_lab'])
y = data_encoded['CKD_lab']

# Display status
print(f"✅ Final feature shape: {X.shape}")
print("✅ CKD Class distribution:\n", y.value_counts())

In [None]:
print(merged_df.columns.tolist())

In [None]:
merged_df[['LBXSCR_x', 'LBXSCR_y']].head()

In [None]:
# Drop all '_y' columns
merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_y')])

# Rename '_x' columns to remove the suffix
merged_df.columns = merged_df.columns.str.replace('_x$', '', regex=True)

In [None]:
selected_cols = [
    'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'INDFMPIR', 'DMDEDUC2',
    'LBXSCR', 'LBXGH', 'LBDGLUSI', 'LBDHDL', 'LBDLDL',
    'LBXSATSI', 'LBXSKSI', 'LBXSBU',
    'CKD_lab'  # or whichever label you’ve defined
]

data = merged_df[selected_cols].dropna()

In [None]:
print(merged_df.columns.tolist())

In [None]:
selected_cols = [
    'LBXSCR',     # Serum creatinine
    'LBXSATSI',   # Sodium
    'LBXSBU',     # BUN
    'LBXSKSI',    # Potassium
    'LBXSGL',     # Glucose
    'LBXSCH',     # Cholesterol
    'LBXSCA',     # Calcium
    'LBXSUA',     # Uric acid
    'LBXSGTSI',   # SGPT
    'CKD_lab'     # This will be created using eGFR
]

In [None]:
# CKD label: if creatinine >= 1.3 mg/dL (simplified proxy), assume CKD
merged_df['CKD_lab'] = merged_df['LBXSCR'].apply(lambda x: 1 if x >= 1.3 else 0)

In [None]:
data = merged_df[selected_cols].dropna()

In [None]:
# Define features and target
X = data.drop(columns='CKD_lab')
y = data['CKD_lab']

print("Class balance:\n", y.value_counts())

In [None]:
from sklearn.model_selection import train_test_split

X = data.drop(columns='CKD_lab')
y = data['CKD_lab']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and label
X = data.drop(columns='CKD_lab')
y = data['CKD_lab']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Train logistic regression
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

# Print evaluation metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_scaled, y_train)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_sm, y_train_sm)

# Predict and evaluate
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]

# Print evaluation metrics
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob_rf))

In [None]:
!pip install shap

In [None]:
import shap

# Create SHAP explainer for tree-based models
explainer = shap.TreeExplainer(rf, model_output="probability")
shap_values = explainer.shap_values(X_test)

# Plot summary of feature importance
shap.summary_plot(shap_values[1], X_test_scaled, feature_names=X.columns)

In [None]:
import pandas as pd

# Re-wrap the scaled test set with column names
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)

# Now plot using SHAP
shap.summary_plot(shap_values[1], X_test_df)

In [None]:
import shap

# Use raw output (default), no background data needed
explainer = shap.TreeExplainer(rf)

# Use unscaled test set for explanation
shap_values = explainer.shap_values(X_test)

# If X_test is a NumPy array, convert to DataFrame
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Plot SHAP summary for class 1 (CKD)
shap.summary_plot(shap_values[1], X_test_df)

In [None]:
import numpy as np

In [None]:
print(type(shap_values))          # list or np.ndarray?
print(len(shap_values))           # should be 2 for binary classification
print(np.array(shap_values).shape)  # will show shape of each class
print(X_test_df.shape)           # confirm feature count

In [None]:
# Extract SHAP values for class 1 (CKD)
shap_values_class1 = shap_values[:, :, 1]  # shape: (1180, 9)

# SHAP summary plot
shap.summary_plot(shap_values_class1, X_test_df)

In [None]:
import matplotlib.pyplot as plt

# Save summary plot to file
shap.summary_plot(shap_values[:, :, 1], X_test_df, show=False)
plt.tight_layout()
plt.savefig("shap_summary_ckd.png", dpi=300, bbox_inches='tight')
plt.close()

In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(x=shap_importance.values, y=shap_importance.index, hue=shap_importance.index, palette="viridis", legend=False)
plt.xlabel("Mean |SHAP Value|")
plt.title("Feature Importance (Mean Absolute SHAP Values)")
plt.tight_layout()
plt.savefig("shap_bar_plot_ckd.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values_single, X_single)

In [None]:
# Choose a single test instance (e.g., the 10th sample)
i = 10
X_single = X_test_df.iloc[i:i+1]  # Keep as DataFrame for SHAP
print("CKD status (actual):", y_test[i])  # Optional: see actual label

# Extract SHAP values for class 1 (CKD) only
shap_values_single = shap_values[i, :, 1]

# Display force plot
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values_single, X_single)

In [None]:
# Loop using .iloc to access by position
for idx in range(len(y_test)):
    if y_test.iloc[idx] == 1:      # access by position
        i = idx
        break

# Select the CKD-positive sample
X_single = X_test_df.iloc[i:i+1]                  # as DataFrame
shap_values_single = shap_values[i, :, 1]         # SHAP for class 1 (CKD)

print(f"CKD status (actual): {y_test.iloc[i]} (Index: {i})")

# Show SHAP force plot
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values_single, X_single)

In [None]:
# Calculate mean absolute SHAP values for feature importance
shap_importance = pd.DataFrame({
    "Feature": X_test_df.columns,
    "SHAP Importance": np.abs(shap_values[:, :, 1]).mean(axis=0)
}).sort_values("SHAP Importance", ascending=False)

print(shap_importance.head(10))

In [None]:
shap.dependence_plot("LBXSCR", shap_values[:, :, 1], X_test_df)

In [None]:
import shap
import matplotlib.pyplot as plt

# Dependence plot for BUN (LBXSBU)
shap.dependence_plot(
    ind="LBXSBU",
    shap_values=shap_values,
    features=X_test,
    interaction_index="LBXSCR",  # Interaction with Creatinine
    show=False
)
plt.title("SHAP Dependence Plot: BUN (LBXSBU)")
plt.tight_layout()
plt.show()="LBXSBU")

In [None]:
import shap
import matplotlib.pyplot as plt

# Dependence plot for BUN (LBXSBU)
shap.dependence_plot(
    ind="LBXSBU",
    shap_values=shap_values,
    features=X_test,
    interaction_index="LBXSCR",  # Interaction with Creatinine
    show=False
)
plt.title("SHAP Dependence Plot: BUN (LBXSBU)")
plt.tight_layout()
plt.show()

In [None]:
print(shap_values.shape)
print(X_test.shape)

In [None]:
# Extract SHAP values for class 1 (CKD)
shap_values_class1 = shap_values[:, :, 1]


In [None]:
import shap
import matplotlib.pyplot as plt

# Dependence plot
shap.dependence_plot(
    ind="LBXSBU",                      # Main feature: BUN
    shap_values=shap_values_class1,   # Use SHAP values for class 1 (CKD)
    features=X_test,
    interaction_index="LBXSCR",       # Interacting feature: Creatinine
    show=False
)
plt.title("SHAP Dependence Plot: BUN vs Creatinine")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Generate and save SHAP dependence plot
shap.dependence_plot(
    "LBXSBU",             # BUN
    shap_values[:, :, 1], # SHAP values for class 1 (CKD = 1)
    X_test_df,
    interaction_index="LBXSCR", # Creatinine
    show=False
)

plt.tight_layout()
plt.savefig("shap_dependence_bun_vs_creatinine.png", dpi=300, bbox_inches='tight')
plt.close()

In [None]:
# Create data folder if it doesn't exist
import os
os.makedirs("../data", exist_ok=True)

# Save the final DataFrame as CSV
merged_df.to_csv("../data/ckd_simulated_input.csv", index=False)

In [8]:
for var in dir():
    if isinstance(eval(var), pd.DataFrame):
        print(var)

_
_4
df


In [10]:
import os
os.makedirs("../data", exist_ok=True)
df.to_csv("../data/ckd_simulated_input.csv", index=False)