In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# import shap

ModuleNotFoundError: No module named 'shap'

In [19]:
diabetes_df = pd.read_csv("diabetes_prediction_dataset.csv")
framingham_df = pd.read_csv("framingham.csv")

In [31]:
diabetes_df = diabetes_df.rename(columns={
    'bmi': 'BMI',  
    'gender': 'sex'
})
framingham_df = framingham_df.rename(columns={
    'male': 'sex' 
})

diabetes_df['sex'] = diabetes_df['sex'].map({'Male':1, 'Female':0, 'Other':0})
framingham_df['hypertension'] = framingham_df['prevalentHyp']

shared_features = ['age', 'sex', 'BMI']
diabetes_specific = ['HbA1c_level', 'blood_glucose_level','hypertension']
heart_specific = ['sysBP', 'diaBP', 'totChol']

def prepare_data(df, features, target):
    """Helper function to prepare data with error checking"""
    missing_cols = [col for col in features + [target] if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns missing: {missing_cols}")
    df = df[features + [target]].dropna()
    X = df[features]
    y = df[target]
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
try:
    X_train_d, X_test_d, y_train_d, y_test_d = prepare_data(diabetes_df, shared_features + diabetes_specific, 'diabetes')
    X_train_h, X_test_h, y_train_h, y_test_h = prepare_data(framingham_df, shared_features + heart_specific, 'TenYearCHD')
except ValueError as e:
    print(f"Data preparation error: {e}")
    print("\nAvailable columns in diabetes dataset:", diabetes_df.columns.tolist())
    print("Available columns in Framingham dataset:", framingham_df.columns.tolist())

Data preparation error: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Available columns in diabetes dataset: ['sex', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'BMI', 'HbA1c_level', 'blood_glucose_level', 'diabetes', 'male']
Available columns in Framingham dataset: ['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'hypertension']


In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load datasets with dtype specification for mixed types
diabetes_df = pd.read_csv("diabetes_prediction_dataset.csv", dtype={'BMI': float})
framingham_df = pd.read_csv("framingham.csv", dtype={'BMI': float})

# Standardize column names (confirmed from your error output)
diabetes_df = diabetes_df.rename(columns={
    'blood_glucose_level': 'glucose',
    'bmi': 'BMI'
})
framingham_df = framingham_df.rename(columns={
    'totChol': 'cholesterol',
    'sysBP': 'systolic_bp',
    'diaBP': 'diastolic_bp',
    'male': 'gender' ,
    'prevalentHyp' : 'hypertension'
})

diabetes_df['gender'] = diabetes_df['gender'].map({'Male':1, 'Female':0, 'Other':0})

# Define FINAL feature sets (using verified column names)
shared_features = ['age', 'gender', 'BMI']  
diabetes_specific = ['HbA1c_level', 'glucose']
heart_specific = ['systolic_bp', 'diastolic_bp', 'cholesterol']

# Data cleaning pipeline
def clean_data(df, features, target):
    # Convert all feature columns to numeric, coercing errors
    for col in features:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Drop rows where target is missing
    df = df.dropna(subset=[target])
    
    # Drop rows with missing features
    df_clean = df.dropna(subset=features)
    
    # Check if data remains
    if len(df_clean) == 0:
        raise ValueError(f"No samples left after cleaning for target: {target}")
    
    return df_clean[features], df_clean[target]

# Prepare data
X_d, y_d = clean_data(diabetes_df, shared_features + diabetes_specific, 'diabetes')
X_h, y_h = clean_data(framingham_df, shared_features + heart_specific, 'TenYearCHD')

# Align dataset sizes
common_size = min(len(X_d), len(X_h))
X_d = X_d.iloc[:common_size]
y_d = y_d.iloc[:common_size]
X_h = X_h.iloc[:common_size]
y_h = y_h.iloc[:common_size]

# Train-test split
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.2, random_state=42)
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)

print("Aligned diabetes shape:", X_train_d_aligned.shape)
print("Aligned heart shape:", X_train_h_aligned.shape)

# Scale features
scaler = StandardScaler()
X_train_d[shared_features] = scaler.fit_transform(X_train_d[shared_features])
X_test_d[shared_features] = scaler.transform(X_test_d[shared_features])
X_train_h[shared_features] = scaler.fit_transform(X_train_h[shared_features])
X_test_h[shared_features] = scaler.transform(X_test_h[shared_features])

Aligned diabetes shape: (3337, 5)
Aligned heart shape: (3337, 6)


In [78]:
# Cell 2: Model Construction
# Shared trunk
shared_input = Input(shape=(len(shared_features),), name='shared_input')
x = Dense(64, activation='relu')(shared_input)

# Diabetes branch
diabetes_input = Input(shape=(len(diabetes_specific),), name='diabetes_input')
diabetes_features = Concatenate()([shared_input, diabetes_input])
diabetes_out = Dense(1, activation='sigmoid', name='diabetes')(Dense(32, activation='relu')(diabetes_features))

# Heart branch
heart_input = Input(shape=(len(heart_specific),), name='heart_input')
heart_features = Concatenate()([shared_input, heart_input])
heart_out = Dense(1, activation='sigmoid', name='heart')(Dense(32, activation='relu')(heart_features))

# Combined model
model = Model(
    inputs=[shared_input, diabetes_input, heart_input],
    outputs=[diabetes_out, heart_out]
)

model.compile(
    optimizer='adam',
    loss={'diabetes': 'binary_crossentropy', 'heart': 'binary_crossentropy'},
    loss_weights=[0.4, 0.6],
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

In [86]:
history = model.fit(
    [X_train_d[shared_features], X_train_d[diabetes_specific], X_train_h[heart_specific]],
    [y_train_d, y_train_h],
    validation_data=(
        [X_test_d[shared_features], X_test_d[diabetes_specific], X_test_h[heart_specific]],
        [y_test_d, y_test_h]
    ),
    epochs=50,
    batch_size=32,
)

Epoch 1/50




[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - diabetes_accuracy: 0.5541 - diabetes_loss: 1.1039 - heart_auc: 0.4444 - heart_loss: 2.7163 - loss: 3.8202



[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - diabetes_accuracy: 0.5554 - diabetes_loss: 1.1004 - heart_auc: 0.4443 - heart_loss: 2.7085 - loss: 3.8090 - val_diabetes_accuracy: 0.8958 - val_diabetes_loss: 0.1600 - val_heart_auc: 0.4017 - val_heart_loss: 0.5241 - val_loss: 0.6979
Epoch 2/50
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - diabetes_accuracy: 0.9159 - diabetes_loss: 0.1452 - heart_auc: 0.4122 - heart_loss: 0.4973 - loss: 0.6425 - val_diabetes_accuracy: 0.8958 - val_diabetes_loss: 0.1513 - val_heart_auc: 0.4182 - val_heart_loss: 0.4057 - val_loss: 0.5665
Epoch 3/50
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - diabetes_accuracy: 0.9181 - diabetes_loss: 0.1354 - heart_auc: 0.4144 - heart_loss: 0.3694 - loss: 0.5047 - val_diabetes_accuracy: 0.8958 - val_diabetes_loss: 0.1439 - val_heart_auc: 0.4443 - val_heart_loss: 0.3171 - val_loss: 0.4680
Epoch 4/50
[1m105/105[0m [32m━━━━━━━━━━━━

In [87]:
from sklearn.metrics import roc_auc_score, classification_report

# Diabetes evaluation
diabetes_pred = model.predict([X_test_d[shared_features], X_test_d[diabetes_specific], np.zeros_like(X_test_h[heart_specific])])[0]
print("Diabetes AUC:", roc_auc_score(y_test_d, diabetes_pred))
print(classification_report(y_test_d, np.round(diabetes_pred)))

# Heart disease evaluation
heart_pred = model.predict([X_test_h[shared_features], np.zeros_like(X_test_d[diabetes_specific]), X_test_h[heart_specific]])[1]
print("\nHeart Disease AUC:", roc_auc_score(y_test_h, heart_pred))
print(classification_report(y_test_h, np.round(heart_pred)))

[1m23/27[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step  



[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Diabetes AUC: 0.907861577232774
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       748
           1       0.86      0.14      0.24        87

    accuracy                           0.91       835
   macro avg       0.88      0.57      0.59       835
weighted avg       0.90      0.91      0.88       835

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Heart Disease AUC: 0.6316456563442038
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       712
           1       0.50      0.03      0.06       123

    accuracy                           0.85       835
   macro avg       0.68      0.51      0.49       835
weighted avg       0.80      0.85      0.79       835



In [92]:
!pip install shap

Collecting shap
  Downloading shap-0.47.1-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.47.1-cp312-cp312-win_amd64.whl (490 kB)
   ---------------------------------------- 0.0/490.6 kB ? eta -:--:--
   --- ----------------------------------- 41.0/490.6 kB 991.0 kB/s eta 0:00:01
   ---------------------------------------  481.3/490.6 kB 7.5 MB/s eta 0:00:01
   ---------------------------------------- 490.6/490.6 kB 6.2 MB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.47.1 slicer-0.0.8


In [96]:
def predict_risk(age, sex, bmi, hba1c=None, glucose=None, sysbp=None, diabp=None, chol=None):
    # Scale shared features
    shared_scaled = scaler.transform([[age, sex, bmi]])
    
    # Prepare inputs (handle missing features)
    diabetes_input = np.array([[hba1c, glucose]]) if hba1c else np.zeros((1, len(diabetes_specific)))
    heart_input = np.array([[sysbp, diabp, chol]]) if sysbp else np.zeros((1, len(heart_specific)))
    
    # Predict
    diabetes_prob, heart_prob = model.predict(
        [shared_scaled, diabetes_input, heart_input],
        verbose=0
    )
    
    return {
        'diabetes_risk': float(diabetes_prob[0][0] * 100),
        'heart_disease_risk': float(heart_prob[0][0] * 100)
    }

In [98]:
print("\nSample Prediction:")
print(predict_risk(
    age=55,
    sex=1,  # male
    bmi=28,
    hba1c=6.5,
    glucose=140,
    sysbp=145,
    diabp=90,
    chol=240
))


Sample Prediction:
{'diabetes_risk': 11.785471439361572, 'heart_disease_risk': 13.291500508785248}




In [None]:
loaded_model = tf.keras.models.load_model('mtl_model.keras')

model.load_weights('model_weights.h5')  

# Load TFLite model
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()