In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pickle
import warnings
from warnings import filterwarnings
import tensorflow as tf
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import shap
from sklearn.preprocessing import StandardScaler

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the ML libraries.
filterwarnings("ignore")
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

In [13]:
def load_data():
    """Read the calorie and exercise CSVs and combine them into one frame.

    The ``Calories`` column of ``calories.csv`` is appended column-wise to
    the contents of ``exercise.csv``. Rows are paired by index position,
    not by matching ``User_ID`` values. The ``User_ID`` column is then
    removed from the result.

    Returns:
        pandas.DataFrame: the exercise columns plus ``Calories``, without
        ``User_ID``.
    """
    calories = pd.read_csv("calories.csv")
    exercise = pd.read_csv("exercise.csv")
    combined = pd.concat([exercise, calories["Calories"]], axis=1)
    return combined.drop(columns=["User_ID"])

def process_features(df):
    """Build the modelling frame: gender indicator(s) followed by numeric columns.

    Args:
        df: DataFrame containing a ``Gender`` column plus numeric columns.

    Returns:
        pandas.DataFrame: the one-hot encoding of ``Gender`` (first category
        dropped) placed in front of every numeric column of ``df``.
    """
    parts = [
        # One-hot encoding; drop_first avoids a redundant indicator column.
        pd.get_dummies(df["Gender"], drop_first=True),
        df.select_dtypes(include=[np.number]),
    ]
    return pd.concat(parts, axis="columns")

In [14]:
# Feature analysis visualizations
def create_visualizations(data):
    """Save distribution histograms and a correlation heatmap for ``data``.

    Two image files are written to the working directory:

    * ``numerical_distributions.png`` - histograms (with KDE overlay) for up
      to eight columns, laid out on a 3x3 grid. The first column of ``data``
      is skipped.
    * ``correlation_heatmap.png`` - annotated heatmap of ``data.corr()``.

    Args:
        data: DataFrame whose first column is the gender indicator (skipped
            in the histograms) and whose remaining columns are plotted.
    """
    max_panels = 8

    # Numerical distributions
    plt.figure(figsize=(20, 15))
    feature_cols = data.columns[1:]  # Skip Male column
    for position, column in enumerate(feature_cols[:max_panels], start=1):
        plt.subplot(3, 3, position)
        sns.histplot(data[column], kde=True)
        plt.xlabel(column, fontsize=12)
    plt.tight_layout()
    plt.savefig('numerical_distributions.png')
    plt.close()

    # Correlation heatmap
    plt.figure(figsize=(10, 8))
    correlations = data.corr()
    sns.heatmap(correlations, cmap='Blues', annot=True)
    plt.title('Feature Correlation')
    plt.savefig('correlation_heatmap.png')
    plt.close()

In [15]:
def explain_with_shap(model, X_data, feature_names, model_type='traditional', title_suffix=""):
    """Save a SHAP summary plot and a single-row force plot for ``model``.

    At most 100 rows are drawn from ``X_data`` (fixed seed) and explained.
    Two files are written: ``shap_summary_<title_suffix>.png`` and
    ``shap_force_<title_suffix>.png``. Any failure is reported on stdout as
    ``SHAP Error: ...`` instead of being raised, so a SHAP problem never
    aborts the caller.

    Args:
        model: fitted model to explain.
        X_data: DataFrame of feature rows to sample from.
        feature_names: list of feature labels passed to the SHAP plots.
        model_type: ``'traditional'`` (uses ``shap.TreeExplainer``) or
            ``'neural_net'`` (uses ``shap.DeepExplainer``). Any other value
            is reported as an error.
        title_suffix: text inserted into the output file names.
    """
    print(f"\nGenerating SHAP explanations ({model_type})...")
    sample = X_data.sample(min(100, len(X_data)), random_state=1)

    try:
        sample_array = sample.values.astype(np.float32)

        if model_type == 'traditional':
            explainer = shap.TreeExplainer(model)
        elif model_type == 'neural_net':
            bg_sample_size = min(50, sample_array.shape[0])
            # Seeded generator so the background set (and therefore the
            # explanations) is the same on every run.
            rng = np.random.default_rng(1)
            indices = rng.permutation(sample_array.shape[0])[:bg_sample_size]
            background = sample_array[indices]
            explainer = shap.DeepExplainer(model, background)
        else:
            # Previously an unknown type surfaced as an unrelated
            # "local variable referenced before assignment" message.
            raise ValueError(f"Unsupported model_type: {model_type!r}")

        shap_values = explainer.shap_values(sample_array)
        if isinstance(shap_values, list):
            shap_values = shap_values[0]
        shap_values = np.asarray(shap_values)
        # A single-output model can yield (rows, features, 1); drop the
        # trailing axis so the plots receive a 2-D (rows, features) matrix.
        # NOTE(review): this 3-D shape is the likely cause of the
        # "only integer scalar arrays can be converted to a scalar index"
        # failure seen for the neural net - confirm with the installed SHAP.
        if shap_values.ndim == 3 and shap_values.shape[-1] == 1:
            shap_values = shap_values[..., 0]

        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values, sample_array, feature_names=feature_names, show=False)
        plt.savefig(f'shap_summary_{title_suffix}.png', bbox_inches='tight')
        plt.close()

        expected_value = explainer.expected_value
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        # Tensor-like objects (anything exposing .numpy()) are converted so
        # the scalar extraction below applies to them as well.
        if hasattr(expected_value, 'numpy'):
            expected_value = expected_value.numpy()
        if isinstance(expected_value, np.ndarray) and expected_value.size == 1:
            expected_value = expected_value.item()

        # Force plot for the first sampled row only.
        force_instance = sample_array[0]
        plt.figure()
        shap.force_plot(expected_value, shap_values[0], force_instance,
                        feature_names=feature_names, matplotlib=True, show=False)
        plt.savefig(f'shap_force_{title_suffix}.png', bbox_inches='tight')
        plt.close()

    except Exception as e:
        # Best-effort: explanations are optional, so report and continue.
        print(f"SHAP Error: {str(e)}")

In [16]:
def plot_predictions_heatmap(y_true, y_pred, model_label, task):
    """Save a 30x30 binned heatmap of true versus predicted values.

    The joint distribution of ``y_true`` and ``y_pred`` is binned with
    ``np.histogram2d`` and the transposed counts are drawn with seaborn.
    The figure is written to ``heatmap_<task>_<model_label>.png``.

    Args:
        y_true: observed target values.
        y_pred: predicted target values.
        model_label: model name used in the title and file name.
        task: target name used in the axis labels, title and file name.
    """
    # Only the counts are needed; the bin edges are discarded.
    counts = np.histogram2d(y_true, y_pred, bins=30)[0]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(counts.T, cmap="coolwarm", annot=False, ax=ax)
    ax.set_xlabel(f"True {task}")
    ax.set_ylabel(f"Predicted {task}")
    ax.set_title(f"Heatmap of {task} Predictions ({model_label})")
    fig.savefig(f"heatmap_{task}_{model_label}.png", bbox_inches='tight')
    plt.close(fig)

def plot_feature_importance(model, feature_names, model_label, task):
    """Save a bar chart of a model's feature importances or coefficients.

    ``model.feature_importances_`` is used when present, otherwise
    ``model.coef_``. If the model has neither, a message is printed and
    nothing is plotted. The chart is written to
    ``feature_importance_<task>_<model_label>.png``. Any exception is caught
    and printed rather than raised.

    Args:
        model: fitted estimator.
        feature_names: labels matching the importance values.
        model_label: model name used in the title and file name.
        task: target name used in the title and file name.
    """
    try:
        # Preference order: tree-style importances, then linear coefficients.
        for attribute in ('feature_importances_', 'coef_'):
            if hasattr(model, attribute):
                importances = getattr(model, attribute)
                break
        else:
            print(f"No feature importance available for {model_label}")
            return

        ranking = pd.DataFrame(
            {'Feature': feature_names, 'Importance': importances}
        ).sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=ranking)
        plt.title(f'Feature Importance for {model_label} ({task})')
        plt.tight_layout()
        plt.savefig(f'feature_importance_{task}_{model_label}.png', bbox_inches='tight')
        plt.close()
    except Exception as e:
        print(f"Error plotting feature importance for {model_label}: {e}")

In [17]:
# Neural Network Model Definitions
def build_calorie_nn(input_shape):
    """Create and compile the calorie-regression network.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(1).
    Compiled with the Adam optimizer, MSE loss and an MAE metric.

    Args:
        input_shape: number of input features (an int, despite the name).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(input_shape,)))
    network.add(tf.keras.layers.Dense(64, activation='relu'))
    network.add(tf.keras.layers.Dense(1))
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

def train_nn_calorie(X_train, X_val, y_train, y_val):
    """Train the calorie neural network and save its loss curves.

    The network from ``build_calorie_nn`` is fitted for up to 200 epochs
    with a batch size of 32. Training stops early once validation loss has
    not improved for 10 epochs, and the weights from the best epoch are
    restored. The training and validation loss history is written to
    ``training_history.png``.

    Args:
        X_train: training features; the column count sets the input width.
        X_val: validation features.
        y_train: training targets.
        y_val: validation targets.

    Returns:
        The fitted Keras model carrying the best-epoch weights.
    """
    model = build_calorie_nn(X_train.shape[1])
    # Without restore_best_weights the returned model would keep the weights
    # from the last epoch run, i.e. `patience` epochs after the best one.
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    )
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Loss curves for a quick visual check of convergence.
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig('training_history.png')
    plt.close()

    return model

In [18]:
def build_calorie_nn(input_shape):
    """Create and compile the calorie-regression network.

    NOTE(review): this repeats the ``build_calorie_nn`` definition from the
    previous cell line for line; being later, it is the one in effect after
    a top-to-bottom run. One of the two can be deleted.

    Args:
        input_shape: number of input features (an int, despite the name).

    Returns:
        A compiled ``tf.keras.Sequential`` model: two ReLU hidden layers
        (128 and 64 units) and a single linear output unit, using the Adam
        optimizer, MSE loss and an MAE metric.
    """
    dense = tf.keras.layers.Dense
    stack = [
        dense(128, activation='relu', input_shape=(input_shape,)),
        dense(64, activation='relu'),
        dense(1),
    ]
    regressor = tf.keras.Sequential(stack)
    regressor.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return regressor

def train_calorie_model(X_train, X_test, y_train, y_test):
    """Fit several regressors and return the one with the highest R2.

    XGBoost, linear regression, a decision tree and a random forest are
    each fitted on the training data and scored on the supplied hold-out
    data. R2, MAE and RMSE are printed for every model. The stochastic
    estimators use a fixed ``random_state`` so the selection is repeatable.

    Args:
        X_train: training features.
        X_test: hold-out features used for scoring and model selection.
        y_train: training targets.
        y_test: hold-out targets.

    Returns:
        The fitted estimator with the highest hold-out R2 score.
    """
    def predict(ml_model, model_name):
        """Fit one model, print its hold-out metrics, return (model, r2)."""
        model = ml_model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Computed once and reused for both the report and the selection.
        r2 = metrics.r2_score(y_test, y_pred)

        print(f'\n{model_name} Performance (Calorie Prediction):')
        print(f'R2 Score: {r2:.4f}')
        print(f'MAE: {metrics.mean_absolute_error(y_test, y_pred):.2f}')
        print(f'RMSE: {np.sqrt(metrics.mean_squared_error(y_test, y_pred)):.2f}')
        return model, r2

    print("\n" + "="*50)
    print("Training Calorie Models")
    print("="*50)

    # Seed matches the random_state=1 used for the data splits.
    models = {
        'XGBRegressor': XGBRegressor(random_state=1),
        'LinearRegression': LinearRegression(),
        'DecisionTree': DecisionTreeRegressor(random_state=1),
        'RandomForest': RandomForestRegressor(random_state=1)
    }

    best_model = None
    best_score = -np.inf

    for name, model in models.items():
        current_model, score = predict(model, name)
        if score > best_score:
            best_score = score
            best_model = current_model

    return best_model

def train_models(X_train, X_val, y_train, y_val):
    """Train the classical model and the neural network on the same split.

    Args:
        X_train: training features.
        X_val: validation features.
        y_train: training targets.
        y_val: validation targets.

    Returns:
        tuple: ``(best classical model, neural network)``, in that order.
    """
    split = (X_train, X_val, y_train, y_val)
    best_classical = train_calorie_model(*split)
    neural_net = train_nn_calorie(*split)
    return best_classical, neural_net

def test_models(models, X_test, y_test):
    """Evaluate models and return test predictions"""
    calorie_model, calorie_nn = models

    return {
        'traditional': calorie_model.predict(X_test),
        'neural_net': calorie_nn.predict(X_test).flatten()
    }

def generate_shap(models, X_sample):
    """Produce SHAP plots for the classical model and the neural network.

    Args:
        models: pair ``(classical model, neural network)``.
        X_sample: DataFrame of rows to explain; its column names are used
            as the feature labels.
    """
    traditional_model, nn_model = models

    # Calorie explanations: (model, explainer kind, output-file suffix)
    jobs = (
        (traditional_model, 'traditional', "Calorie_Traditional"),
        (nn_model, 'neural_net', "Calorie_NeuralNet"),
    )
    for estimator, kind, suffix in jobs:
        explain_with_shap(
            estimator,
            X_sample,
            feature_names=X_sample.columns.tolist(),
            model_type=kind,
            title_suffix=suffix,
        )


def print_metrics(y_true, y_pred):
    """Print R2, MAE and RMSE for a set of predictions.

    Args:
        y_true: observed target values.
        y_pred: predicted target values.
    """
    r2 = metrics.r2_score(y_true, y_pred)
    print(f"R2 Score: {r2:.4f}")

    mae = metrics.mean_absolute_error(y_true, y_pred)
    print(f"MAE: {mae:.2f}")

    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"RMSE: {rmse:.2f}")

In [19]:
def main():
    """Run the full pipeline: load, visualise, train, evaluate, save, explain.

    The data is split 80/20 into a working set and a final test set; the
    working set is split again 75/25 into training and validation sets
    (60/20/20 overall). ``Body_Temp`` and ``Height`` are excluded from the
    features. The best classical model is pickled to ``calorie_model.pkl``
    and the neural network is saved to ``calorie_nn.h5``.
    """
    # Data processing
    df = load_data()
    processed_data = process_features(df)

    create_visualizations(processed_data)

    # Split data (remove Body_Temp and Height from features)
    temp_data, final_test_data = train_test_split(
        processed_data.drop(columns=['Body_Temp', 'Height']),  # Remove unused columns
        test_size=0.2,
        random_state=1
    )

    X = temp_data.drop(columns=['Calories'])
    y = temp_data['Calories']

    # Train/val split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.25, random_state=1)

    # Train models
    models = train_models(X_train, X_val, y_train, y_val)

    # Prepare test data
    X_test = final_test_data.drop(columns=['Calories'])
    y_test = final_test_data['Calories']

    # Test models
    calorie_preds = test_models(models, X_test, y_test)

    # Print results
    print("\n" + "="*50)
    print("Final Test Set Evaluation")
    print("="*50)

    for model_type in ['traditional', 'neural_net']:
        print(f"\nCalorie Prediction ({model_type}):")
        print_metrics(y_test, calorie_preds[model_type])

    # Save models
    with open('calorie_model.pkl', 'wb') as f:
        pickle.dump(models[0], f)
    models[1].save('calorie_nn.h5')

    # SHAP explanations
    print("\n" + "="*50)
    print("Generating SHAP Explanations")
    print("="*50)
    # Cap the sample size: DataFrame.sample raises when asked for more rows
    # than exist, which would crash on a test split under 100 rows.
    shap_rows = min(100, len(X_test))
    generate_shap(models, X_test.sample(shap_rows, random_state=1))

# Run the pipeline when this cell/script is executed directly
# (the condition is false when the code is imported as a module).
if __name__ == "__main__":
    main()


Training Calorie Models

XGBRegressor Performance (Calorie Prediction):
R2 Score: 0.9988
MAE: 1.49
RMSE: 2.17

LinearRegression Performance (Calorie Prediction):
R2 Score: 0.9581
MAE: 9.67
RMSE: 12.88

DecisionTree Performance (Calorie Prediction):
R2 Score: 0.9924
MAE: 3.43
RMSE: 5.47

RandomForest Performance (Calorie Prediction):
R2 Score: 0.9977
MAE: 1.84
RMSE: 3.02
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step





Final Test Set Evaluation

Calorie Prediction (traditional):
R2 Score: 0.9986
MAE: 1.53
RMSE: 2.38

Calorie Prediction (neural_net):
R2 Score: 0.9989
MAE: 1.57
RMSE: 2.10

Generating SHAP Explanations

Generating SHAP explanations (traditional)...

Generating SHAP explanations (neural_net)...
SHAP Error: only integer scalar arrays can be converted to a scalar index
