In [None]:
import numpy as np
import pandas as pd

from sklearn import datasets

# Load the diabetes dataset; as_frame=True makes the returned object expose
# pandas containers (`.data`, `.target`, `.frame`).
diabetes = datasets.load_diabetes(as_frame=True)

# Work on an independent copy of the combined features + target frame.
# `diabetes.frame` belongs to the loaded dataset object: binding it directly
# would make every later column assignment on `df` modify the dataset itself,
# which is hidden state for any cell that reads `diabetes.frame` afterwards.
df = diabetes.frame.copy()


Original features and derived features:


In [8]:

# --- Derived Features ---
#
# NOTE: per the scikit-learn documentation, load_diabetes() returns feature
# columns that are already mean-centered and scaled (default `scaled=True`), so
# the terms below are built from standardized values rather than raw clinical
# units (e.g. `bmi_squared` is the squared deviation from the mean BMI).
#
# The columns are added with DataFrame.assign(), which returns a NEW frame:
# whatever object `df` was bound to before this cell is left untouched, and
# re-running the cell yields the same result.
df = df.assign(
    # 1. BMI Squared
    # Reasoning: The relationship between BMI and diabetes risk is often non-linear.
    # Squaring the BMI captures a potential accelerating effect, where risk increases
    # disproportionately at higher BMI values. This allows the model to better fit
    # the non-linear relationship observed in medical data.
    bmi_squared=df['bmi'] ** 2,

    # 2. Interaction between BMI and S5 (log of serum triglycerides)
    # Reasoning: This feature captures the synergistic effect of obesity and high blood fat.
    # An individual with both a high BMI and high triglyceride levels is at a significantly
    # greater risk. The product of these two features serves as an interaction term,
    # representing a more complex metabolic state than either feature alone.
    bmi_s5_interaction=df['bmi'] * df['s5'],

    # 3. Interaction between BP (blood pressure) and S1 (total cholesterol)
    # Reasoning: Both high blood pressure and high cholesterol are key components of
    # metabolic syndrome, which is a strong precursor to type 2 diabetes. The co-occurrence
    # of these conditions is more predictive of risk than their individual effects.
    # This interaction term allows the model to give higher weight to individuals
    # who exhibit this combination of risk factors.
    bp_s1_interaction=df['bp'] * df['s1'],
)

# --- Display the new DataFrame with derived features ---

print("Original features and derived features:")
print(df.head())

Original features and derived features:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  bmi_squared  bmi_s5_interaction  \
0 -0.002592  0.019907 -0.017646   151.0     0.003806            0.001228   
1 -0.039493 -0.068332 -0.092204    75.0     0.002650            0.003517   
2 -0.002592  0.002861 -0.025930   141.0     0.001976            0.000127   
3  0.034309  0.022688 -0.009362   206.0     0.000134           -0.000263   
4 -0.002592 -0.031988 -0.046641   135.0     0.001324            0.001164   

   bp_s1_interaction  
0          -0.000967  
1     

In [10]:
# --- Model Training and Evaluation ---
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# ----------------------------------------------------
# Step 1: Baseline Model (using only original features)
# ----------------------------------------------------

# Target and feature matrix exactly as provided by the dataset object
y = diabetes.target
X_baseline = diabetes.data

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
(X_train_base,
 X_test_base,
 y_train,
 y_test) = train_test_split(X_baseline, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training rows (fit() returns the estimator itself)
baseline_model = LinearRegression().fit(X_train_base, y_train)

# Predict the held-out rows and score the predictions
y_pred_base = baseline_model.predict(X_test_base)
r2_base = r2_score(y_test, y_pred_base)
mse_base = mean_squared_error(y_test, y_pred_base)

print("--- Baseline Model Performance (Original Features Only) ---")
print(f"Mean Squared Error (MSE): {mse_base:.2f}")
print(f"R-squared (R2): {r2_base:.2f}")

# ----------------------------------------------------
# Step 2: Enhanced Model (with original + derived features)
# ----------------------------------------------------

# Build the enhanced frame as a NEW DataFrame. assign() returns a copy, so the
# dataset's own `diabetes.frame` is not modified by adding the derived columns
# (a plain `df_enhanced = diabetes.frame` would only alias it).
source_frame = diabetes.frame
df_enhanced = source_frame.assign(
    bmi_squared=source_frame['bmi'] ** 2,
    bmi_s5_interaction=source_frame['bmi'] * source_frame['s5'],
    bp_s1_interaction=source_frame['bp'] * source_frame['s1'],
)

# Define enhanced features (X_enhanced): every column except the target
X_enhanced = df_enhanced.drop(columns=['target'])

# Reuse the baseline split instead of drawing a second one. Selecting rows by
# the index labels of y_train / y_test makes the alignment between the enhanced
# features and the targets explicit, rather than relying on two separate
# train_test_split calls happening to shuffle identically.
X_train_enh = X_enhanced.loc[y_train.index]
X_test_enh = X_enhanced.loc[y_test.index]

# Initialize and train the enhanced model
enhanced_model = LinearRegression()
enhanced_model.fit(X_train_enh, y_train)

# Make predictions and evaluate on the same held-out rows as the baseline
y_pred_enh = enhanced_model.predict(X_test_enh)
mse_enh = mean_squared_error(y_test, y_pred_enh)
r2_enh = r2_score(y_test, y_pred_enh)

print("\n--- Enhanced Model Performance (with Derived Features) ---")
print(f"Mean Squared Error (MSE): {mse_enh:.2f}")
print(f"R-squared (R2): {r2_enh:.2f}")

# ----------------------------------------------------
# Step 3: Compare the Results
# ----------------------------------------------------

print("\n--- Comparison ---")

# A strictly higher R-squared on the test set counts as an improvement
r2_verdict = (
    "The R-squared value for the enhanced model is HIGHER. This indicates an improvement."
    if r2_enh > r2_base
    else "The R-squared value for the enhanced model is NOT HIGHER. This does not indicate an improvement."
)
print(r2_verdict)

# A strictly lower mean squared error on the test set counts as an improvement
mse_verdict = (
    "The Mean Squared Error for the enhanced model is LOWER. This indicates an improvement."
    if mse_enh < mse_base
    else "The Mean Squared Error for the enhanced model is NOT LOWER. This does not indicate an improvement."
)
print(mse_verdict)

--- Baseline Model Performance (Original Features Only) ---
Mean Squared Error (MSE): 2900.19
R-squared (R2): 0.45

--- Enhanced Model Performance (with Derived Features) ---
Mean Squared Error (MSE): 2847.43
R-squared (R2): 0.46

--- Comparison ---
The R-squared value for the enhanced model is HIGHER. This indicates an improvement.
The Mean Squared Error for the enhanced model is LOWER. This indicates an improvement.
