In [1]:
#import libraries and set defaults

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import datasets

In [2]:
# Load the diabetes dataset
# as_frame=True requests pandas containers; later cells index diabetes.data by column name.
diabetes = datasets.load_diabetes(as_frame=True)
# DESCR holds the dataset's own plain-text description (printed below for reference).
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

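Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).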

In [6]:
# Quick look at the raw feature table: its dimensions first, then the leading rows.
features_raw = diabetes.data
print("Diabetis Dataset shape:", features_raw.shape)
print("First 5 rows of features:\n", features_raw.head(5))

Diabetis Dataset shape: (442, 10)
First 5 rows of features:
         age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  


In [24]:
# Build the modelling inputs: the original predictors plus three engineered columns.
y = diabetes.target  # target value

# DataFrame.assign returns a new frame, so diabetes.data itself is left untouched.
# The new columns are appended in the order they are listed here.
x = diabetes.data.assign(
    # obesity × blood pressure interaction
    bmi_bp=lambda frame: frame["bmi"] * frame["bp"],
    # squared age, to allow a nonlinear age effect
    age_squared=lambda frame: frame["age"] ** 2,
    # age × BMI interaction
    age_bmi=lambda frame: frame["age"] * frame["bmi"],
)

print("New diabetis dataset shape with derived features:", x.shape)
print("view dataset:\n", x.head())


New diabetis dataset shape with derived features: (442, 13)
view dataset:
         age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6    bmi_bp  age_squared   age_bmi  
0 -0.002592  0.019907 -0.017646  0.001349     0.001450  0.002349  
1 -0.039493 -0.068332 -0.092204  0.001355     0.000004  0.000097  
2 -0.002592  0.002861 -0.025930 -0.000252     0.007276  0.003792  
3  0.034309  0.022688 -0.009362  0.000425     0.007932  0.001033  
4 -0.002592 -0.031988 -0.046641 -0.000796     0.000029 -0.000196  


In [18]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Train and test split 80% and 20%
# x carries both the original and the derived columns, so the baseline and the
# extended model can later be fitted on exactly the same rows.
# random_state=42 pins the shuffle so the split is reproducible across runs.
x_training, x_testing, y_training, y_testing = train_test_split(
    x, y, test_size=0.2, random_state=42
)


In [25]:
# Baseline: linear regression fitted on the 10 original columns only.
main_column = diabetes.feature_names
reg_base = LinearRegression()
reg_base.fit(x_training[main_column], y_training)
y_prediction = reg_base.predict(x_testing[main_column])

# Extended model: every column of x, i.e. the 10 original features plus the 3 derived ones.
reggression_new_model = LinearRegression()
reggression_new_model.fit(x_training, y_training)
y_prediction_n = reggression_new_model.predict(x_testing)

In [29]:
# Performance comparison on the held-out test split.
# Score both sets of predictions first, then report them in a fixed order.
baseline_r2 = r2_score(y_testing, y_prediction)
baseline_mse = mean_squared_error(y_testing, y_prediction)
extended_r2 = r2_score(y_testing, y_prediction_n)
extended_mse = mean_squared_error(y_testing, y_prediction_n)

print("Baseline R²:", baseline_r2)
print("Baseline MSE:", baseline_mse)
print("With adding 3 more derived features R²:", extended_r2)
print("With adding 3 more derived features MSE:", extended_mse)


Baseline R²: 0.45260276297191926
Baseline MSE: 2900.1936284934823
With adding 3 more derived features R²: 0.48501946113827377
With adding 3 more derived features MSE: 2728.4450424222764


In [28]:
# Cross-validation check: 5-fold R² for a fresh linear regression on each feature set.
# Both runs share the same settings so the two mean scores are directly comparable.
cv_settings = {"cv": 5, "scoring": "r2"}
cv_original = cross_val_score(LinearRegression(), diabetes.data, y, **cv_settings)
cv_add = cross_val_score(LinearRegression(), x, y, **cv_settings)

print("Original  mean R²:", cv_original.mean())
print("With adding more  derived features mean R²:", cv_add.mean())

Original  mean R²: 0.4823164359086422
With adding more  derived features mean R²: 0.4942641201642937
