# Lab 4
# Sandeep Pandey - 8878312

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error

In [2]:
# Fetch the Diabetes dataset as pandas objects (feature frame + target series)
diabetes_data = load_diabetes(as_frame=True)
X, y = diabetes_data.data, diabetes_data.target

# Three-way split: hold out 30% of the rows, then halve that hold-out into a
# validation set and a test set (roughly 70% / 15% / 15% overall).
# Both splits use the same fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [16]:
# Baseline: ordinary least squares on every feature
linear_model = LinearRegression().fit(X_train, y_train)

# Quadratic regression on BMI alone: the single BMI column is expanded into
# [bmi, bmi^2] (no bias column; LinearRegression fits its own intercept).
bmi_poly = PolynomialFeatures(degree=2, include_bias=False)
bmi_train = X_train[['bmi']].to_numpy()  # 2-D column array, shape (n_rows, 1)
X_bmi_poly = bmi_poly.fit_transform(bmi_train)
poly_bmi_model = LinearRegression()
# Kept as a bare expression so the cell still displays the fitted estimator
poly_bmi_model.fit(X_bmi_poly, y_train)

LinearRegression()

In [17]:
# Degree-2 polynomial regression on all features: learn the expansion on the
# training split, then reuse the fitted transformer for every later split.
poly_features = PolynomialFeatures(degree=2, include_bias=False).fit(X_train)
X_poly = poly_features.transform(X_train)
poly_model = LinearRegression()
# Kept as a bare expression so the cell still displays the fitted estimator
poly_model.fit(X_poly, y_train)

LinearRegression()

In [21]:
# Step 3: Model Evaluation
def evaluate_model(model, X, y):
    """Score a fitted model's predictions against known targets.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix in the form the model was trained on (already
        transformed, for the polynomial models).
    y : true target values aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(r2, mae, mape)`` computed with scikit-learn's ``r2_score``,
        ``mean_absolute_error`` and ``mean_absolute_percentage_error``.
    """
    predictions = model.predict(X)
    return (
        r2_score(y, predictions),
        mean_absolute_error(y, predictions),
        mean_absolute_percentage_error(y, predictions),
    )

# Score every fitted model on the validation split. The polynomial models need
# the validation features pushed through their already-fitted transformers.
bmi_val = X_val[['bmi']].to_numpy()  # 2-D column array, shape (n_rows, 1)
linear_r2, linear_mae, linear_mape = evaluate_model(linear_model, X_val, y_val)
poly_bmi_r2, poly_bmi_mae, poly_bmi_mape = evaluate_model(
    poly_bmi_model, bmi_poly.transform(bmi_val), y_val
)
poly_r2, poly_mae, poly_mape = evaluate_model(
    poly_model, poly_features.transform(X_val), y_val
)

# Printing the results: one two-line block per model, blank line between blocks
validation_report = [
    ("Multivariate Linear Regression", linear_r2, linear_mae, linear_mape),
    ("Polynomial Regression of 2nd Degree on BMI", poly_bmi_r2, poly_bmi_mae, poly_bmi_mape),
    ("Multivariate Polynomial Regression of 2nd Degree", poly_r2, poly_mae, poly_mape),
]
print("\n\n".join(
    f"{name}:\nR-squared: {r2}, MAE: {mae}, MAPE: {mape}"
    for name, r2, mae, mape in validation_report
))

Multivariate Linear Regression:
R-squared: 0.5112606807552806, MAE: 38.21679949379056, MAPE: 0.34616633675614844

Polynomial Regression of 2nd Degree on BMI:
R-squared: 0.29622305527298465, MAE: 48.27302777867063, MAPE: 0.4190243458933215

Multivariate Polynomial Regression of 2nd Degree:
R-squared: 0.3665161345216754, MAE: 42.47818716668599, MAPE: 0.3809723180165827


In [23]:
# Score every fitted model on the held-out test split, reusing the transformers
# that were fitted on the training data.
bmi_test = X_test[['bmi']].to_numpy()  # 2-D column array, shape (n_rows, 1)
linear_test_r2, linear_test_mae, linear_test_mape = evaluate_model(
    linear_model, X_test, y_test
)
poly_bmi_test_r2, poly_bmi_test_mae, poly_bmi_test_mape = evaluate_model(
    poly_bmi_model, bmi_poly.transform(bmi_test), y_test
)
poly_test_r2, poly_test_mae, poly_test_mape = evaluate_model(
    poly_model, poly_features.transform(X_test), y_test
)

# One row per model: (display name, R-squared, MAE, MAPE)
test_report = [
    ("Multivariate Linear Regression", linear_test_r2, linear_test_mae, linear_test_mape),
    ("Polynomial Regression of 2nd Degree on BMI", poly_bmi_test_r2, poly_bmi_test_mae, poly_bmi_test_mape),
    ("Multivariate Polynomial Regression of 2nd Degree", poly_test_r2, poly_test_mae, poly_test_mape),
]

# Per-model summary: a two-line block per model, blank line between blocks
print("\nTest Set Performance:")
print("\n\n".join(
    f"{name}:\nR-squared: {r2}, MAE: {mae}, MAPE: {mape}"
    for name, r2, mae, mape in test_report
))

# Insights and Comparison
# The same numbers regrouped by metric so the models can be compared side by side

# R-squared values
print("\nInsights:")
print("R-squared values:")
print("\n".join(f"{name} R-squared: {r2}" for name, r2, mae, mape in test_report))

# MAE values
print("\nMean Absolute Error (MAE):")
print("\n".join(f"{name} MAE: {mae}" for name, r2, mae, mape in test_report))

# MAPE values
print("\nMean Absolute Percentage Error (MAPE):")
print("\n".join(f"{name} MAPE: {mape}" for name, r2, mae, mape in test_report))


Test Set Performance:
Multivariate Linear Regression:
R-squared: 0.44710298835010365, MAE: 45.56644720821204, MAPE: 0.3869652899172567

Polynomial Regression of 2nd Degree on BMI:
R-squared: 0.259067431590856, MAE: 53.084911736971435, MAPE: 0.45819301631136533

Multivariate Polynomial Regression of 2nd Degree:
R-squared: 0.4521167981327442, MAE: 46.004513397336346, MAPE: 0.39366397469993675

Insights:
R-squared values:
Multivariate Linear Regression R-squared: 0.44710298835010365
Polynomial Regression of 2nd Degree on BMI R-squared: 0.259067431590856
Multivariate Polynomial Regression of 2nd Degree R-squared: 0.4521167981327442

Mean Absolute Error (MAE):
Multivariate Linear Regression MAE: 45.56644720821204
Polynomial Regression of 2nd Degree on BMI MAE: 53.084911736971435
Multivariate Polynomial Regression of 2nd Degree MAE: 46.004513397336346

Mean Absolute Percentage Error (MAPE):
Multivariate Linear Regression MAPE: 0.3869652899172567
Polynomial Regression of 2nd Degree on BMI MA

## How many parameters are we fitting for each of the three models?

Multivariate Linear Regression: There are 10 features. Additionally, there is one intercept term. So, the total number of parameters in this model is 11 (10 for features + 1 for intercept).

Polynomial Regression of 2nd Degree on BMI: In this case, we're creating polynomial features for BMI up to the 2nd degree, which adds a quadratic term for BMI. So, there are three parameters in this model: the coefficient for BMI, the coefficient for BMI^2, and the intercept term.

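Multivariate Polynomial Regression of 2nd Degree: The degree-2 expansion of the 10 features yields 65 polynomial features (10 linear terms, 10 squared terms, and 45 pairwise interaction terms). Adding the intercept term, the total number of parameters in this model is 66 (65 for features + 1 for intercept).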

## Which model would you choose for deployment, and why?

The Multivariate Polynomial Regression of 2nd Degree has the highest R-squared value (0.452), indicating that it explains more of the variance in the target variable compared to the other models. This means it captures the relationships in the data better.

The Multivariate Linear Regression has the lowest MAE (45.57), indicating that, on average, its predictions are closest to the actual values.

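Given these results, I would choose the Multivariate Polynomial Regression of 2nd Degree for deployment. It provides a good balance between capturing the underlying patterns in the data (high R-squared) and making accurate predictions (relatively low MAE). Note, however, that it is the most complex of the three models (65 polynomial features, versus 10 features for the linear model and 2 for the BMI-only model), and on the validation set it scored below the Multivariate Linear Regression (R-squared 0.367 vs. 0.511), so its small advantage on the test set should be weighed against the added complexity and the risk of overfitting.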

In [27]:
# Number of columns produced by the degree-2 expansion of the full feature set
poly_features.get_feature_names_out().size

65