# Testing Linear Regression Assumptions - Diabetes Dataset

This notebook tests the key assumptions of linear regression:

1. **Linearity**: Relationship between predictors and target is linear
2. **Independence**: Observations are independent
3. **Homoscedasticity**: Constant variance of residuals
4. **Normality**: Residuals are normally distributed
5. **No Multicollinearity**: Features are not highly correlated
6. **No Autocorrelation**: Residuals are not correlated with each other (checked together with Independence, using the Durbin-Watson statistic)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro, normaltest, jarque_bera, anderson

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Statsmodels diagnostics: heteroscedasticity test, VIF, Durbin-Watson
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson

# Plot appearance and global NumPy seed for the whole notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
np.random.seed(42)

# Title banner: rule, heading, rule (one per line)
print("=" * 80, "LINEAR REGRESSION ASSUMPTIONS TESTING", "=" * 80, sep="\n")

## Load and Prepare Data

In [None]:
# Fetch the bundled diabetes dataset and expose its pieces under short names
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
feature_names = diabetes.feature_names

# Tabular view: one column per feature, with the response appended as 'target'
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Size and naming overview
print("Dataset Information:")
print(f"  Samples: {len(df)}")
print(f"  Features: {len(feature_names)}")
print(f"\nFeature names: {', '.join(feature_names)}")
print(f"\nDataset shape: {df.shape}")

# Preview of the table followed by summary statistics of the response
for heading, content in (
    ("\nFirst few rows:", df.head()),
    ("\nTarget variable statistics:", df['target'].describe()),
):
    print(heading)
    print(content)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least squares fit on the training rows (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Fitted values for both splits
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Residual = observed - fitted
residuals_train, residuals_test = y_train - y_train_pred, y_test - y_test_pred

# Goodness-of-fit metrics per split
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("\nModel Performance:")
print(f"  Training R²:   {train_r2:.4f}")
print(f"  Test R²:       {test_r2:.4f}")
print(f"  Training RMSE: {train_rmse:.2f}")
print(f"  Test RMSE:     {test_rmse:.2f}")

# Coefficient table, largest magnitude first
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)

print("\nModel Coefficients (sorted by absolute value):")
print(coef_df.to_string(index=False))
print(f"\nIntercept: {model.intercept_:.2f}")

## Assumption 1: Linearity

**Test**: The relationship between each predictor and the target should be linear.

**Methods**:
- Residuals vs Fitted values plot (should show random scatter)
- Partial regression plots for each feature (not generated in this notebook)
- Scatter plots of features vs target

In [None]:
print("\n" + "=" * 80)
print("ASSUMPTION 1: LINEARITY")
print("=" * 80)

# One residuals-vs-fitted panel per split, side by side.
# Each entry: (split label, fitted values, residuals, extra scatter styling).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    ("Training", y_train_pred, residuals_train, {}),
    ("Test", y_test_pred, residuals_test, {'color': 'orange'}),
)

for ax, (split, fitted, resid, style) in zip(axes, panels):
    ax.scatter(fitted, resid, alpha=0.5, **style)
    # Zero line: residuals should straddle it without structure
    ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    ax.set_xlabel('Fitted Values', fontsize=12)
    ax.set_ylabel('Residuals', fontsize=12)
    ax.set_title(f'Residuals vs Fitted Values ({split})', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ Interpretation:")
print("  - Random scatter around zero line → Linearity assumption satisfied")
print("  - Patterns (curved, funnel) → Linearity assumption violated")

In [None]:
# Grid of scatter plots: each feature against the target, with a straight-line fit
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.ravel()
target_values = df['target']

for idx, feature in enumerate(feature_names):
    ax = axes[idx]
    feature_values = df[feature]
    ax.scatter(feature_values, target_values, alpha=0.5)

    # Degree-1 least-squares fit, drawn as a dashed red line
    z = np.polyfit(feature_values, target_values, 1)
    ax.plot(feature_values, np.polyval(z, feature_values), "r--", linewidth=2)

    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Target', fontsize=10)
    ax.set_title(f'{feature} vs Target', fontsize=11)
    ax.grid(True, alpha=0.3)

plt.suptitle('Feature vs Target Relationships', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## Assumption 2: Independence of Observations

**Test**: Observations should be independent of each other.

**Methods**:
- Durbin-Watson test (tests for autocorrelation)
- Residuals vs Index plot

**Durbin-Watson Statistic**:
- Values range from 0 to 4
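- ~2 indicates no autocorrelation
- Values well below 2 indicate positive autocorrelation
- Values well above 2 indicate negative autocorrelation
- This notebook treats 1.5–2.5 as "no significant autocorrelation" (a rule of thumb, not a formal significance test)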

In [None]:
print("\n" + "=" * 80)
print("ASSUMPTION 2: INDEPENDENCE")
print("=" * 80)

# Durbin-Watson statistic on the training residuals.
# NOTE(review): the statistic depends on row order, and residuals_train follows the
# order produced by the train/test split rather than the dataset's original order —
# confirm that this ordering is the one of interest.
dw_stat = durbin_watson(residuals_train)

print(f"\nDurbin-Watson Statistic: {dw_stat:.4f}")
print("\nInterpretation:")
dw_message = (
    "  ✓ No significant autocorrelation (independence satisfied)"
    if 1.5 < dw_stat < 2.5
    else "  ✗ Positive autocorrelation detected (independence violated)"
    if dw_stat <= 1.5
    else "  ✗ Negative autocorrelation detected (independence violated)"
)
print(dw_message)

# Residuals plotted against their position in each split.
# Each entry: (split label, residuals, extra scatter styling).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
index_panels = (
    ("Training", residuals_train, {}),
    ("Test", residuals_test, {'color': 'orange'}),
)

for ax, (split, resid, style) in zip(axes, index_panels):
    ax.scatter(range(len(resid)), resid, alpha=0.5, **style)
    ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    ax.set_xlabel('Observation Index', fontsize=12)
    ax.set_ylabel('Residuals', fontsize=12)
    ax.set_title(f'Residuals vs Index ({split})', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ Look for patterns or trends in residuals over index")
print("  - Random scatter → Independence satisfied")
print("  - Patterns/trends → May indicate autocorrelation")

## Assumption 3: Homoscedasticity (Constant Variance)

**Test**: Residuals should have constant variance across all levels of fitted values.

**Methods**:
- Scale-Location plot (sqrt of standardized residuals vs fitted values)
- Breusch-Pagan test
- Residuals vs Fitted plot (already shown in Linearity)

In [None]:
print("\n" + "=" * 80)
print("ASSUMPTION 3: HOMOSCEDASTICITY")
print("=" * 80)

# Residuals scaled by their own standard deviation (no leverage adjustment)
std_residuals_train = residuals_train / np.std(residuals_train)
std_residuals_test = residuals_test / np.std(residuals_test)

# Scale-Location plot: sqrt(|scaled residual|) against fitted values
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Training
axes[0].scatter(y_train_pred, np.sqrt(np.abs(std_residuals_train)), alpha=0.5)
axes[0].set_xlabel('Fitted Values', fontsize=12)
axes[0].set_ylabel('√|Standardized Residuals|', fontsize=12)
axes[0].set_title('Scale-Location Plot (Training)', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Trend line: moving average over observations sorted by fitted value.
# x and y are averaged over the SAME window so each trend point sits at the
# centre of the data it summarises (pairing the average with the window's
# right-most x would shift the line to the right).
sorted_idx = np.argsort(y_train_pred)
x_smooth = y_train_pred[sorted_idx]
y_smooth = np.sqrt(np.abs(std_residuals_train[sorted_idx]))
window = min(20, len(x_smooth))  # never wider than the data itself
kernel = np.ones(window) / window
x_ma = np.convolve(x_smooth, kernel, mode='valid')
y_ma = np.convolve(y_smooth, kernel, mode='valid')
axes[0].plot(x_ma, y_ma, 'r-', linewidth=2, label='Trend')
axes[0].legend()

# Test
axes[1].scatter(y_test_pred, np.sqrt(np.abs(std_residuals_test)), alpha=0.5, color='orange')
axes[1].set_xlabel('Fitted Values', fontsize=12)
axes[1].set_ylabel('√|Standardized Residuals|', fontsize=12)
axes[1].set_title('Scale-Location Plot (Test)', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ Interpretation:")
print("  - Horizontal line with equal spread → Homoscedasticity satisfied")
print("  - Funnel shape or trend → Heteroscedasticity (violation)")

# Heuristic check: residual variance in the lower vs upper half of fitted values.
# The ratio is always larger-over-smaller, so it is >= 1 by construction.
low_fitted = y_train_pred < np.median(y_train_pred)
high_fitted = y_train_pred >= np.median(y_train_pred)

var_low = np.var(residuals_train[low_fitted])
var_high = np.var(residuals_train[high_fitted])
variance_ratio = max(var_low, var_high) / min(var_low, var_high)

print(f"\nVariance Ratio (larger/smaller, low- vs high-fitted halves): {variance_ratio:.4f}")
print("  - Ratio close to 1 → Homoscedasticity")
print("  - Ratio >> 1 → Heteroscedasticity")

# Formal test: Breusch-Pagan regresses the squared residuals on the predictors.
# The test requires an explicit constant column in its design matrix.
bp_exog = np.column_stack([np.ones(len(X_train)), X_train])
bp_lm, bp_lm_pvalue, bp_f, bp_f_pvalue = het_breuschpagan(residuals_train, bp_exog)

print("\nBreusch-Pagan Test:")
print(f"  LM statistic: {bp_lm:.4f}")
print(f"  P-value: {bp_lm_pvalue:.6f}")
if bp_lm_pvalue > 0.05:
    print("  ✓ Fail to reject H0: residual variance is constant (p > 0.05)")
else:
    print("  ✗ Reject H0: heteroscedasticity detected (p ≤ 0.05)")

## Assumption 4: Normality of Residuals

**Test**: Residuals should be normally distributed.

**Methods**:
- Q-Q plot (quantile-quantile plot)
- Histogram of residuals
- Shapiro-Wilk test
- D'Agostino's K² test
- Jarque-Bera test
- Anderson-Darling test

In [None]:
print("\n" + "=" * 80)
print("ASSUMPTION 4: NORMALITY OF RESIDUALS")
print("=" * 80)

# 2x2 grid: Q-Q plots on the top row, histograms on the bottom row,
# training split in the left column and test split in the right column.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
split_columns = (
    ("Training", residuals_train, 30, {}),
    ("Test", residuals_test, 20, {'color': 'orange'}),
)

for col, (split, resid, n_bins, hist_style) in enumerate(split_columns):
    qq_ax, hist_ax = axes[0, col], axes[1, col]

    # Q-Q plot against the normal distribution
    stats.probplot(resid, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q Plot ({split})', fontsize=14, fontweight='bold')
    qq_ax.grid(True, alpha=0.3)

    # Density histogram with a normal curve using the residuals' own mean and std
    hist_ax.hist(resid, bins=n_bins, edgecolor='black', alpha=0.7, density=True, **hist_style)
    centre, spread = resid.mean(), resid.std()
    grid = np.linspace(resid.min(), resid.max(), 100)
    hist_ax.plot(grid, stats.norm.pdf(grid, centre, spread), 'r-', linewidth=2, label='Normal dist')
    hist_ax.set_xlabel('Residuals', fontsize=12)
    hist_ax.set_ylabel('Density', fontsize=12)
    hist_ax.set_title(f'Histogram of Residuals ({split})', fontsize=14, fontweight='bold')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ Q-Q Plot Interpretation:")
print("  - Points along diagonal line → Normality satisfied")
print("  - Deviation from diagonal → Non-normality")

In [None]:
# Formal normality tests on the training residuals
print("\n" + "=" * 80)
print("NORMALITY TESTS (Training Residuals)")
print("=" * 80)

# Tests that return a (statistic, p-value) pair share one reporting routine.
# Each entry: (printed heading, result key, test function).
pvalue_tests = (
    ("\n1. Shapiro-Wilk Test:", "shapiro", shapiro),
    ("\n2. D'Agostino's K² Test:", "dagostino", normaltest),
    ("\n3. Jarque-Bera Test:", "jb", jarque_bera),
)
test_results = {}
for heading, key, run_test in pvalue_tests:
    statistic, p_value = run_test(residuals_train)
    test_results[key] = (statistic, p_value)
    print(heading)
    print(f"   Statistic: {statistic:.6f}")
    print(f"   P-value: {p_value:.6f}")
    if p_value > 0.05:
        print("   ✓ Fail to reject H0: Residuals are normally distributed (p > 0.05)")
    else:
        print("   ✗ Reject H0: Residuals are NOT normally distributed (p < 0.05)")

# Keep the individual results available under their own names for later cells
shapiro_stat, shapiro_p = test_results["shapiro"]
dagostino_stat, dagostino_p = test_results["dagostino"]
jb_stat, jb_p = test_results["jb"]

# Anderson-Darling: compare the statistic with the critical value at each significance level
anderson_result = anderson(residuals_train)
print(f"\n4. Anderson-Darling Test:")
print(f"   Statistic: {anderson_result.statistic:.6f}")
print("   Critical Values:")
for sl, cv in zip(anderson_result.significance_level, anderson_result.critical_values):
    outcome = " ✓ (Normal)" if anderson_result.statistic < cv else " ✗ (Not normal)"
    print(f"     {sl}%: {cv:.6f}" + outcome)

# Moments of the residual distribution
print(f"\n5. Descriptive Statistics:")
print(f"   Mean: {residuals_train.mean():.6f} (should be ~0)")
print(f"   Std Dev: {residuals_train.std():.6f}")
print(f"   Skewness: {stats.skew(residuals_train):.6f} (should be ~0)")
print(f"   Kurtosis: {stats.kurtosis(residuals_train):.6f} (should be ~0)")

## Assumption 5: No Multicollinearity

**Test**: Independent variables should not be highly correlated with each other.

**Methods**:
- Correlation matrix and heatmap
- Variance Inflation Factor (VIF)

**VIF Interpretation**:
- VIF = 1: No correlation with the other features
- 1 < VIF < 5: Low multicollinearity (generally acceptable)
- 5 ≤ VIF < 10: Moderate multicollinearity (worth a closer look)
- VIF ≥ 10: High multicollinearity (problematic)

In [None]:
print("\n" + "=" * 80)
print("ASSUMPTION 5: NO MULTICOLLINEARITY")
print("=" * 80)

# Pairwise correlations between the features
corr_matrix = df[feature_names].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(
    annot=True, fmt='.2f', cmap='coolwarm', center=0,
    square=True, linewidths=1, cbar_kws={"shrink": 0.8},
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Scan each unordered feature pair once (strict upper triangle, row by row)
print("\nHigh Correlations (|r| > 0.7):")
columns = corr_matrix.columns
pair_rows, pair_cols = np.triu_indices(len(columns), k=1)
high_corr = [
    (columns[i], columns[j], corr_matrix.iloc[i, j])
    for i, j in zip(pair_rows, pair_cols)
    if abs(corr_matrix.iloc[i, j]) > 0.7
]

if not high_corr:
    print("  ✓ No features with |correlation| > 0.7")
else:
    for feat1, feat2, corr in high_corr:
        print(f"  {feat1} <-> {feat2}: {corr:.4f}")

In [None]:
# Variance Inflation Factor for each feature
print("\n" + "=" * 80)
print("VARIANCE INFLATION FACTOR (VIF)")
print("=" * 80)

# variance_inflation_factor regresses one column on the others exactly as given and
# does not add an intercept itself, so a constant column is prepended here. Without
# it, VIFs are inflated whenever the features are not mean-centred.
exog_with_const = np.column_stack([np.ones(X.shape[0]), X])

# Column 0 is the constant, so feature i lives at column i + 1
vif_data = pd.DataFrame({
    "Feature": feature_names,
    "VIF": [
        variance_inflation_factor(exog_with_const, i + 1)
        for i in range(len(feature_names))
    ],
}).sort_values('VIF', ascending=False)

print("\nVIF Values (sorted by magnitude):")
print(vif_data.to_string(index=False))

print("\nInterpretation:")
print("  VIF < 5:  ✓ Low multicollinearity")
print("  5 ≤ VIF < 10: ⚠ Moderate multicollinearity")
print("  VIF ≥ 10: ✗ High multicollinearity (problematic)")

# Thresholds below use >= so the flags and colours agree with the bands printed above
problematic = vif_data[vif_data['VIF'] >= 10]
if len(problematic) > 0:
    print(f"\n✗ {len(problematic)} feature(s) with VIF ≥ 10:")
    print(problematic.to_string(index=False))
else:
    print("\n✓ No features with VIF ≥ 10")

# Horizontal bar chart coloured by band, with reference lines at 5 and 10
fig, ax = plt.subplots(figsize=(12, 6))
colors = ['red' if v >= 10 else 'orange' if v >= 5 else 'green' for v in vif_data['VIF']]
ax.barh(vif_data['Feature'], vif_data['VIF'], color=colors)
ax.axvline(x=5, color='orange', linestyle='--', linewidth=2, label='VIF = 5')
ax.axvline(x=10, color='red', linestyle='--', linewidth=2, label='VIF = 10')
ax.set_xlabel('VIF Value', fontsize=12)
ax.set_title('Variance Inflation Factor by Feature', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## Additional Diagnostic Plots

In [None]:
print("\n" + "=" * 80)
print("ADDITIONAL DIAGNOSTIC PLOTS")
print("=" * 80)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Residuals vs Fitted
axes[0, 0].scatter(y_train_pred, residuals_train, alpha=0.5)
axes[0, 0].axhline(y=0, color='red', linestyle='--', linewidth=2)
axes[0, 0].set_xlabel('Fitted Values', fontsize=11)
axes[0, 0].set_ylabel('Residuals', fontsize=11)
axes[0, 0].set_title('1. Residuals vs Fitted', fontsize=12, fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)

# 2. Q-Q Plot
stats.probplot(residuals_train, dist="norm", plot=axes[0, 1])
axes[0, 1].set_title('2. Normal Q-Q Plot', fontsize=12, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)

# 3. Scale-Location
axes[1, 0].scatter(y_train_pred, np.sqrt(np.abs(std_residuals_train)), alpha=0.5)
axes[1, 0].set_xlabel('Fitted Values', fontsize=11)
axes[1, 0].set_ylabel('√|Standardized Residuals|', fontsize=11)
axes[1, 0].set_title('3. Scale-Location', fontsize=12, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# 4. Residuals vs Leverage (Cook's distance)
# The fitted model includes an intercept, so the design matrix used for leverage
# must include a column of ones as well.
design = np.column_stack([np.ones(X_train.shape[0]), X_train])
n, p = design.shape  # p counts every estimated parameter, intercept included

# Leverage h_ii is the diagonal of the hat matrix H = design @ pinv(design).
# einsum extracts only the diagonal, so the full n x n matrix is never formed,
# and the pseudo-inverse avoids explicitly inverting X'X.
leverage = np.einsum('ij,ji->i', design, np.linalg.pinv(design))

# Unbiased residual variance (n - p degrees of freedom)
mse_train = np.sum(residuals_train**2) / (n - p)

# Internally studentized residuals: each residual is scaled by its own standard
# error, which shrinks as leverage grows.
studentized_train = residuals_train / np.sqrt(mse_train * (1 - leverage))

# Cook's distance: D_i = (r_i^2 / p) * h_ii / (1 - h_ii)
cooks_d = (studentized_train**2 / p) * (leverage / (1 - leverage))

axes[1, 1].scatter(leverage, studentized_train, alpha=0.5)
axes[1, 1].axhline(y=0, color='red', linestyle='--', linewidth=1)
# Mark influential points (Cook's distance > 0.5)
influential = cooks_d > 0.5
if np.any(influential):
    axes[1, 1].scatter(leverage[influential], studentized_train[influential],
                      color='red', s=100, label='Influential points')
    axes[1, 1].legend()
axes[1, 1].set_xlabel('Leverage', fontsize=11)
axes[1, 1].set_ylabel('Standardized Residuals', fontsize=11)
axes[1, 1].set_title('4. Residuals vs Leverage', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Positions (within the training split) of the influential observations
influential_points = np.where(influential)[0]
print(f"\nInfluential Points (Cook's distance > 0.5): {len(influential_points)}")
if len(influential_points) > 0:
    print(f"  Indices: {influential_points[:10]}..." if len(influential_points) > 10 else f"  Indices: {influential_points}")

## Summary Report

In [None]:
print("\n" + "=" * 80)
print("REGRESSION ASSUMPTIONS SUMMARY")
print("=" * 80)

max_vif = vif_data['VIF'].max()

# Each pass/fail decision is made exactly once and reused by both the summary table
# and the recommendations, so the two can never disagree (including at threshold
# values or when a metric is NaN).
independence_ok = 1.5 < dw_stat < 2.5
homoscedasticity_ok = variance_ratio < 2
normality_ok = shapiro_p > 0.05
multicollinearity_ok = max_vif < 10

# One row per assumption: (name, status, key metric).
# Linearity has no numeric check here; it is judged from the residual plots.
summary_rows = [
    ('1. Linearity',
     'Check residuals vs fitted plot',
     'Visual inspection'),
    ('2. Independence',
     '✓ Satisfied' if independence_ok else '✗ Violated',
     f'Durbin-Watson: {dw_stat:.4f}'),
    ('3. Homoscedasticity',
     '✓ Satisfied' if homoscedasticity_ok else '⚠ Check scale-location plot',
     f'Variance Ratio: {variance_ratio:.4f}'),
    ('4. Normality',
     '✓ Satisfied' if normality_ok else '✗ Violated',
     f'Shapiro p-value: {shapiro_p:.4f}'),
    ('5. No Multicollinearity',
     '✓ Satisfied' if multicollinearity_ok else '✗ High VIF detected',
     f'Max VIF: {max_vif:.2f}'),
]

summary_df = pd.DataFrame(summary_rows, columns=['Assumption', 'Status', 'Key Metric'])
summary = summary_df.to_dict('list')  # same column -> list layout as before
print("\n" + summary_df.to_string(index=False))

print("\n" + "=" * 80)
print("RECOMMENDATIONS")
print("=" * 80)

recommendations = []

if not normality_ok:
    recommendations.append("- Consider transformation of target variable (log, sqrt) for normality")

if not homoscedasticity_ok:
    recommendations.append("- Consider weighted least squares or robust regression for heteroscedasticity")

if not multicollinearity_ok:
    recommendations.append("- Remove or combine highly correlated features")
    recommendations.append("- Consider Ridge regression for multicollinearity")

if not independence_ok:
    recommendations.append("- Investigate potential time-series patterns in data")

if len(influential_points) > 0:
    recommendations.append(f"- Investigate {len(influential_points)} influential observations")

if recommendations:
    print("\n".join(recommendations))
else:
    print("\n✓ All assumptions reasonably satisfied!")
    print("  Model appears appropriate for linear regression.")

print("\n" + "=" * 80)
print("TESTING COMPLETE")
print("=" * 80)