# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

# --- Synthetic data -------------------------------------------------------
# y is a noisy linear function of X (slope 2, intercept 1). The seed is fixed
# so every run draws the same samples.
np.random.seed(42)
X = np.random.rand(100, 1)  # single feature column, shape (100, 1)
noise = 0.1 * np.random.randn(100, 1)  # Gaussian noise scaled to std 0.1
y = 2 * X + 1 + noise

# --- Train / test split ---------------------------------------------------
# Hold out 20% of the samples for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model fitting and prediction -----------------------------------------
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation metrics on the held-out set -------------------------------
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# --- Residual diagnostics -------------------------------------------------
# Residual = actual - predicted. The dashed line at zero is a visual baseline
# for judging how the residuals scatter around it.
residuals = y_test - y_pred

resid_fig, resid_ax = plt.subplots(figsize=(10, 5))
resid_ax.scatter(X_test, residuals, color='blue', alpha=0.5)
resid_ax.axhline(y=0, color='red', linestyle='--')
resid_ax.set_xlabel("X_test")
resid_ax.set_ylabel("Residuals")
resid_ax.set_title("Residual Analysis")
plt.show()

# Shapiro-Wilk normality test on the residuals: a p-value above 0.05 means
# normality is not rejected at the 5% level.
w_statistic, p_value = stats.shapiro(residuals)
print(f"Shapiro-Wilk Test p-value: {p_value:.4f}")

# --- Predicted vs. actual -------------------------------------------------
fit_fig, fit_ax = plt.subplots(figsize=(10, 5))
fit_ax.scatter(y_test, y_pred, color='green')
fit_ax.set_xlabel("Actual Values")
fit_ax.set_ylabel("Predicted Values")
fit_ax.set_title("Predicted vs. Actual Values")
plt.show()
