# Multilinear & Polynomial Regression Analysis


### Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

def sanjay():
    """Print the fixed identification string ``SANJAY R - 24BAD407``."""
    banner = "SANJAY R - 24BAD407"
    print(banner)

---
# SCENARIO 1: Multilinear Regression on Student Performance
---

### Load and Explore Dataset

In [None]:

# Load the student performance dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/datasets/spscientist/students-performance-in-exams/StudentsPerformance.csv")

print("Dataset Shape:", df.shape)
print(" ")
print(df.head())
print(" ")
# DataFrame.info() writes its summary itself and returns None, so call it
# directly; wrapping it in print() would append a stray "None" line.
df.info()

### Create Target Variable
Calculate the final score as the average of the math, reading, and writing scores.

In [None]:
# Target: per-student mean of the three exam scores.
math_marks = df["math score"]
reading_marks = df["reading score"]
writing_marks = df["writing score"]
total_marks = math_marks + reading_marks + writing_marks
df["final_score"] = total_marks / 3

print("Target Variable Statistics:")
target_summary = df["final_score"].describe()
print(target_summary)

### Simulate Additional Features
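Add randomly generated study hours, attendance, and sleep hours for demonstration. These values are synthetic and drawn independently of the exam scores, so they are not expected to carry real predictive signal.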

In [None]:
# Fixed seed so the simulated columns are reproducible between runs.
np.random.seed(42)

# Column name -> (low, high) bounds for np.random.randint (high is exclusive).
# The dict order fixes the draw order, which determines the generated values.
simulated_ranges = {
    "study_hours": (1, 8),
    "attendance": (60, 100),
    "sleep_hours": (4, 9),
}
n_rows = len(df)
for column_name, (low, high) in simulated_ranges.items():
    df[column_name] = np.random.randint(low, high, size=n_rows)

print("Updated Dataset with Simulated Features:")
print(df.head())

### Feature and Target Selection

In [None]:

# The three raw scores are dropped together with the target because the
# target is computed directly from them.
target_column = "final_score"
excluded_columns = ["math score", "reading score", "writing score", target_column]

y = df[target_column]
X = df.drop(columns=excluded_columns)

print("Features (X):")
print(X.head())
print("\nTarget (y):")
print(y.head())

### Identify Feature Types

In [None]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
object_columns = X.select_dtypes(include="object").columns
other_columns = X.select_dtypes(exclude="object").columns

categorical_features = list(object_columns)
numerical_features = list(other_columns)

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

### Create Preprocessing Pipeline

In [None]:
# Standardize the numerical columns; one-hot encode the categorical columns,
# dropping the first level of each.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

print("Preprocessing pipeline created successfully")

### Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}")
print(f"Test set size: {n_test}")

### Train Multilinear Regression Model

In [None]:
# Chain preprocessing and ordinary least squares so both are fitted together
# on the training rows.
linear_steps = [
    ("preprocessing", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=linear_steps)
model.fit(X_train, y_train)

print("Linear Regression model trained successfully")

### Make Predictions

In [None]:
y_pred = model.predict(X_test)

# Show the first ten test rows next to their predictions.
actual_head = y_test.values[:10]
predicted_head = y_pred[:10]

print("Sample Predictions vs Actual:")
comparison_df = pd.DataFrame({
    "Actual": actual_head,
    "Predicted": predicted_head,
    "Difference": actual_head - predicted_head,
})
print(comparison_df)

### Model Evaluation

In [None]:
# Test-set metrics for the linear model.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

separator = "=" * 50
print(separator)
print("LINEAR REGRESSION MODEL PERFORMANCE")
print(separator)
print(f"Mean Squared Error (MSE)  : {mse:.4f}")
print(f"Root Mean Squared Error   : {rmse:.4f}")
print(f"R² Score                  : {r2:.4f}")
print(separator)

### Analyze Regression Coefficients

In [None]:

# The second fitted transformer is the one-hot encoder; its output names
# follow the numerical columns in the transformed feature matrix.
fitted_preprocessor = model.named_steps["preprocessing"]
_, fitted_encoder, _ = fitted_preprocessor.transformers_[1]
encoded_names = fitted_encoder.get_feature_names_out(categorical_features).tolist()

feature_names = numerical_features + encoded_names
coefficients = model.named_steps["regressor"].coef_

# One row per model input, largest (most positive) coefficient first.
coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefficients})
coef_df = coef_df.sort_values(by="Coefficient", ascending=False)

print("\nTop 10 Feature Coefficients:")
print(coef_df.head(10))

### Ridge Regression (L2 Regularization)

In [None]:
# Same preprocessing as the linear model, with an L2-penalized regressor.
ridge_steps = [
    ("preprocessing", preprocessor),
    ("regressor", Ridge(alpha=1.0)),
]
ridge = Pipeline(steps=ridge_steps)
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)
ridge_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
ridge_test_r2 = r2_score(y_test, y_pred_ridge)

print("Ridge Regression Performance:")
print(f"RMSE: {ridge_test_rmse:.4f}")
print(f"R² Score: {ridge_test_r2:.4f}")

### Lasso Regression (L1 Regularization)

In [None]:
# Same preprocessing again, with an L1-penalized regressor; max_iter is
# raised to 10000 for the solver.
lasso_steps = [
    ("preprocessing", preprocessor),
    ("regressor", Lasso(alpha=0.01, max_iter=10000)),
]
lasso = Pipeline(steps=lasso_steps)
lasso.fit(X_train, y_train)

y_pred_lasso = lasso.predict(X_test)
lasso_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
lasso_test_r2 = r2_score(y_test, y_pred_lasso)

print("Lasso Regression Performance:")
print(f"RMSE: {lasso_test_rmse:.4f}")
print(f"R² Score: {lasso_test_r2:.4f}")

### Visualizations

#### Predicted vs Actual Scores

In [None]:
# Scatter of predictions against true scores; points on the dashed diagonal
# are predicted exactly.
score_low = y_test.min()
score_high = y_test.max()
diagonal = [score_low, score_high]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k', s=80)
plt.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')
plt.title("Predicted vs Actual Final Scores", fontsize=14, fontweight='bold')
plt.xlabel("Actual Final Score", fontsize=12)
plt.ylabel("Predicted Final Score", fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

#### Top Feature Influences (Coefficient Magnitude)

In [None]:
# Rank features by absolute coefficient and keep the ten largest.
top_features = coef_df.set_index("Feature").abs().sort_values(
    by="Coefficient", ascending=False
).head(10)

# DataFrame.plot opens its own figure when no Axes is supplied, so create the
# Axes explicitly and pass it in. This makes the requested figsize apply and
# avoids leaving a separate, empty figure behind.
fig, ax = plt.subplots(figsize=(12, 6))
top_features.plot(kind="barh", legend=False, color='steelblue', ax=ax)
ax.set_xlabel("Absolute Coefficient Value", fontsize=12)
ax.set_ylabel("Features", fontsize=12)
ax.set_title("Top 10 Feature Influences on Final Score", fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

#### Residual Distribution

In [None]:
# Residual = actual minus predicted, for every test row.
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='coral', bins=30)
# Reference line at zero error.
plt.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero Error')
plt.title("Distribution of Residuals", fontsize=14, fontweight='bold')
plt.xlabel("Residuals (Actual - Predicted)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

residual_mean = residuals.mean()
residual_std = residuals.std()
print(f"Mean Residual: {residual_mean:.4f}")
print(f"Std Residual: {residual_std:.4f}")

---
# SCENARIO 2: Polynomial Regression on Auto MPG Dataset
---

### Load Auto MPG Dataset

In [None]:
# Load the Auto MPG dataset from the Kaggle input directory. This rebinds
# `df`, replacing the student performance frame from scenario 1.
df = pd.read_csv("/kaggle/input/datasets/organizations/uciml/autompg-dataset/auto-mpg.csv")

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary itself and returns None, so call it
# directly; wrapping it in print() would append a stray "None" line.
df.info()

### Data Cleaning

In [None]:

# Turn "?" entries in horsepower into NaN, then cast the column to float.
horsepower_raw = df["horsepower"]
df["horsepower"] = horsepower_raw.replace("?", np.nan).astype(float)

n_missing = df["horsepower"].isna().sum()
print("Missing values in horsepower:", n_missing)

### Handle Missing Values

In [None]:
# Impute missing horsepower values with the column mean.
mean_hp = df["horsepower"].mean()
# Assign the filled column back rather than calling fillna(inplace=True) on
# the df["horsepower"] selection: that chained in-place form is deprecated in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
df["horsepower"] = df["horsepower"].fillna(mean_hp)

print(f"Filled missing values with mean: {mean_hp:.2f}")
print("Missing values after filling:", df["horsepower"].isna().sum())

### Feature and Target Selection

In [None]:
# Single predictor (kept as a one-column DataFrame) and the mpg target.
feature_columns = ["horsepower"]
y = df["mpg"]
X = df[feature_columns]

print("Feature (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("\nTarget statistics:")
target_summary = y.describe()
print(target_summary)

### Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}")
print(f"Test set size: {n_test}")

### Feature Scaling

In [None]:
# Fit the scaler on the training rows only, then apply the same scaling to
# both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")

### Polynomial Regression Function

In [None]:
def polynomial_model(degree):
    """
    Fit and evaluate a polynomial regression of the requested degree.

    Reads the notebook-level ``X_train_scaled``, ``X_test_scaled``,
    ``y_train`` and ``y_test``. The scaled features are expanded with
    ``PolynomialFeatures``, a ``LinearRegression`` is fitted on the training
    part, and its predictions are scored on the test part.

    Parameters:
    -----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns:
    --------
    model : LinearRegression
        Regressor fitted on the expanded training features.
    poly : PolynomialFeatures
        Feature expander fitted on the training features.
    mse, rmse, r2 : float
        Test-set mean squared error, its square root, and R² score.
    """
    expander = PolynomialFeatures(degree=degree)

    # Fit the expansion on the training features; reuse it for the test set.
    train_expanded = expander.fit_transform(X_train_scaled)
    test_expanded = expander.transform(X_test_scaled)

    regressor = LinearRegression()
    regressor.fit(train_expanded, y_train)

    test_predictions = regressor.predict(test_expanded)

    test_mse = mean_squared_error(y_test, test_predictions)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, test_predictions)

    return regressor, expander, test_mse, test_rmse, test_r2

print("Polynomial regression function defined")

### Train Models with Different Polynomial Degrees

In [None]:
# Evaluate degrees 1-5 and collect the test metrics per degree.
degrees = [1, 2, 3, 4, 5]
results = {}

print("Training polynomial models...\n")
for d in degrees:
    model, poly, mse, rmse, r2 = polynomial_model(d)
    print(f"Degree {d}: RMSE={rmse:.4f}, R²={r2:.4f}")
    results[d] = {"MSE": mse, "RMSE": rmse, "R²": r2}

# Transpose so each row is a degree and each column a metric.
results_df = pd.DataFrame(results).transpose()

banner = "=" * 60
print("\n" + banner)
print("POLYNOMIAL REGRESSION RESULTS")
print(banner)
print(results_df)
print(banner)

### Model Comparison Table

| **Degree** | **Behavior**                    | **Typical Outcome**        |
|------------|---------------------------------|----------------------------|
| 1          | Linear fit                      | Underfitting               |
| 2          | Quadratic curve                 | Good bias-variance balance |
| 3          | Cubic curve                     | Better fit                 |
| 4          | Higher-order polynomial         | Overfitting risk           |
| 5          | Very complex curve              | High overfitting risk      |

### Visualization: Polynomial Regression Curves

In [None]:

# Evenly spaced horsepower grid over the observed range, used to draw smooth
# fitted curves.
X_range = np.linspace(X.min().values[0], X.max().values[0], 300).reshape(-1, 1)
# The scaler was fitted on a DataFrame, so hand it a DataFrame with the same
# column names; a bare ndarray triggers scikit-learn's "X does not have valid
# feature names" warning.
X_range_scaled = scaler.transform(pd.DataFrame(X_range, columns=X.columns))

plt.figure(figsize=(12, 7))
plt.scatter(X, y, alpha=0.4, s=30, c='gray', label="Actual Data", edgecolors='k', linewidths=0.5)

# One curve per polynomial degree, each refitted on the scaled training data.
colors = ['blue', 'green', 'orange', 'red', 'purple']
for d, curve_color in zip([1, 2, 3, 4, 5], colors):
    poly = PolynomialFeatures(d)
    X_poly = poly.fit_transform(X_train_scaled)
    model = LinearRegression()
    model.fit(X_poly, y_train)

    y_curve = model.predict(poly.transform(X_range_scaled))
    plt.plot(X_range, y_curve, label=f"Degree {d}", linewidth=2.5, color=curve_color)

plt.xlabel("Horsepower", fontsize=12)
plt.ylabel("MPG (Miles Per Gallon)", fontsize=12)
plt.title("Polynomial Regression: Effect of Degree on Model Fit", fontsize=14, fontweight='bold')
plt.legend(loc='upper right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### Overfitting vs Underfitting Analysis

In [None]:
# Sweep degrees 1-10 and record the train and test MSE of each fit.
degrees_extended = list(range(1, 11))
train_errors = []
test_errors = []

for d in degrees_extended:
    poly = PolynomialFeatures(d)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)

    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    train_mse = mean_squared_error(y_train, model.predict(X_train_poly))
    test_mse = mean_squared_error(y_test, model.predict(X_test_poly))
    train_errors.append(train_mse)
    test_errors.append(test_mse)

# Degree with the lowest test MSE.
best_position = np.argmin(test_errors)
optimal_degree = degrees_extended[best_position]
min_test_error = min(test_errors)

plt.figure(figsize=(12, 7))
plt.plot(degrees_extended, train_errors, marker='o', markersize=8,
         linewidth=2.5, label="Training Error", color='blue', linestyle='-')
plt.plot(degrees_extended, test_errors, marker='s', markersize=8,
         linewidth=2.5, label="Test Error", color='red', linestyle='-')

plt.axvline(x=optimal_degree, color='green', linestyle='--', linewidth=2, alpha=0.7,
            label=f'Optimal Degree = {optimal_degree}')

# NOTE(review): the shaded regions use fixed degree ranges (1-2 and 6-10);
# they are not derived from the measured errors.
plt.axvspan(1, 2, alpha=0.1, color='orange', label='Underfitting Region')
plt.axvspan(6, 10, alpha=0.1, color='red', label='Overfitting Region')

plt.title("Bias-Variance Tradeoff: Overfitting vs Underfitting",
          fontsize=14, fontweight='bold')
plt.xlabel("Polynomial Degree", fontsize=12, fontweight='bold')
plt.ylabel("Mean Squared Error (MSE)", fontsize=12, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3, linestyle='--')
plt.xticks(degrees_extended)
plt.tight_layout()
plt.show()

print(f"\nOptimal Polynomial Degree: {optimal_degree}")
print(f"Minimum Test Error: {min_test_error:.4f}")

### Ridge Regression for Overfitting Control

In [None]:

# Degree-5 expansion of the scaled horsepower feature, shared by every Ridge
# fit below.
poly_high = PolynomialFeatures(degree=5)
X_train_poly = poly_high.fit_transform(X_train_scaled)
X_test_poly = poly_high.transform(X_test_scaled)

alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

heavy_rule = "=" * 70
print("\n" + heavy_rule)
print("RIDGE REGRESSION WITH DEGREE 5 POLYNOMIAL")
print(heavy_rule)
print(f"{'Alpha':<10} {'RMSE':<15} {'R² Score':<15}")
print("-" * 70)

# One table row per regularization strength, scored on the test split.
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_poly, y_train)
    y_pred_ridge = ridge.predict(X_test_poly)

    r2_ridge = r2_score(y_test, y_pred_ridge)
    rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))

    print(f"{alpha:<10} {rmse_ridge:<15.4f} {r2_ridge:<15.4f}")

print(heavy_rule)

# NOTE(review): alpha=1.0 is fixed here; it is not selected from the sweep
# above even though the output labels it "Optimal".
ridge_optimal = Ridge(alpha=1.0)
ridge_optimal.fit(X_train_poly, y_train)
y_pred_ridge_optimal = ridge_optimal.predict(X_test_poly)

optimal_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge_optimal))
optimal_r2 = r2_score(y_test, y_pred_ridge_optimal)

print("\nOptimal Ridge Regression (alpha=1.0):")
print(f"RMSE: {optimal_rmse:.4f}")
print(f"R² Score: {optimal_r2:.4f}")

### Final Model Comparison

In [None]:

# Re-evaluate the degree-1 and degree-3 fits and score the alpha=1.0 Ridge
# predictions, all on the same test split.
model_linear, poly_linear, mse_linear, rmse_linear, r2_linear = polynomial_model(1)
model_poly3, poly_3, mse_3, rmse_3, r2_3 = polynomial_model(3)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge_optimal))
r2_ridge = r2_score(y_test, y_pred_ridge_optimal)

# One row per model: [label, RMSE, R²].
comparison_results = [
    ['Linear (Degree 1)', rmse_linear, r2_linear],
    ['Polynomial (Degree 3)', rmse_3, r2_3],
    ['Ridge (Degree 5, α=1.0)', rmse_ridge, r2_ridge],
]
comparison_df = pd.DataFrame(comparison_results, columns=['Model', 'RMSE', 'R² Score'])

banner = "=" * 60
print("\n" + banner)
print("FINAL MODEL COMPARISON")
print(banner)
print(comparison_df.to_string(index=False))
print(banner)

# The best model is the row with the lowest test RMSE.
best_model_idx = comparison_df['RMSE'].idxmin()
best_name = comparison_df.loc[best_model_idx, 'Model']
best_rmse = comparison_df.loc[best_model_idx, 'RMSE']
best_r2 = comparison_df.loc[best_model_idx, 'R² Score']
print(f"\nBest Model: {best_name}")
print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best R² Score: {best_r2:.4f}")