In [None]:
# =============================
# 1) Setup
# =============================
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy import stats


In [None]:
# =============================
# 2) Load Dataset
# =============================

# Location of the house prices training file, relative to the working directory.
DATA_PATH = Path("train.csv")

# Stays empty when the file is already on disk and no upload was needed.
uploaded = {}

if not DATA_PATH.exists():
    # Only ask for an upload when the file is missing, so that
    # "Restart & Run All" does not block on a file picker every time.
    # google.colab is imported lazily because it exists only on Colab.
    try:
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError(
            f"{DATA_PATH} not found. Place the file in the working directory, "
            "or run this notebook on Google Colab to upload it interactively."
        ) from exc
    uploaded = files.upload()   # Upload house prices train.csv here

df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
df.head()


In [None]:
# =============================
# 3) Preprocessing
# =============================

# Drop Id column.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so on a
# second run "Id" is already gone and a plain drop would raise KeyError.
df = df.drop(columns=["Id"], errors="ignore")

# NOTE(review): the fill values below are computed on the full dataset, i.e.
# before the train/test split further down, so test rows influence them.
# Confirm whether that leakage is acceptable for this analysis.

# Fill missing numeric with median (this includes the SalePrice column itself,
# since it is still part of `df` at this point).
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categoricals with mode
for col in df.select_dtypes(include=["object"]).columns:
    col_modes = df[col].mode()
    # A column that is entirely missing has no mode; leave it untouched
    # instead of failing on the empty result.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

# Define target and features
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# One-hot encode categorical variables; drop_first removes one level per
# categorical column.
X = pd.get_dummies(X, drop_first=True)

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

In [None]:
# =============================
# 4) EDA
# =============================

# Univariate plot of target
sns.histplot(y, bins=30)
plt.title("SalePrice Distribution")
plt.show()

# Correlation heatmap of first 15 numeric features to avoid clutter.
# The numeric columns are selected explicitly: `df` still contains string
# columns here, and DataFrame.corr() raises on those in pandas >= 2.0
# (where numeric_only defaults to False).
numeric_corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12,8))
sns.heatmap(numeric_corr.iloc[:15, :15], annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap (sample features)")
plt.show()

In [None]:

# =============================
# 5) Linear Regression
# =============================

# Fit ordinary least squares on the training split, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# R² on both splits; the difference between them is reported as the gap.
r2 = r2_score(y_test, y_pred)
train_r2 = lr.score(X_train, y_train)

for label, value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("Test R²:", r2),
    ("Train R²:", train_r2),
    ("Train-Test Gap:", train_r2 - r2),
):
    print(label, value)

In [None]:
# =============================
# 6) Multicollinearity & Assumptions
# =============================

# --- Multicollinearity: variance inflation factor per training feature ---
X_train_numeric = X_train.astype(float)

# has_constant="add" forces the intercept column to be prepended. With the
# default ("skip") no column is added when a feature is already a non-zero
# constant in the training split, and the `i + 1` offset below would then
# point at the wrong columns and run past the end on the last feature.
X_const = sm.add_constant(X_train_numeric, has_constant="add")
design = X_const.values  # built once; column 0 is the intercept

vif = pd.DataFrame()
vif["Feature"] = X_train_numeric.columns
# i + 1 skips the intercept so that VIF i lines up with feature i.
vif["VIF"] = [variance_inflation_factor(design, i + 1) for i in range(len(X_train_numeric.columns))]
print("Top VIFs:\n", vif.sort_values(by="VIF", ascending=False).head(5))

# --- Residual diagnostics ---
# NOTE(review): these residuals come from the test split (y_test - y_pred),
# not from the rows the model was fitted on. Confirm that is intended.
residuals = y_test - y_pred

sh_stat, sh_p = stats.shapiro(residuals)      # normality of residuals
dw = durbin_watson(residuals)                 # first-order autocorrelation
# Breusch-Pagan only needs a constant column to be present, not at a fixed
# position, so the default add_constant behaviour is kept here.
bp_stat, bp_p, _, _ = het_breuschpagan(residuals, sm.add_constant(X_test.astype(float)))

print(f"Shapiro p={sh_p:.3f} | Durbin-Watson={dw:.2f} | Breusch-Pagan p={bp_p:.3f}")

In [None]:
# =============================
# 7) Tuning - Ridge & Lasso
# =============================

# Fit one L2-penalised and one L1-penalised model on the training split,
# each with a single fixed alpha.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Report R² on the test split for both models.
for label, model in (("Ridge R²:", ridge), ("Lasso R²:", lasso)):
    print(label, model.score(X_test, y_test))

# Number of Lasso coefficients that are not exactly zero.
print("Lasso non-zero coefficients:", (lasso.coef_ != 0).sum())

In [None]:
# =============================
# 8) Final Summary
# =============================

# Pair each fitted linear-regression coefficient with its feature name, then
# order by absolute value so the largest magnitudes come first.
# NOTE(review): no feature-scaling step is visible before the fit, so these
# magnitudes depend on each feature's units. Confirm before reading them as
# importance.
coefficients = pd.Series(lr.coef_, index=X_train.columns)
coefficients = coefficients.sort_values(key=abs, ascending=False)
print("Top 3 Drivers of SalePrice:\n", coefficients.iloc[:3])