# 📚 Enhanced Multiple Linear Regression (MLR) - Real-World Ready Models
This notebook includes complete data preprocessing and model optimization for:
- **House Price Prediction**
- **Stock Price Prediction**

- ✅ Feature selection using P-values and VIF
- ✅ Handling outliers using Z-score/IQR
- ✅ Ridge and Lasso Regularization to prevent overfitting
- ✅ Residual analysis for model assumptions
- ✅ Handling missing data
- ✅ Scaling and transforming features

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.impute import SimpleImputer


In [None]:

# House Price Dataset
# Small hand-entered sample: 10 rows, four predictors and the target 'Price'.
data_house = {
    'Size': [1500, 1800, 2400, 1300, 1600, 2000, 1000, 2200, 1400, 1700],
    'Location': [3, 4, 5, 2, 3, 4, 2, 5, 3, 4],
    'Bedrooms': [3, 4, 4, 2, 3, 4, 2, 3, 2, 3],
    'Age': [10, 5, 2, 15, 8, 4, 20, 3, 12, 7],
    'Price': [210, 340, 450, 180, 260, 390, 150, 420, 200, 310]
}

# Create DataFrame
df_house = pd.DataFrame(data_house)

# --- Check for Missing Values ---
# Reports the per-column null count only; nothing is imputed or dropped here.
print("House Data Missing Values:")
print(df_house.isnull().sum())

# --- Outlier Detection & Removal ---
# Keep a row only if every numeric column is within 3 standard deviations of
# its column mean.
# NOTE: stats.zscore defaults to the population std (ddof=0), for which
# |z| <= sqrt(n - 1). With n = 10 that bound is 3 and is only reached when the
# other nine values are identical, so no row of this sample is removed.
z_scores = np.abs(stats.zscore(df_house.select_dtypes(include=[np.number])))
df_house_clean = df_house[(z_scores < 3).all(axis=1)]

# --- Feature Selection ---
X_house = df_house_clean[['Size', 'Location', 'Bedrooms', 'Age']]
y_house = df_house_clean['Price']

# Add constant for intercept (statsmodels OLS does not add one by itself)
X_house_const = sm.add_constant(X_house)
model_sm = sm.OLS(y_house, X_house_const).fit()
print(model_sm.summary())

# 'Bedrooms' is dropped by hand, not by an automatic P-value test.
# NOTE(review): confirm against the P>|t| column of the summary printed above
# that 'Bedrooms' is the feature exceeding the 0.05 threshold.
X_house_selected = X_house_const.drop(columns=['Bedrooms'], errors='ignore')

# --- VIF Check ---
# The constant stays in the design matrix so each feature's VIF is computed
# against a model with an intercept; the 'const' row itself is not a
# collinearity measure.
vif_data = pd.DataFrame()
vif_data["Feature"] = X_house_selected.columns
vif_data["VIF"] = [variance_inflation_factor(X_house_selected.values, i) for i in range(X_house_selected.shape[1])]
print("\nHouse Data VIF:\n", vif_data)

# --- Feature Scaling ---
# Scale only the real predictors. The 'const' column has zero variance, so
# StandardScaler would turn it into an all-zero feature; Ridge/Lasso fit their
# own intercept and do not need it.
# NOTE: the scaler is fit on every row, before the train/test split done later
# in the notebook, so held-out rows influence the scaling statistics.
scaler_house = StandardScaler()
X_house_scaled = scaler_house.fit_transform(
    X_house_selected.drop(columns=['const'], errors='ignore')
)


In [None]:

# Stock Price Dataset
# Small hand-entered sample: 10 rows, four predictors and the target
# 'Price_Next_Day'.
data_stock = {
    'Moving_Avg': [150, 160, 170, 140, 155, 165, 135, 175, 145, 160],
    'Volume': [10000, 12000, 13000, 9000, 11000, 12500, 8500, 14000, 9500, 11500],
    'RSI': [55, 60, 70, 45, 50, 65, 40, 75, 48, 58],
    'Prev_Close': [155, 162, 172, 145, 158, 167, 138, 178, 148, 163],
    'Price_Next_Day': [160, 165, 175, 148, 160, 170, 142, 180, 150, 165]
}

# Create DataFrame
df_stock = pd.DataFrame(data_stock)

# --- Check for Missing Values ---
# Reports the per-column null count only; nothing is imputed or dropped here.
print("Stock Data Missing Values:")
print(df_stock.isnull().sum())

# --- Outlier Detection & Removal ---
# Keep a row only if every numeric column is within 3 standard deviations of
# its column mean.
# NOTE: stats.zscore defaults to the population std (ddof=0), for which
# |z| <= sqrt(n - 1). With n = 10 that bound is 3 and is only reached when the
# other nine values are identical, so no row of this sample is removed.
z_scores_stock = np.abs(stats.zscore(df_stock.select_dtypes(include=[np.number])))
df_stock_clean = df_stock[(z_scores_stock < 3).all(axis=1)]

# --- Feature Selection ---
X_stock = df_stock_clean[['Moving_Avg', 'Volume', 'RSI', 'Prev_Close']]
y_stock = df_stock_clean['Price_Next_Day']

# Add constant for intercept (statsmodels OLS does not add one by itself)
X_stock_const = sm.add_constant(X_stock)
model_sm_stock = sm.OLS(y_stock, X_stock_const).fit()
print(model_sm_stock.summary())

# 'Volume' is dropped by hand, not by an automatic P-value test.
# NOTE(review): confirm against the P>|t| column of the summary printed above
# that 'Volume' is the feature exceeding the 0.05 threshold.
X_stock_selected = X_stock_const.drop(columns=['Volume'], errors='ignore')

# --- VIF Check ---
# The constant stays in the design matrix so each feature's VIF is computed
# against a model with an intercept; the 'const' row itself is not a
# collinearity measure.
vif_data_stock = pd.DataFrame()
vif_data_stock["Feature"] = X_stock_selected.columns
vif_data_stock["VIF"] = [variance_inflation_factor(X_stock_selected.values, i) for i in range(X_stock_selected.shape[1])]
print("\nStock Data VIF:\n", vif_data_stock)

# --- Feature Scaling ---
# Scale only the real predictors. The 'const' column has zero variance, so
# StandardScaler would turn it into an all-zero feature; Ridge/Lasso fit their
# own intercept and do not need it.
# NOTE: the scaler is fit on every row, before the train/test split done later
# in the notebook, so held-out rows influence the scaling statistics.
scaler_stock = StandardScaler()
X_stock_scaled = scaler_stock.fit_transform(
    X_stock_selected.drop(columns=['const'], errors='ignore')
)


In [None]:

# --- Train-Test Split ---
# Hold out 20% of each scaled dataset; random_state=0 pins the shuffle so the
# same rows land in the test set on every run.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_house_scaled, y_house, test_size=0.2, random_state=0
)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_stock_scaled, y_stock, test_size=0.2, random_state=0
)

# --- Ridge and Lasso for House Price ---
# fit() returns the estimator itself, so construction and training are chained.
ridge_house = Ridge(alpha=1.0).fit(X_train_h, y_train_h)
lasso_house = Lasso(alpha=0.01).fit(X_train_h, y_train_h)

# Predictions on the held-out house rows
y_pred_ridge_h = ridge_house.predict(X_test_h)
y_pred_lasso_h = lasso_house.predict(X_test_h)

# --- Evaluation for House ---
house_scores = [
    ("House Ridge MSE:", mean_squared_error(y_test_h, y_pred_ridge_h)),
    ("House Ridge R-squared:", r2_score(y_test_h, y_pred_ridge_h)),
    ("House Lasso MSE:", mean_squared_error(y_test_h, y_pred_lasso_h)),
    ("House Lasso R-squared:", r2_score(y_test_h, y_pred_lasso_h)),
]
for metric_label, metric_value in house_scores:
    print(metric_label, metric_value)

# --- Ridge and Lasso for Stock Price ---
ridge_stock = Ridge(alpha=1.0).fit(X_train_s, y_train_s)
lasso_stock = Lasso(alpha=0.01).fit(X_train_s, y_train_s)

# Predictions on the held-out stock rows
y_pred_ridge_s = ridge_stock.predict(X_test_s)
y_pred_lasso_s = lasso_stock.predict(X_test_s)

# --- Evaluation for Stock ---
stock_scores = [
    ("Stock Ridge MSE:", mean_squared_error(y_test_s, y_pred_ridge_s)),
    ("Stock Ridge R-squared:", r2_score(y_test_s, y_pred_ridge_s)),
    ("Stock Lasso MSE:", mean_squared_error(y_test_s, y_pred_lasso_s)),
    ("Stock Lasso R-squared:", r2_score(y_test_s, y_pred_lasso_s)),
]
for metric_label, metric_value in stock_scores:
    print(metric_label, metric_value)


In [None]:

# --- Residual Analysis for House and Stock ---
# Residual = actual minus Ridge prediction on the held-out rows.
residuals_h = y_test_h - y_pred_ridge_h
residuals_s = y_test_s - y_pred_ridge_s

# One histogram (with KDE overlay) per model, each on its own figure.
residual_plots = (
    (residuals_h, 'Residuals for House Price Model'),
    (residuals_s, 'Residuals for Stock Price Model'),
)
for residual_values, plot_title in residual_plots:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(residual_values, kde=True, bins=15, ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel('Residuals')
    plt.show()
