# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

# -----------------------------
# Load Data
# -----------------------------
# Read the house-price CSV (path is relative to the working directory) and
# echo the first rows so the column layout can be checked by eye.
csv_path = "house_prices.csv"
df = pd.read_csv(csv_path)
preview = df.head()
print("Preview:\n", preview, "\n")

# -----------------------------
# Basic Cleaning
# -----------------------------
# Name of the column to predict (adjust if the dataset uses a different one).
target_col = "Price"

# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

# Split the cleaned frame into the target series and the feature frame.
y = df[target_col]
X = df.drop(columns=[target_col])

# -----------------------------
# Handle Categorical Columns
# -----------------------------
def _needs_encoding(dtype):
    """Return True when a column dtype holds labels rather than numbers.

    Covers ``object`` columns, pandas string dtypes and ``category`` columns,
    so none of them is passed to the regressor un-encoded.
    """
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


# Label-like columns are one-hot encoded; every remaining column (numeric,
# boolean, ...) is handed to the model unchanged.
cat_cols = [col for col, dtype in X.dtypes.items() if _needs_encoding(dtype)]
num_cols = [col for col in X.columns if col not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        # handle_unknown="ignore": a category that was not seen during fit
        # is encoded as all zeros at transform time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# -----------------------------
# Train-Test Split
# -----------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# -----------------------------
# Build & Train Model (scikit-learn)
# -----------------------------
# Chain the column preprocessing and the linear regressor into one estimator
# so both are fitted together on the training rows only.
pipeline_steps = [
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X_train, y_train)

# -----------------------------
# Predictions & Evaluation
# -----------------------------
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

# Each metric takes (true values, predictions) and returns a single score.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Emit the title and the three metrics, one per line.
summary_lines = [
    "HOUSE PRICE PREDICTION MODEL",
    f"MAE: {mae:.2f}",
    f"MSE: {mse:.2f}",
    f"R² Score: {r2:.3f}",
]
print("\n".join(summary_lines))

# -----------------------------
# Visualization: Predictions vs Actual
# -----------------------------
# Scatter of predicted against actual prices on a single 7x5 inch figure.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred, alpha=0.7)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Predicted vs Actual House Prices")

# Dashed red diagonal across the actual-price range: points on this line
# are predictions that equal the actual value.
price_lo, price_hi = y_test.min(), y_test.max()
ax.plot([price_lo, price_hi], [price_lo, price_hi], "r--")

# Write the image to disk before displaying it.
fig.tight_layout()
fig.savefig("predictions_vs_actual.png")
plt.show()

# -----------------------------
# Save Evaluation Report
# -----------------------------
# Markdown summary of the run. Only the three metric values are interpolated;
# the "Insights" and "Recommendations" text is static and is not derived from
# the fitted model.
report = f"""
HOUSE PRICE PREDICTION MODEL REPORT

Dataset: house_prices.csv

Evaluation Metrics:
MAE: {mae:.2f}
MSE: {mse:.2f}
R² Score: {r2:.3f}

Insights:
- The model captures the general trend between features and house prices.
- Area/size-related features contribute most to price prediction.
- R² indicates how much variance in price is explained by the model.

Recommendations:
- Collect more data for better generalization.
- Try advanced models like Random Forest for improved performance.
"""

# The report contains a non-ASCII character ("²"), so the encoding is pinned
# to UTF-8 rather than left to the platform default, which may be unable to
# encode it or may produce a file that Markdown viewers misread.
with open("model_evaluation_report.md", "w", encoding="utf-8") as f:
    f.write(report)

print("Model evaluation report saved as model_evaluation_report.md")
print("Plot saved as predictions_vs_actual.png")