In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(current_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

: 

# House Price Prediction using Regression

## Objective
The goal of this project is to build regression models to predict house sale prices and
understand how different features influence the final prediction.


In [None]:
# Load the competition's training CSV and preview the first rows to see
# which columns are available.
TRAIN_CSV_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

In [None]:
# Baseline: use only the living area (GrLivArea) to predict SalePrice.
# Selecting with a list keeps X as a one-column DataFrame rather than a Series.
baseline_features = ["GrLivArea"]
X = train_df[baseline_features]   # feature matrix
y = train_df["SalePrice"]         # target vector

(X.head(), y.head())


## Train-Test split
We want to split the provided data into training and testing data so that we can train and test our model correctly and accurately. 

For this, we will use the `train_test_split` function from scikit-learn's `model_selection` module.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create a Regression Model


### 1. Linear Regression: Linear regression learns a line

Price of the house = (house_area × weight) + bias

where,
weight = how strongly the house_area affects the price
bias = base price of the house when area is 0 sqft. 


In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line (weight and bias) relating living area to sale price
# on the training rows; the fitted parameters are stored on `model`.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

## Predicting the model performance
Based on the house area alone, what price does the model predict for each house in the test set?

In [None]:
# Predict sale prices for the held-out rows and peek at the first five.
y_pred = model.predict(X=X_test)
y_pred[0:5]

## Evaluate the model performance
How well (or badly) did the model predict the test data?

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Score the single-feature model on the held-out rows.
# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred))
r2_baseline = r2_score(y_test, y_pred)

# Display the metrics that were just computed. The names `rmse` and `r2`
# are never defined in this notebook, so referencing them raised NameError
# on a fresh kernel.
rmse_baseline, r2_baseline

## Prediction
On average, the model's predictions were off by about $58,472 (RMSE).

R-squared: R² tells you how much of the variation in the target variable your model explains; in other words, how well the model accounts for why prices differ from house to house.

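As a rough rule of thumb, R² ≥ 0.5 is often considered decent, although what counts as "good" depends on the problem and the data.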

In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted prices for the single-feature model. Points on the
# dashed red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, color="red", linestyle="--")

ax.set_xlabel("Actual Sale Price")
ax.set_ylabel("Predicted Sale Price")
ax.set_title("Actual vs Predicted Prices (Linear Regression)")

fig.savefig("actual_vs_predicted_linear_regression.png", dpi=300, bbox_inches="tight")
plt.show()


## Linear Regression Plot Conclusion
We see that the model does not fit the sale prices perfectly. This is partly because it uses only one feature (X): the house area in square feet. We can add more features to the model to make the predictions more accurate.

## Adding more features. 
We can add more features, including the overall quality rating (`OverallQual`), total basement area in square feet (`TotalBsmtSF`), garage capacity in number of cars (`GarageCars`), and the year the house was built (`YearBuilt`).

In [None]:
# Columns used by the multi-feature model; the target is unchanged.
features = ["GrLivArea", "OverallQual", "TotalBsmtSF", "GarageCars", "YearBuilt"]

X = train_df[features]      # feature matrix with one column per entry in `features`
y = train_df["SalePrice"]   # target vector


In [None]:
# Re-split using the multi-feature X, with the same test_size and
# random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a fresh linear regression on the multi-feature training data.
# This rebinds `model`, replacing the single-feature estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict held-out sale prices with the multi-feature model and show the first five.
y_pred = model.predict(X=X_test)
y_pred[0:5]

In [None]:
# RMSE and R-squared for the multi-feature model on the held-out rows.
rmse_multi = np.sqrt(mean_squared_error(y_test, y_pred))
r2_multi = r2_score(y_test, y_pred)

# Display the metrics that were just computed. The names `rmse` and `r2`
# are never defined in this notebook, so referencing them raised NameError
# on a fresh kernel.
rmse_multi, r2_multi

## New Prediction conclusion
As shown by the RMSE and R-squared scores above, the linear regression model improves significantly compared with the earlier version that used only the living area in square feet. The plot below also shows the new predictions clustered more tightly around the diagonal, with less scatter than before.

In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted prices for the multi-feature model. Points on the
# dashed red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, color="red", linestyle="--")

ax.set_xlabel("Actual Sale Price")
ax.set_ylabel("Predicted Sale Price")
ax.set_title("New Actual vs Predicted Prices (Linear Regression)")

# Save under a distinct filename: the single-feature plot is written to
# "actual_vs_predicted_linear_regression.png", and reusing that name here
# silently overwrote it.
fig.savefig("actual_vs_predicted_linear_regression_multi_feature.png", dpi=300, bbox_inches="tight")
plt.show()


## Learn Feature Importance
This tells you which features increase price most and which matter less.

In [None]:
# Pair each fitted coefficient with its feature name and list them in ascending order.
# Each coefficient is the change in predicted price per one unit of that
# feature, so magnitudes are not directly comparable across features with different units.
coefficients = pd.Series(data=model.coef_, index=features)
coefficients.sort_values()


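The raw coefficients suggest that the overall quality of the house and the garage capacity have a larger effect per unit change than basement area or living area. Note, however, that each coefficient is measured per unit of its own feature (one quality point, one car space, one square foot), so the magnitudes are not directly comparable across features with very different scales.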

## Summary

In [None]:
# Side-by-side comparison of the two models' test-set metrics.
results = pd.DataFrame(
    {
        "Model": ["Linear Regression (1 feature)", "Linear Regression (5 features)"],
        "RMSE": [rmse_baseline, rmse_multi],
        "R2": [r2_baseline, r2_multi],
    }
)
results

## Model Improvement Analysis

A baseline linear regression model was first trained using a single feature
(`GrLivArea`). The model was then improved by incorporating additional numeric
features such as overall quality, basement area, garage capacity, and year built.

Adding these features resulted in a noticeable decrease in RMSE and an increase
in R², indicating improved predictive performance and better explanation of
price variability. This highlights the importance of feature selection in
regression models.

## Limitations

This project focuses on linear regression with a limited set of numeric features.
Categorical variables, advanced preprocessing, and nonlinear models were
intentionally excluded to maintain interpretability and focus on foundational
regression concepts.