In [None]:
#  Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply seaborn's "whitegrid" theme once so every figure drawn later shares it
sns.set_theme(style="whitegrid")


In [None]:
# Read the training CSV (path is relative to the working directory) into `df`
df = pd.read_csv(filepath_or_buffer='train.csv')
print("Dataset loaded. Shape:", df.shape)
# Final expression of the cell: render the first five rows as a preview
df.head(n=5)


In [None]:
# Handle missing values in every column of `df` (numeric -> mean, other -> mode).
# Note: test_data is cloned from df to demonstrate on both
# NOTE(review): the loop covers all columns, so the target is imputed too, and
# the fill statistics come from the full frame before the train/validation
# split performed later in the notebook.
test_data = df.copy()

for column in df.columns:
    series = df[column]

    if pd.api.types.is_numeric_dtype(series):
        # Numeric column: fill gaps with the column mean.
        fill_value = series.mean()
    else:
        # Non-numeric column (object, string, category, ...): fill gaps with
        # the most frequent value. Checking "is numeric" rather than
        # `dtype == 'object'` keeps string/category columns out of the mean
        # branch, where `.mean()` would raise.
        modes = series.mode()
        if modes.empty:
            # Every value is missing, so there is no mode to impute from;
            # leave the column untouched instead of failing on `mode()[0]`.
            continue
        fill_value = modes.iloc[0]

    # Apply the same fill value to both frames so they stay consistent.
    df[column] = series.fillna(fill_value)
    test_data[column] = test_data[column].fillna(fill_value)


In [None]:
# Column names used by the house-price model: one response, five predictors
target = 'SalePrice'
features = ['GrLivArea', 'TotalBsmtSF', 'GarageArea', 'OverallQual', 'YearBuilt']

# Slice the predictor frame and the response series out of `df`
X, y = df[features], df[target]


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)


In [None]:
# Fit a linear regression on the training split; `fit` returns the estimator
# itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out validation rows
y_pred = model.predict(X_val)


In [None]:
# Score the validation predictions with MAE, MSE and R², in that order
mae, mse, r2 = (
    metric(y_val, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the errors to two decimals and R² to four
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.4f}")


In [None]:
#  Plot: Actual vs Predicted Sale Prices
# Predictions go on the x-axis and observed values on the y-axis, matching the
# axis labels set below.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred, y_val, alpha=0.6)

# Identity line (perfect prediction). Its end points cover the combined range
# of actual and predicted values, so the line is not cut short when the model
# predicts outside the observed price range.
lower = min(y_val.min(), y_pred.min())
upper = max(y_val.max(), y_pred.max())
ax.plot([lower, upper], [lower, upper], 'r--')

ax.set_xlabel("Predicted Sale Price")
ax.set_ylabel("Actual Sale Price")
ax.set_title("Actual vs Predicted Sale Price")
fig.tight_layout()
plt.show()


In [None]:
#  Residual Analysis
# Residual = actual minus predicted, so positive values are under-predictions
residuals = y_val - y_pred

# Figure 1: residuals against predictions, with a dashed zero-error baseline
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(y_pred, residuals, alpha=0.5)
scatter_ax.axhline(y=0, color='red', linestyle='--')
scatter_ax.set_xlabel("Predicted Sale Price")
scatter_ax.set_ylabel("Residuals")
scatter_ax.set_title("Residual Plot")
scatter_fig.tight_layout()
plt.show()

# Figure 2: histogram of the residuals with a kernel density overlay
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, ax=hist_ax)
hist_ax.set_xlabel("Residuals")
hist_ax.set_title("Distribution of Residuals")
hist_fig.tight_layout()
plt.show()


In [None]:
# Tabulate the fitted coefficients, one row per feature. Each value is the
# change in the prediction per one-unit change of that feature, in the
# feature's own units, so magnitudes are not directly comparable across rows.
coef_df = pd.Series(model.coef_, index=features, name='Coefficient').to_frame()
print("Linear Regression Coefficients:")
print(coef_df)
