In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
DATA_PATH = '../data/raw/train.csv'

# Read the raw table, keep only the three predictors plus the target,
# and discard any row that has a missing value in one of those columns.
raw = pd.read_csv(DATA_PATH)
selected = raw[['GrLivArea', 'BedroomAbvGr', 'FullBath', 'SalePrice']].dropna()

# Keep only rows whose GrLivArea is strictly below the 99th percentile
# (computed on the already-cleaned rows) to trim the largest values.
area_cutoff = selected['GrLivArea'].quantile(0.99)
df = selected[selected['GrLivArea'] < area_cutoff]

# Preview the first rows of the frame used by the rest of the notebook.
df.head()


In [None]:
# Scatter each predictor against SalePrice on at most 500 rows.
# The sample is seeded so the figure is identical on every
# Restart & Run All (an unseeded sample redraws different points each run).
plot_sample = df.sample(min(500, len(df)), random_state=42)

sns.pairplot(plot_sample,
             y_vars='SalePrice',
             x_vars=['GrLivArea', 'BedroomAbvGr', 'FullBath'],
             height=4)
plt.show()

# Correlation of every column with SalePrice (pandas' default method),
# computed on the full frame rather than the plotted sample, strongest first.
df.corr()['SalePrice'].sort_values(ascending=False)


In [None]:
# Design matrix (three predictors) and target vector.
feature_cols = ['GrLivArea', 'BedroomAbvGr', 'FullBath']
X = df[feature_cols]
y = df['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training rows, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [None]:
# Error metrics on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2: {r2:.4f}")

# Persist the fitted estimator, creating the output directory if it is missing.
model_dir = '../models'
model_path = '../models/linear_regression.pkl'
os.makedirs(model_dir, exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(lr, model_file)

# One row per predictor with its fitted coefficient; shown as the cell output.
coeffs = pd.DataFrame({'feature': X.columns, 'coef': lr.coef_})
coeffs
