In [None]:
## 🏠 House Price Prediction Pipeline
# Author: Randy Jin
# For: Machine Learning Engineer Coding Test @ OpenHouse.ai

In [None]:
# ✅ 1. Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

import joblib
import os

In [None]:
# 2. Load data
# Sales at or above this price are treated as outliers and excluded.
MAX_SALE_PRICE = 500000

df = pd.read_csv("dataset.csv")

# Remove outliers. The explicit .copy() makes the filtered frame independent
# of the raw one, so adding a column below does not trigger pandas'
# SettingWithCopyWarning (assignment on a slice of another frame).
df = df.loc[df['SalePrice'] < MAX_SALE_PRICE].copy()

# Log-transform the target variable; log1p is inverted later with expm1.
df['LogSalePrice'] = np.log1p(df['SalePrice'])

# Preview as the cell's final expression so that it is actually displayed
# (a bare df.head() in the middle of a cell is evaluated and thrown away).
df.head()


In [None]:
# 3. Quick data overview
print(df.shape)

# DataFrame.info() writes its report straight to stdout.
df.info()

# Only the last expression of a cell is echoed by the notebook, so the
# summary statistics have to be printed explicitly or they are discarded.
print(df.describe())

# Per-column null counts, largest first (displayed as the cell result).
df.isnull().sum().sort_values(ascending=False)

In [None]:
# 4. Feature & target split
# Features: every column except the raw target and its log-transformed copy.
X = df.drop(columns=['SalePrice', 'LogSalePrice'])
# Target: the log-transformed sale price.
y = df['LogSalePrice']

In [None]:
# 5. Separate features by dtype
# Column names are collected into plain lists. A column whose dtype is not
# int64/float64/object/category ends up in neither list.
df_numeric = list(X.select_dtypes(include=["int64", "float64"]).columns)
df_categorical = list(X.select_dtypes(include=["object", "category"]).columns)

print("Original Numeric fields count: ", len(df_numeric))
print("Original Categorical fields count: ", len(df_categorical))

In [None]:
# Missing-value report: absolute count and percentage of rows, per column.
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

missing_df = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percent}
)

# Show only the columns that actually contain nulls, worst offenders first.
print("\nColumns with missing values:")
print(
    missing_df.loc[missing_df['Missing Values'] > 0]
    .sort_values(by='Percentage', ascending=False)
)

In [None]:
# ### Exploratory Data Analysis
# Every figure is written to the local "images" directory and then closed,
# so nothing is rendered inline by this cell.
os.makedirs("images", exist_ok=True)

# Distribution of the target variable (SalePrice)
plt.figure(figsize=(10, 6))
sns.histplot(df['SalePrice'], kde=True)
plt.title('Distribution of Sale Price')
plt.xlabel('Sale Price ($)')
plt.ylabel('Frequency')
filename = os.path.join("images", "sale_price_distribution.png")
plt.savefig(filename)
plt.close()

# Same distribution after the log1p transform
plt.figure(figsize=(10, 6))
sns.histplot(np.log1p(df['SalePrice']), kde=True)
plt.title('Distribution of Log-Transformed Sale Price')
plt.xlabel('Log(Sale Price + 1)')
plt.ylabel('Frequency')
filename = os.path.join("images", "log_sale_price_distribution.png")
plt.savefig(filename)
plt.close()

# Correlation analysis for numerical features
numerical_data = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numerical_data.corr()

# Correlation heatmap. Cells are not annotated, so no number format is needed.
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
filename = os.path.join("images", "correlation_heatmap.png")
plt.savefig(filename)
plt.close()

# Top correlations with SalePrice.
# The target itself (correlation 1.0) and its derived log transform are
# removed first; otherwise they take up slots in the "top 10" without being
# features at all. errors='ignore' keeps this working if the log column has
# not been added to df.
sale_price_corr = (
    correlation_matrix['SalePrice']
    .drop(labels=['SalePrice', 'LogSalePrice'], errors='ignore')
    .sort_values(ascending=False)
    .to_frame(name='Correlation with SalePrice')
)
print("\nTop 10 Features Most Correlated with SalePrice:")
print(sale_price_corr.head(10))

In [None]:
# 6. Preprocessing pipelines
# Numeric columns: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column list to its matching sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, df_numeric),
    ("cat", categorical_pipeline, df_categorical),
])

In [None]:
# Debug: run the preprocessor on the full feature frame to inspect the shape
# and the column names it produces.
debug_pipeline = make_pipeline(preprocessor)
transformed = debug_pipeline.fit_transform(X)
print(transformed.shape)

# Pull the fitted one-hot encoder out of the categorical sub-pipeline
# (note: it is a step inside that pipeline, not a top-level transformer).
encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']

# All column names generated by the one-hot encoding.
encoded_feature_names = encoder.get_feature_names_out(df_categorical)

print("üöÄ Transformed Categorical fields count: ", len(encoded_feature_names))

print("üöÄ Column name after OneHot encoding:")
print(encoded_feature_names)

# Numeric names first, then the encoded categorical names.
final_feature_names = [*df_numeric, *encoded_feature_names.tolist()]
print("üöÄ Transformed fields in total: ", len(final_feature_names))
print("üöÄ All of the column name after OneHot encoding:")
print(final_feature_names)

In [None]:
# 7. Build the full pipeline: preprocessing followed by a linear regressor.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]
)

In [None]:
# 8. Train/test split: hold out 20% of the rows, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 9. Train model
# Fits the preprocessing steps and the regressor on the training split only;
# the target passed in is the log-transformed price.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# 10. Predict & evaluate
# The model predicts log1p(price); expm1 maps both predictions and ground
# truth back to the original scale before any metric is computed.
y_pred_log = pipeline.predict(X_test)
y_pred_actual = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

mae = mean_absolute_error(y_test_actual, y_pred_actual)
mse = mean_squared_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_actual, y_pred_actual)

print("\nüìä Evaluation Metrics (Converted to Original SalePrice):")
# The two error metrics share a format; labels are padded to equal width.
for label, value in (("MAE ", mae), ("RMSE", rmse)):
    print(f"{label}: {value:.2f}")
print(f"R^2 : {r2:.4f}")

In [None]:
# 11. Save model
# Persists the whole fitted pipeline (preprocessing + regressor) in one file.
joblib.dump(value=pipeline, filename="house_price_pipeline.pkl")

In [None]:
# 12. Optional: plot prediction vs actual
# Both axes use the original price scale (y_test_actual / y_pred_actual from
# the evaluation cell). The name `y_pred` is never defined in this notebook,
# and `y_test` holds log-transformed prices, which would contradict the
# "Price" axis labels.
plt.figure(figsize=(8, 6))
plt.scatter(y_test_actual, y_pred_actual, alpha=0.5)

# Dashed red diagonal marks a perfect prediction (predicted == actual).
actual_min = y_test_actual.min()
actual_max = y_test_actual.max()
plt.plot([actual_min, actual_max], [actual_min, actual_max], '--r')

plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Prediction vs Actual")
plt.tight_layout()
plt.savefig("prediction_vs_actual.png")
plt.show()