<h1 style="text-align:center">Build and Evaluate Time Series Models</h1>
<h2 style="text-align:center">House Pricing - House Quality</h2>
<p style="text-align:center">Robert Evans</p>
<p style="text-align:center">School of Technology & Engineering, National University</p>
<p style="text-align:center">DDS-8555: Predictive Analysis</p>
<p style="text-align:center">Dr. Mohammad Yavarimanesh</p>
<p style="text-align:center">March 2, 2025</p>

## Import Libraries

In [48]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Load Data

In [49]:
# Toggle for the execution environment: True when the CSVs sit next to the
# notebook (Kaggle run), False when they live in the local course data folder.
kaggle = False

# Directory that contains train.csv and test.csv.
# "." is used instead of "" because the f-strings below would otherwise
# produce "/train.csv", i.e. a path at the filesystem root.
if kaggle:
    path = "."
else:
    path = "Week8_Data_Houses"

# train.csv is used for fitting/evaluation; test.csv is scored for the submission.
train = pd.read_csv(f"{path}/train.csv")
test = pd.read_csv(f"{path}/test.csv")

## Build Model

In [50]:
# Define features and target
# Predictor columns (quality-rating fields) and the response column name.
features = ['ExterQual', 'BsmtQual', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC', 'Fence']
target = 'SalePrice'

# Feature matrix and response taken from the training file. The encoding and
# train/test-split cells read these as X and y, so they must exist before
# those cells run on a fresh kernel.
X = train[features]
y = train[target]

In [51]:
# Turn every categorical feature into 0/1 indicator columns.
# drop='first' removes one level per feature to avoid multicollinearity;
# handle_unknown='ignore' lets transform() accept categories not seen in fit().
ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    drop='first',
)
X_encoded = ohe.fit_transform(X)

# Names the encoder generated for the indicator columns.
feature_names = ohe.get_feature_names_out(features)

# Keep the encoded matrix as a labelled DataFrame so later steps can refer to columns by name.
X_encoded_df = pd.DataFrame(data=X_encoded, columns=feature_names)

In [52]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded_df,
    y,
    test_size=0.2,
    random_state=42,
)

# Replace the shuffled row labels with a fresh 0..n-1 index on every piece so
# features and target line up positionally.
X_train, X_test, y_train, y_test = (
    X_train.reset_index(drop=True),
    X_test.reset_index(drop=True),
    y_train.reset_index(drop=True),
    y_test.reset_index(drop=True),
)

# Add the intercept column to both design matrices
# (has_constant='add' appends it even if a constant-looking column already exists).
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

In [53]:
# Estimate the OLS model using only the training split (response first, design matrix second).
model = sm.OLS(endog=y_train, exog=X_train_const).fit()

In [54]:

# Score the held-out rows with the fitted model.
y_pred = model.predict(exog=X_test_const)

In [55]:
# Hold-out metrics: R² and RMSE (square root of the mean squared error).
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Emit the three report lines in a single call, one per line.
print(
    "Test Set Performance:",
    f"R²: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    sep="\n",
)

Test Set Performance:
R²: 0.6696
RMSE: 50342.59


In [56]:
# Show the full statsmodels regression table (coefficients, standard errors, p-values)
# under a heading, preceded by a blank line.
print("\nOLS Regression Summary:", model.summary(), sep="\n")


OLS Regression Summary:
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.674
Method:                 Least Squares   F-statistic:                     90.22
Date:                Tue, 04 Mar 2025   Prob (F-statistic):          1.58e-260
Time:                        11:28:24   Log-Likelihood:                -14135.
No. Observations:                1168   AIC:                         2.833e+04
Df Residuals:                    1140   BIC:                         2.847e+04
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            

## Kaggle Submission

In [58]:
# Select the same predictor columns from test.csv that the model was built on.
X_new = test[features]

# Encode with the already-fitted OneHotEncoder (transform only, no refit) so the
# indicator columns match the ones produced for the training data.
X_new_encoded = ohe.transform(X_new)

# Label the encoded array with the column names obtained from the encoder earlier.
X_new_encoded_df = pd.DataFrame(data=X_new_encoded, columns=feature_names)

# Add the intercept column so the layout matches the model's design matrix.
X_new_encoded_const = sm.add_constant(X_new_encoded_df, has_constant='add')

# Predicted SalePrice for every row of test.csv from the fitted OLS model.
predictions = model.predict(exog=X_new_encoded_const)

print("Predicted SalePrice values for test.csv:")
predictions

Predicted SalePrice values for test.csv:




0       120345.852013
1       146330.126611
2       185724.609174
3       180081.363940
4       196174.953179
            ...      
1454     97831.197160
1455    122948.181018
1456    162762.828331
1457    120792.978004
1458    188326.938179
Length: 1459, dtype: float64

In [60]:
# Build the submission table: one row per test house, keyed by its Id.
# The identifier column is named 'Id' (capital I) to match the column it is
# read from in test.csv and the header of the competition's sample submission;
# it was previously written as 'id'.
submission_df = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': predictions
})

# Write the file without the DataFrame index so it contains only Id and SalePrice.
submission_df.to_csv("house_prediction_1.csv", index=False)
print("Submission file saved as house_prediction_1.csv")

Submission file saved as house_prediction_1.csv
