# Phase VIII — Final Prediction Pipeline

This notebook generates final predictions on the Kaggle test dataset using
the trained linear regression model and writes them to `predicted_sale_prices.csv`.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import sys
import os
# Put the parent directory on the import path so the `scripts` package
# (imported further down) resolves. Assumes the notebook is run from a folder
# one level below the project root — TODO confirm.
sys.path.append(os.path.abspath(".."))

In [3]:
# Load the raw test set. The relative path assumes the notebook's working
# directory sits next to the project's `data/` folder.
test_df = pd.read_csv("../data/test.csv")

In [4]:
# Sanity check on the raw test frame (recorded run: 1459 rows, 80 columns).
test_df.shape

(1459, 80)

In [7]:
from scripts.data_preprocessing import (
    nan_check,
    none_check_cat,
    categorical_cols,
    onehot_encode_categorical
)

In [8]:
# handle missing values
# NOTE(review): `nan_check` is a project helper (scripts.data_preprocessing);
# whether it returns a copy or mutates `test_df` in place is not visible
# here — confirm before relying on `test_df` staying raw.
test_df_clean = nan_check(test_df)

In [9]:
# Second cleaning pass through the project helper `none_check_cat`
# (presumably labels missing categorical values — confirm against
# scripts.data_preprocessing). Rebinds `test_df_clean` to the result.
test_df_clean = none_check_cat(test_df_clean)

In [10]:
# onehot encoding for categorical features
# `categorical_cols` selects the columns to encode; the encoded frame becomes
# the test feature matrix. Because the test frame is encoded on its own, its
# column set can differ from the training frame's and has to be aligned
# before the model is applied.
categorical = categorical_cols(test_df_clean)
X_test_df = onehot_encode_categorical(test_df_clean, categorical)

In [11]:
# Sanity check after encoding (recorded run: 1459 rows, 292 columns).
X_test_df.shape

(1459, 292)

In [34]:
# reload train features structure
# The raw training set is read again here; later cells rebuild the training
# matrix from it and refit the model inside this notebook.
train_df = pd.read_csv("../data/train.csv")

In [35]:
# Regression target is the natural log of SalePrice; predictions are mapped
# back to prices with np.exp further down.
y_train = np.log(train_df["SalePrice"].to_numpy())

In [36]:
# Drop the target, then run the training features through the same two
# cleaning helpers that were applied to the test frame.
X_train_df = (
    train_df.drop(columns="SalePrice")
    .pipe(nan_check)
    .pipe(none_check_cat)
)

# One-hot encode the categorical columns detected on the cleaned frame.
categorical_train = categorical_cols(X_train_df)
X_train_df = onehot_encode_categorical(X_train_df, categorical_train)


In [37]:
# Bring train and test into one shared feature space.
#
# The two frames are one-hot encoded independently, so a category that appears
# in only one split produces a column the other split lacks. The test frame is
# projected onto the training columns first: columns unseen in train are
# dropped, and columns missing from test are added as all zeros.
#
# NOTE(review): this cell previously read `X_test_df_aligned` although no
# visible cell defines it, so it only worked with leftover kernel state and
# raised NameError after Restart & Run All. It is now built here. Confirm the
# training columns are the intended reference (recorded run: 304 features).
X_test_df_aligned = X_test_df.reindex(
    columns=X_train_df.columns,
    fill_value=0
)

# Reindex train against the same column list so both matrices are guaranteed
# to share column order and width.
X_train_df_aligned = X_train_df.reindex(
    columns=X_test_df_aligned.columns,
    fill_value=0
)

In [38]:
# Dense float64 matrix of the aligned training features for the NumPy model.
X_train_np = X_train_df_aligned.to_numpy(dtype=np.float64)

In [39]:
# Standardize each feature with statistics taken from the training matrix.
#
# NOTE(review): `mu` and `sigma` were previously read without any visible cell
# defining them (hidden kernel state), so this cell failed on a fresh kernel.
# They are now computed here; if they were meant to come from an earlier
# phase's saved artifacts, load those instead.
mu = X_train_np.mean(axis=0)
sigma = X_train_np.std(axis=0)

# A constant column (e.g. a dummy that is all zero in train) has zero spread.
# Dividing by 1 instead keeps it at 0 rather than producing NaN/inf.
sigma = np.where(sigma == 0, 1.0, sigma)

X_train_scaled = (X_train_np - mu) / sigma

In [40]:
# Prepend a column of ones to the scaled training matrix (presumably so the
# model's first weight serves as the intercept — confirm against the model).
X_train_bias = np.hstack(
    [np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
)

In [41]:
# Sanity check: rows x (features + 1 ones column); recorded run: (1460, 305).
X_train_bias.shape

(1460, 305)

In [42]:
# Fit the from-scratch linear regression on the scaled, ones-augmented
# training matrix against the log-price target.
# NOTE(review): `LinearRegressionMaster` is not imported in any visible cell,
# so this raises NameError on a fresh kernel — add the import from the project
# module that defines it.
model = LinearRegressionMaster()
model.fit_gradient_descent(
    X_train_bias,
    y_train,
    # presumably the learning rate and the number of passes — confirm against
    # LinearRegressionMaster.fit_gradient_descent
    alpha=0.01,
    epochs=1000
)

In [44]:
# Build the test design matrix with exactly the same transform as train.
#
# NOTE(review): `X_test_scaled` was previously read without any visible cell
# defining it (hidden kernel state). It is now derived here from the aligned
# test frame, scaled with the *training* `mu` / `sigma` so that train and test
# share one feature scale. Test-set statistics must not be used here.
X_test_np = X_test_df_aligned.values.astype(np.float64)
X_test_scaled = (X_test_np - mu) / sigma

# Prepend the column of ones, mirroring the layout of `X_train_bias`.
X_test_bias = np.c_[
    np.ones((X_test_scaled.shape[0], 1)),
    X_test_scaled
]

In [45]:
# Sanity check: the column count must equal that of X_train_bias
# (recorded run: (1459, 305)).
X_test_bias.shape

(1459, 305)

In [46]:
# predict log(SalePrice) for test data
# The model was fit on log-prices, so its outputs are on the log scale.
y_test_log_pred = model.predict(X_test_bias)
# sanity
# One prediction per test row is expected (recorded run: (1459,)).
y_test_log_pred.shape

(1459,)

In [47]:
# convert log-price back to actual price
# np.exp inverts the np.log applied to the training target.
y_test_pred = np.exp(y_test_log_pred)
# sanity checks
# Displays (min, mean, max) of the predicted prices for a quick plausibility check.
y_test_pred.min(), y_test_pred.mean(), y_test_pred.max()

(np.float64(35633.554357266235),
 np.float64(177267.81817723825),
 np.float64(1367562.588154909))

In [50]:
# Pair each test-set Id with its predicted price on the original price scale.
predictions = test_df[["Id"]].assign(Predicted_SalePrice=y_test_pred)

predictions.head(2)

Unnamed: 0,Id,Predicted_SalePrice
0,1461,120839.42634
1,1462,157053.115856


In [51]:
# Write the predictions to the current working directory, without the index.
# NOTE(review): the following cell writes the same frame to ../outputs/ —
# confirm that both copies are wanted.
predictions.to_csv("predicted_sale_prices.csv", index=False)

In [52]:
# Persist the predictions in the project's outputs folder, without the index.
# DataFrame.to_csv does not create missing parent directories, so create the
# folder first; on a fresh checkout the write would otherwise raise OSError.
os.makedirs("../outputs", exist_ok=True)
predictions.to_csv("../outputs/predicted_sale_prices.csv", index=False)

# End

Note: The predictions generated here are not submitted to Kaggle.
This step demonstrates the full inference pipeline on unseen data
using a linear regression model built from scratch.