In [1]:
#Cell 1 - Import Libraries & Setup Paths

import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Plot styling for any figures rendered later in the notebook.
sns.set(style="whitegrid")

# Silence only deprecation-style chatter. A blanket filterwarnings('ignore')
# would also hide UserWarning / RuntimeWarning, which is where libraries
# usually report problems that matter at inference time (e.g. a model pickled
# by a different library version, or invalid numeric operations).
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Put the project root (the parent of the notebook directory) on sys.path so
# that the `src.*` imports in the next cell resolve. The membership test keeps
# re-running this cell from adding duplicate entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Directories used by this notebook, relative to the notebook's working directory.
figures_dir = "../reports/figures"
text_dir = "../reports/text"
processed_dir = "../data/processed"
raw_dir = "../data/raw"

# Create the output locations up front. raw_dir is only read from (test.csv),
# so it is not created here.
os.makedirs(figures_dir, exist_ok=True)
os.makedirs(text_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)


In [2]:
#Cell 2 - Import Preprocessing & Feature Engineering Functions and Model Loading
# These are project-local modules under `src/`. They resolve only because
# Cell 1 appended the project root to sys.path, so run Cell 1 first.

from src.data_preprocessing import impute_missing_values  # applied to the test frame in Cell 4
from src.feature_engineering import engineer_features, encode_categorical
from src.models import load_model

# Reaching this line means all three imports above succeeded.
print("Modules imported successfully.")


Modules imported successfully.


In [3]:
#Cell 3 - Load Final Ensemble Model
# Build the artifact path under processed_dir and load it through the
# project's load_model helper. The resulting `ensemble_model` is what Cell 6
# calls .predict() on.
# NOTE(review): the .pkl extension suggests load_model unpickles the file.
# Unpickling can execute arbitrary code, so only load artifacts produced by
# this project's own training run — confirm against src.models.

ensemble_model_path = os.path.join(processed_dir, "ensemble_model.pkl")
ensemble_model = load_model(ensemble_model_path)
print("Ensemble model loaded from:", ensemble_model_path)


Ensemble model loaded from: ../data/processed/ensemble_model.pkl


In [4]:
#Cell 4 - Load and Process Test Data
# Reads the raw test set, keeps the Ids for the submission file, then runs the
# project's imputation, feature-engineering and categorical-encoding helpers.
# Produces: ids, df_test (imputed + engineered), df_test_encoded.

test_data_path = os.path.join(raw_dir, "test.csv")
df_test = pd.read_csv(test_data_path)
print("Test data loaded. Shape:", df_test.shape)

# Preserve the 'Id' column for submission. Take a copy so that nothing the
# helpers below do to df_test (in place or otherwise) can alter the saved Ids.
ids = df_test['Id'].copy()

# Apply missing value imputation (using similar logic as training)
df_test = impute_missing_values(df_test)

# Apply feature engineering using our module
df_test = engineer_features(df_test)

# Encode categorical features. This list must be the same one used when the
# model was trained, otherwise the encoded columns will not line up.
# NOTE(review): the list below was marked as an example — confirm it against
# the training notebook.
cat_cols = ['MSZoning', 'Neighborhood']
df_test_encoded = encode_categorical(df_test, cat_cols=cat_cols)

# The submission pairs `ids` with predictions row by row, so preprocessing must
# not add or drop rows. Fail here with a clear message instead of later, when
# the submission frame is assembled.
if len(df_test_encoded) != len(ids):
    raise ValueError(
        f"Preprocessing changed the number of test rows: {len(ids)} Ids were loaded "
        f"but {len(df_test_encoded)} rows remain after feature engineering and encoding."
    )

print("Test data processed. Shape after feature engineering and encoding:", df_test_encoded.shape)


Test data loaded. Shape: (1459, 80)
Test data processed. Shape after feature engineering and encoding: (1459, 111)


In [5]:
#Cell 5 - Prepare Features for Prediction
# In training we used numeric columns only, so select those columns and drop
# 'Id' (an identifier, not a feature) if it is present.
# NOTE(review): select_dtypes(include=['number']) does not select bool columns.
# If encode_categorical ever emits bool dummies they would be dropped here —
# confirm the dtype it produces.

numeric_test = df_test_encoded.select_dtypes(include=['number'])
if 'Id' in numeric_test.columns:
    numeric_test = numeric_test.drop(columns=['Id'])

X_test = numeric_test

# Make the feature matrix match what the model was fitted on instead of
# assuming it. Estimators fitted on a DataFrame usually record their column
# names in `feature_names_in_`; when the loaded model exposes that attribute,
# use it as the source of truth for both the column set and the column order.
expected_features = getattr(ensemble_model, "feature_names_in_", None)
if expected_features is not None:
    expected_features = list(expected_features)
    missing_features = [col for col in expected_features if col not in X_test.columns]
    unexpected_features = [col for col in X_test.columns if col not in expected_features]
    if missing_features:
        # Typically dummy columns for categories that never occur in the test
        # set, for which 0 is the correct value. Anything else listed here
        # points to a preprocessing mismatch that should be investigated.
        print("Warning: features missing from test data, filled with 0:", missing_features)
    if unexpected_features:
        print("Warning: dropping test features the model was not trained on:", unexpected_features)
    # reindex reorders to the training order, drops extras and adds the
    # missing columns filled with 0; it is a no-op when everything matches.
    X_test = X_test.reindex(columns=expected_features, fill_value=0)

print("Prepared test features shape:", X_test.shape)


Prepared test features shape: (1459, 69)


In [6]:
#Cell 6 - Generate Predictions & Create Submission File
# Generate predictions using the loaded ensemble model. ravel() flattens an
# (n, 1) result into the 1-D array a single DataFrame column requires.

y_test_pred = np.asarray(ensemble_model.predict(X_test)).ravel()

# Ids were saved before preprocessing, so every prediction must pair with
# exactly one Id. Refuse to write a submission if the counts differ.
if len(y_test_pred) != len(ids):
    raise ValueError(
        f"Got {len(y_test_pred)} predictions for {len(ids)} test Ids; "
        "cannot build a row-aligned submission."
    )

# Report missing predictions before they end up in the submission file.
n_missing_pred = int(pd.isna(y_test_pred).sum())
if n_missing_pred:
    print(f"Warning: {n_missing_pred} predictions are NaN; check the test features for missing values.")

# Create a submission DataFrame with Id and SalePrice. Plain arrays are used
# so the two columns are paired purely by position.
submission = pd.DataFrame({
    'Id': np.asarray(ids),
    'SalePrice': y_test_pred
})

# Save the submission file to processed folder (without the index column)
submission_path = os.path.join(processed_dir, "submission.csv")
submission.to_csv(submission_path, index=False)
print("Submission file saved to:", submission_path)


Submission file saved to: ../data/processed/submission.csv


In [None]:
#Cell 7 - Optional: Display Submission Sample
# Quick visual check of the submission built in Cell 6: head() returns the
# first five rows by default. `display` is provided by the IPython/Jupyter
# session, so this cell only works inside a notebook kernel.

display(submission.head())


Unnamed: 0,Id,SalePrice
0,1461,128691.4317
1,1462,158538.327457
2,1463,184800.778271
3,1464,185981.785912
4,1465,199741.577749
