In [13]:
import os
import joblib
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [24]:
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``,
        rounded to ``precision`` decimals.
    """
    squared_log_error = mean_squared_log_error(y_true, y_pred)
    root_error = np.sqrt(squared_log_error)
    return round(root_error, precision)


In [36]:
# Load the labelled training data and the unlabelled test data from the
# working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [47]:
# Numeric input columns; these are standardised before modelling.
continuous_features = [
    "TotalBsmtSF",
    "GarageCars",
    "YearBuilt",
    "MasVnrArea",
]

# String-valued input columns; these are one-hot encoded before modelling.
categorical_features = [
    "ExterQual",
    "Foundation",
    "BsmtQual",
    "SaleCondition",
]

# Column the model is trained to predict.
target_column = "SalePrice"


In [37]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and target from `train`, the frame loaded from
# train.csv. (No `df` variable is defined anywhere in this notebook, so
# reading from it fails on a fresh kernel.)
X = train[continuous_features + categorical_features].copy()
y = train[target_column].copy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Split completed. Train shape: {X_train.shape}, Test shape: {X_test.shape}")
display(X_train.head())


Split completed. Train shape: (1168, 8), Test shape: (292, 8)


Unnamed: 0,TotalBsmtSF,GarageCars,YearBuilt,MasVnrArea,ExterQual,Foundation,BsmtQual,SaleCondition
254,1314,1,1957,0.0,TA,CBlock,TA,Normal
1066,799,2,1993,0.0,Gd,PConc,Gd,Normal
638,796,0,1910,0.0,TA,CBlock,Fa,Normal
799,731,1,1937,252.0,TA,BrkTil,Gd,Normal
380,1026,1,1924,0.0,TA,BrkTil,TA,Normal


In [38]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Fit the preprocessing objects on the training split only, so no statistics
# from the held-out rows leak into them.
scaler = StandardScaler()
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Fall back to the `sparse` keyword when `sparse_output` is rejected.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

numeric_train = X_train[continuous_features]
categorical_train = X_train[categorical_features]

scaler.fit(numeric_train)
encoder.fit(categorical_train)

# Wrap the transformed arrays back into DataFrames that keep the original row
# index, so the two pieces line up when concatenated column-wise.
X_train_scaled = pd.DataFrame(
    scaler.transform(numeric_train),
    index=X_train.index,
    columns=continuous_features,
)

encoded_column_names = encoder.get_feature_names_out(categorical_features)
X_train_encoded = pd.DataFrame(
    encoder.transform(categorical_train),
    index=X_train.index,
    columns=encoded_column_names,
)

X_train_processed = pd.concat([X_train_scaled, X_train_encoded], axis=1)

print("Feature processing complete.")
display(X_train_processed.head())


Feature processing complete.


Unnamed: 0,TotalBsmtSF,GarageCars,YearBuilt,MasVnrArea,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,Foundation_BrkTil,Foundation_CBlock,...,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtQual_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
254,0.572612,-1.056544,-0.455469,-0.599984,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1066,-0.596547,0.295092,0.718609,-0.599984,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
638,-0.603357,-2.408179,-1.988293,-0.599984,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
799,-0.750921,-1.056544,-1.107734,0.857019,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
380,-0.081209,-1.056544,-1.531707,-0.599984,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
from sklearn.ensemble import RandomForestRegressor

# Train the forest on the processed training features.
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train_processed, y_train)
print("Random Forest model trained successfully.")

# Apply the already-fitted scaler and encoder to the held-out split; nothing
# is re-fitted here.
held_out_numeric = scaler.transform(X_test[continuous_features])
X_test_scaled = pd.DataFrame(
    held_out_numeric,
    index=X_test.index,
    columns=continuous_features,
)

held_out_categorical = encoder.transform(X_test[categorical_features])
X_test_encoded = pd.DataFrame(
    held_out_categorical,
    index=X_test.index,
    columns=encoder.get_feature_names_out(categorical_features),
)

X_test_processed = pd.concat([X_test_scaled, X_test_encoded], axis=1)

# Score the held-out predictions with the notebook's RMSLE helper.
y_pred = rf_model.predict(X_test_processed)
rmsle_score = compute_rmsle(y_test, y_pred)
print(f"Model evaluation complete. RMSLE: {rmsle_score}")


Random Forest model trained successfully.
Model evaluation complete. RMSLE: 0.22


In [41]:
import joblib
from pathlib import Path

# Directory that receives the serialized artifacts; created on demand.
models_path = Path("models")
models_path.mkdir(parents=True, exist_ok=True)

# Persist the trained model together with the fitted preprocessing objects.
for artifact_filename, artifact in (
    ("model.joblib", rf_model),
    ("scaler.joblib", scaler),
    ("encoder.joblib", encoder),
):
    joblib.dump(artifact, models_path / artifact_filename)

print("Model and preprocessing objects saved successfully in 'models/' folder.")


Model and preprocessing objects saved successfully in 'models/' folder.


In [42]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

def _make_dense_one_hot_encoder():
    """Create a dense OneHotEncoder that ignores unknown categories.

    Tries the `sparse_output` keyword first and falls back to `sparse` when
    the installed scikit-learn rejects it, mirroring the exploratory
    preprocessing cell of this notebook.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


def _apply_preprocessing(features, scaler, encoder, continuous, categorical):
    """Transform `features` with fitted objects into one index-aligned DataFrame."""
    scaled = pd.DataFrame(
        scaler.transform(features[continuous]),
        columns=continuous,
        index=features.index,
    )
    encoded = pd.DataFrame(
        encoder.transform(features[categorical]),
        columns=encoder.get_feature_names_out(categorical),
        index=features.index,
    )
    return pd.concat([scaled, encoded], axis=1)


def build_model(data: pd.DataFrame) -> dict[str, float]:
    """Build, train, and evaluate the model on the provided dataset.

    The data is split 80/20 with a fixed seed. A scaler and a one-hot encoder
    are fitted on the training split only, a random forest is trained on the
    processed training features, and it is scored on the held-out split.
    The model, scaler and encoder are written to the ``models/`` directory.

    Args:
        data: Frame containing the continuous and categorical feature columns
            listed below plus the ``SalePrice`` target column.

    Returns:
        A dictionary with a single key, ``"rmsle"``, holding the rounded
        RMSLE on the held-out split.
    """
    continuous = ["TotalBsmtSF", "GarageCars", "YearBuilt", "MasVnrArea"]
    categorical = ["ExterQual", "Foundation", "BsmtQual", "SaleCondition"]

    X = data[continuous + categorical]
    y = data["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit preprocessing on the training split only to avoid leaking
    # held-out statistics into the transformers.
    scaler = StandardScaler()
    encoder = _make_dense_one_hot_encoder()
    scaler.fit(X_train[continuous])
    encoder.fit(X_train[categorical])

    X_train_processed = _apply_preprocessing(X_train, scaler, encoder, continuous, categorical)

    model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
    model.fit(X_train_processed, y_train)

    X_test_processed = _apply_preprocessing(X_test, scaler, encoder, continuous, categorical)

    y_pred = model.predict(X_test_processed)
    rmsle_value = compute_rmsle(y_test, y_pred)

    # Persist everything needed to reproduce the preprocessing at inference.
    models_dir = Path("models")
    models_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, models_dir / "model.joblib")
    joblib.dump(scaler, models_dir / "scaler.joblib")
    joblib.dump(encoder, models_dir / "encoder.joblib")

    return {"rmsle": rmsle_value}


In [46]:
import pandas as pd
import os

# Regression check of the processed training features against a stored
# reference snapshot.
reference_path = "../data/processed_df.parquet"
os.makedirs("../data", exist_ok=True)

# Write the snapshot only when none exists yet. Overwriting it on every run
# would compare the current frame with itself, so the check could never fail.
# Delete the file deliberately to accept a new reference.
if not os.path.exists(reference_path):
    X_train_processed.to_parquet(reference_path, index=False)

expected_df = pd.read_parquet(reference_path)

# Actual processed DataFrame from current notebook
actual_df = X_train_processed

# The snapshot is stored without its index, so compare on a fresh positional
# index on both sides.
pd.testing.assert_frame_equal(
    actual_df.reset_index(drop=True),
    expected_df.reset_index(drop=True)
)

print("Consistency test passed: Data identical ")


Consistency test passed: Data identical 


In [45]:
import pandas as pd
import joblib

# Reload the persisted artifacts so inference uses exactly what was saved.
# NOTE: joblib.load unpickles its input; only load files you trust.
rf_model = joblib.load("models/model.joblib")
scaler = joblib.load("models/scaler.joblib")
encoder = joblib.load("models/encoder.joblib")

test_df = pd.read_csv("test.csv")

X_infer = test_df[continuous_features + categorical_features].copy()

# Impute numeric gaps with the TRAINING means stored on the fitted scaler,
# not with the means of the batch being scored: predictions must not depend
# on which other rows happen to be in the inference file.
training_means = pd.Series(scaler.mean_, index=continuous_features)
X_infer[continuous_features] = X_infer[continuous_features].fillna(training_means)

# Categorical gaps are intentionally left as NaN. The encoder was fitted on
# raw training data and learned NaN as its own category (e.g. the
# `BsmtQual_nan` column); replacing NaN with a label it never saw would be
# silently encoded as all zeros by handle_unknown="ignore".

# Keep the source index on both pieces so the column-wise concat aligns rows
# the same way the training cells do.
X_infer_scaled = pd.DataFrame(
    scaler.transform(X_infer[continuous_features]),
    columns=continuous_features,
    index=X_infer.index,
)
X_infer_encoded = pd.DataFrame(
    encoder.transform(X_infer[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
    index=X_infer.index,
)
X_infer_processed = pd.concat([X_infer_scaled, X_infer_encoded], axis=1)

predictions = rf_model.predict(X_infer_processed)

# Use the file's own Id column when present; otherwise number rows from 1.
output = pd.DataFrame({
    "Id": test_df.get("Id", pd.Series(range(1, len(predictions)+1))),
    "Predicted_SalePrice": predictions
})

print("Predictions for test data:")
display(output.head())


Predictions for test data:


Unnamed: 0,Id,Predicted_SalePrice
0,1461,123370.658333
1,1462,151132.0
2,1463,214492.0
3,1464,192240.5
4,1465,213486.885
