In [1]:
# This notebook assumes it is run from the `notebooks/` directory.
# All paths are written relative to that directory (e.g. `../data/`, `../models/`) and resolve to folders under the project root.

In [1]:
import os
# Show the kernel's working directory; the relative paths used in later cells
# (`../data/`, `../models/`) are resolved against it.
print(os.getcwd())

C:\Users\Sameer Kumar\Documents\GitHub\satellite-imagery-property-valuation\notebooks


In [2]:
import pandas as pd
import pickle
import numpy as np
import os

# Echo the working directory and the data folder listing so a wrong launch
# location is visible before any file is read.
print(f"Working directory: {os.getcwd()}")
print(f"Data folder: {os.listdir('../data/')}")

# Tabular training table. The id column is cast to str because it is later
# used as the lookup key into `img_emb` (see `df["id"].map(img_emb)`).
df = pd.read_csv("../data/train.csv").astype({"id": str})

# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced by this project.
with open("../models/image_embeddings.pkl", "rb") as emb_file:
    img_emb = pickle.load(emb_file)

print(f"Tabular rows: {len(df)}")
print(f"Image embeddings: {len(img_emb)}")

Working directory: C:\Users\Sameer Kumar\Documents\GitHub\satellite-imagery-property-valuation\notebooks
Data folder: ['.gitkeep', 'test.csv', 'train.csv']
Tabular rows: 16209
Image embeddings: 14599


In [3]:
# Merge tabular and image data.

# Look up each row's embedding by id. Assuming `img_emb` is a plain dict,
# ids without an entry come back as NaN.
df["image_vec"] = df["id"].map(img_emb)

# Retain only the rows that received an embedding, renumbered from 0.
df_mm = df[df["image_vec"].notna()].reset_index(drop=True)

print(f"Multimodal rows: {len(df_mm)}")

Multimodal rows: 14697


In [4]:
# Columns of the training table used as tabular model inputs.
# The order here fixes the column order of `X_tab`.
tab_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
    "lat",
    "long",
]

# Tabular block: one row per row of `df_mm`, one column per listed feature.
X_tab = df_mm[tab_features].to_numpy()

# Image block: stack the per-row embedding vectors into one 2-D array.
X_img = np.vstack(df_mm["image_vec"].tolist())

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise the tabular columns. The statistics are learned from every row
# of `X_tab`, i.e. before the train/validation split performed later.
scaler = StandardScaler().fit(X_tab)
X_tab_scaled = scaler.transform(X_tab)

In [6]:
# Fuse modalities column-wise: scaled tabular features first, then the
# (unscaled) image embedding columns.
X_fused = np.concatenate((X_tab_scaled, X_img), axis=1)

In [7]:
# Sanity check: the fused matrix should have one row per multimodal record.
print(f"Final multimodal rows: {len(df_mm)}")
print(f"Feature matrix shape: {X_fused.shape}")

Final multimodal rows: 14697
Feature matrix shape: (14697, 2063)


In [8]:
# Regression target, row-aligned with `X_tab_scaled` and `X_fused`
# (all three are derived from `df_mm`).
y = df_mm["price"].to_numpy()

In [9]:
from sklearn.model_selection import train_test_split

# Split the tabular-only matrix, the fused matrix and the target in ONE call.
# train_test_split applies the same shuffled row indices to every array it is
# given, so the tabular and multimodal models are guaranteed to be trained and
# validated on exactly the same properties. Previously this relied on two
# separate calls repeating identical `test_size` / `random_state` values.
(
    Xt_train, Xt_val,
    Xf_train, Xf_val,
    yt_train, yt_val,
) = train_test_split(
    X_tab_scaled, X_fused, y, test_size=0.2, random_state=42
)

# Both feature sets share the same target partition; keep the original names
# so downstream cells continue to work unchanged.
yf_train, yf_val = yt_train, yt_val

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Gradient-boosting settings shared by both XGBoost entries, so that the
# tabular and multimodal variants differ only in the features they are fed.
_xgb_params = dict(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# Candidate estimators keyed by display name. The training loop picks the
# fused feature matrix for any name containing "Multimodal"; insertion order
# is the order in which the models are trained.
models = {
    "Linear Regression (Tabular)": LinearRegression(),
    "Random Forest (Tabular)": RandomForestRegressor(
        n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
    ),
    "XGBoost (Tabular)": XGBRegressor(**_xgb_params),
    "XGBoost (Multimodal)": XGBRegressor(**_xgb_params),
}

In [12]:
# Validation metrics per model name: {"RMSE": ..., "R2": ...}.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Multimodal entries use the fused (tabular + image) matrices; every
    # other entry uses the tabular-only matrices.
    if "Multimodal" in name:
        X_fit, y_fit, X_eval, y_eval = Xf_train, yf_train, Xf_val, yf_val
    else:
        X_fit, y_fit, X_eval, y_eval = Xt_train, yt_train, Xt_val, yt_val

    model.fit(X_fit, y_fit)
    preds = model.predict(X_eval)

    # RMSE is in the units of the target; R2 is unitless.
    rmse = np.sqrt(mean_squared_error(y_eval, preds))
    r2 = r2_score(y_eval, preds)

    results[name] = {"RMSE": rmse, "R2": r2}

    print(f"{name} RMSE: {rmse:.2f}")
    print(f"{name} R2:   {r2:.4f}")


Training Linear Regression (Tabular)...
Linear Regression (Tabular) RMSE: 189900.02
Linear Regression (Tabular) R2:   0.7084

Training Random Forest (Tabular)...
Random Forest (Tabular) RMSE: 121494.08
Random Forest (Tabular) R2:   0.8807

Training XGBoost (Tabular)...
XGBoost (Tabular) RMSE: 108089.42
XGBoost (Tabular) R2:   0.9055

Training XGBoost (Multimodal)...
XGBoost (Multimodal) RMSE: 122478.75
XGBoost (Multimodal) R2:   0.8787


In [13]:
os.makedirs("../models", exist_ok=True)

# Select the estimator explicitly by its key. The previous code pickled the
# leftover loop variable `model`, which is the multimodal estimator only as
# long as that entry stays last in `models` and no other cell rebinds `model`.
multimodal_model = models["XGBoost (Multimodal)"]

with open("../models/multimodal_xgb_model.pkl", "wb") as f:
    pickle.dump(multimodal_model, f)

In [14]:
# One row per model (columns: Model, RMSE, R2), ordered from lowest to
# highest validation RMSE with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame.from_dict(results, orient="index")
      .rename_axis("Model")
      .reset_index()
      .sort_values("RMSE", ignore_index=True)
)

results_df

Unnamed: 0,Model,RMSE,R2
0,XGBoost (Tabular),108089.416022,0.905541
1,Random Forest (Tabular),121494.083412,0.880659
2,XGBoost (Multimodal),122478.749047,0.878717
3,Linear Regression (Tabular),189900.018066,0.70844


In [15]:
# `results_df` is sorted by ascending RMSE, so its first row is the winner.
_best_row = results_df.iloc[0]
best_model_name, best_rmse, best_r2 = (
    _best_row["Model"],
    _best_row["RMSE"],
    _best_row["R2"],
)

print(f"Best model selected: {best_model_name}")
print(f"RMSE: {best_rmse:.2f}")
print(f"R2 score: {best_r2:.4f}")

Best model selected: XGBoost (Tabular)
RMSE: 108089.42
R2 score: 0.9055


In [16]:
# Same hyper-parameters as the "XGBoost (Tabular)" entry in `models`.
_final_tabular_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
final_tabular_model = XGBRegressor(**_final_tabular_params)

# Fit on every row of `X_tab_scaled` (no hold-out). These are the rows of
# `df_mm`, i.e. only the records that have an image embedding.
final_tabular_model.fit(X_tab_scaled, y)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
import pickle
import os

# Persist the refit tabular model, creating the target folder if needed.
_tabular_model_path = "../models/final_tabular_model.pkl"
os.makedirs(os.path.dirname(_tabular_model_path), exist_ok=True)

with open(_tabular_model_path, "wb") as model_file:
    pickle.dump(final_tabular_model, model_file)

print("Saved models/final_tabular_model.pkl")

Saved models/final_tabular_model.pkl


In [18]:
# Refit the selected XGBoost configuration without a hold-out set.
# The training matrix is `X_tab_scaled`, which covers the rows of `df_mm`
# (records with an image embedding), not every row of train.csv.
_final_model_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
final_model = XGBRegressor(**_final_model_params)
final_model.fit(X_tab_scaled, y)

# Write the model to disk; this overwrites any existing
# ../models/final_tabular_model.pkl.
os.makedirs("../models", exist_ok=True)
with open("../models/final_tabular_model.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)

print("Final model trained and saved")

Final model trained and saved


In [19]:
# Feature columns read from the CSV files for scaling and prediction.
# The list and its order match `tab_features` used to build the training matrix.
FEATURE_COLS = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
    "lat",
    "long",
]

In [20]:
# Raw training table re-read from disk; used here only to report how many of
# its rows the scaler is fit on.
train_df = pd.read_csv("../data/train.csv")

# Fit the inference scaler on the SAME rows the final model was trained on.
# The final model was fit on features standardised over `df_mm` (rows that
# have an image embedding). A scaler fit on all of train.csv would learn a
# different mean/variance, so test inputs would be scaled differently from
# the inputs the model saw during training.
X_train_full = df_mm[FEATURE_COLS]

scaler = StandardScaler()
scaler.fit(X_train_full)
print(f"Scaler fit on {len(X_train_full)} of {len(train_df)} training rows")

# Persist the scaler next to the model so inference can reload both.
os.makedirs("../models", exist_ok=True)
with open("../models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [23]:
# Score the test file with the persisted scaler and model, then write the
# submission CSV.
test_df = pd.read_csv("../data/test.csv")
X_test = test_df[FEATURE_COLS]

# Reload the artefacts from disk instead of reusing in-memory objects.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced by this project.
with open("../models/scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
X_test_scaled = scaler.transform(X_test)

with open("../models/final_tabular_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
test_preds = model.predict(X_test_scaled)

# NOTE(review): `../outputs` is created here, but the CSV below is written to
# `../` — confirm which location is intended.
os.makedirs("../outputs", exist_ok=True)
submission = pd.DataFrame(
    {"id": test_df["id"], "predicted_price": test_preds}
)
submission.to_csv("../24322024_final.csv", index=False)

print("final submission csv file created successfully")

final submission csv file created successfully
