# Baseline Model

This notebook implements a baseline model for predicting Airbnb prices.

## Steps:
1. Load Data
2. Preprocessing (Clean price, Log transform, Impute, One-Hot Encoding)
3. Train Baseline Model (Ridge)
4. Evaluate (5-fold cross-validated RMSE on the log-price target)
5. Generate Submission

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

from src.features import (
    add_features, add_log_target, get_feature_columns,
    build_reviews_features, merge_reviews_features,
    build_calendar_features, merge_calendar_features
)


In [2]:
# Read the four raw CSV inputs: the training and test tables, plus the reviews
# and calendar tables that the feature builders consume in the next cell.
train_df, test_df, reviews_df, calendar_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/reviews.csv",
        "../data/calendar.csv",
    )
)


In [3]:
# Reference date handed to the review-feature builder: the largest
# `last_scraped` value found in the training table.
# NOTE(review): the CSVs are read without date parsing, so this max() is
# presumably a string comparison -- confirm `last_scraped` is ISO-formatted.
ref_date = train_df["last_scraped"].max()

# Build each auxiliary feature table once; both splits reuse the same table.
reviews_feat = build_reviews_features(reviews_df, ref_date)
cal_feat = build_calendar_features(calendar_df)

# Attach the review features, then the calendar features, to train and test.
# NOTE(review): train_df / test_df are rebound here, so re-running this cell on
# its own feeds already-merged frames back into the helpers -- re-run the load
# cell first.
train_df, test_df = (
    merge_reviews_features(train_df, reviews_feat),
    merge_reviews_features(test_df, reviews_feat),
)
train_df, test_df = (
    merge_calendar_features(train_df, cal_feat),
    merge_calendar_features(test_df, cal_feat),
)

# Same feature engineering for both splits; only the training frame is passed
# through add_log_target.
train_df, test_df = add_features(train_df), add_features(test_df)
train_df = add_log_target(train_df)


In [4]:
# Columns used as model inputs, as chosen by the project helper.
feature_cols = get_feature_columns(train_df)

# Only training rows with a non-missing `log_price` can be used for fitting.
has_target = train_df["log_price"].notna()
train_model_df = train_df[has_target].copy()

# Independent copies of the design matrices and the target vector, so later
# cells cannot modify train_df / test_df through them.
X_train = train_model_df[feature_cols].copy()
y_train = train_model_df["log_price"].copy()
X_test = test_df[feature_cols].copy()

# Sanity report: shapes, and total number of missing cells per matrix.
print("X_train:", X_train.shape, "y_train:", y_train.shape, "X_test:", X_test.shape)
n_missing_train = X_train.isna().sum().sum()
n_missing_test = X_test.isna().sum().sum()
print("NaNs in X_train:", n_missing_train)
print("NaNs in X_test :", n_missing_test)


X_train: (20804, 21) y_train: (20804,) X_test: (4750, 21)
NaNs in X_train: 45287
NaNs in X_test : 10611


In [5]:
# Median-impute every feature column. The medians are computed on the training
# features only and reused for the test features, so no test-set statistic is
# used to fill values. fillna() returns new frames; X_train / X_test are left
# untouched.
train_medians = X_train.median()
X_train_filled = X_train.fillna(train_medians)
X_test_filled = X_test.fillna(train_medians)

# Confirm that no missing values survive the fill.
remaining_train = X_train_filled.isna().sum().sum()
remaining_test = X_test_filled.isna().sum().sum()
print("NaNs after fill (train):", remaining_train)
print("NaNs after fill (test) :", remaining_test)


NaNs after fill (train): 0
NaNs after fill (test) : 0


In [6]:
# Baseline: ridge regression scored with shuffled, seeded 5-fold
# cross-validation on the `log_price` target.
model = Ridge(alpha=1.0, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The "neg_root_mean_squared_error" scorer returns the RMSE with its sign
# flipped (scikit-learn scorers follow a higher-is-better convention), so the
# scores are negated below to recover one positive RMSE per fold.
# NOTE(review): X_train_filled was imputed with medians taken over the whole
# training set, so each validation fold contributed to its own fill values.
scores = cross_val_score(
    model,
    X_train_filled,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv,
)
rmse_per_fold = -scores

print("CV RMSE mean:", rmse_per_fold.mean())
print("CV RMSE std :", rmse_per_fold.std())


CV RMSE mean: 0.6434748745595933
CV RMSE std : 0.020366170540238832


In [7]:
# Refit the baseline on the full imputed training set, then predict the imputed
# test features. fit() returns the estimator itself, so the calls can be chained.
test_pred_log = model.fit(X_train_filled, y_train).predict(X_test_filled)

# Quick range check of the predictions, which are on the `log_price` scale.
pred_low, pred_avg, pred_high = (
    test_pred_log.min(),
    test_pred_log.mean(),
    test_pred_log.max(),
)
print("pred_log min/mean/max:", pred_low, pred_avg, pred_high)


pred_log min/mean/max: 6.031275837753995 7.78292019544858 10.958068979852001


In [8]:
# Build the submission frame with the required column names.
# NOTE(review): np.expm1 is the inverse of log1p -- this assumes add_log_target
# built `log_price` with log1p; confirm against src.features.
sub = pd.DataFrame({
    "ID": test_df["id"],
    "TARGET": np.expm1(test_pred_log),
})

# Safety: no negative prices. expm1 can return values in (-1, 0) when a
# predicted log value is negative; those are floored at zero.
sub["TARGET"] = sub["TARGET"].clip(lower=0)

sub_path = "../submissions/submission_ridge.csv"
# to_csv does not create missing directories, so make sure the output folder
# exists (e.g. on a fresh clone) instead of failing at the very last step.
Path(sub_path).parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_path, index=False)

print("Saved:", sub_path)
display(sub.head())


Saved: ../submissions/submission_ridge.csv


Unnamed: 0,ID,TARGET
0,536526,2962.513925
1,124137,4678.604506
2,164216,4225.032638
3,541629,4181.400734
4,572504,1204.323623
