# Baseline Model

This notebook implements a baseline model for predicting Airbnb prices.

## Steps:
1. Load Data
2. Preprocessing (Merge review/calendar features, Clean price, Log transform, Impute, One-Hot Encoding)
3. Train Baseline Model (Ridge)
4. Evaluate (5-fold cross-validated RMSE on log price)
5. Generate Submission

In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl (8.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m363.4 kB/s[0m  [33m0:00:22[0mm0:00:01[0m00:02[0m
[?25hUsing cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.3 scikit-learn-1.8.0 threadpoolctl-3.6.0


In [22]:
import sys
import os
# Put the parent of the current working directory at the front of sys.path so
# that `src.features` (imported below) can be found there.
# NOTE(review): '..' is resolved against the kernel's working directory, so this
# presumably assumes the notebook is run from its own folder — confirm.
sys.path.insert(0, os.path.abspath('..'))

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

# Project-local feature engineering helpers used by the cells below.
from src.features import (
    add_features, add_log_target, get_feature_columns,
    build_reviews_features, merge_reviews_features,
    build_calendar_features, merge_calendar_features,
    add_room_type_dummies
)

In [23]:
# Read the four raw CSV inputs in a fixed order: train, test, reviews, calendar.
train_df, test_df, reviews_df, calendar_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/reviews.csv",
        "../data/calendar.csv",
    )
)

# Reference date handed to the reviews feature builder in the next cell.
# NOTE(review): last_scraped is not parsed to datetime here, so max() operates on
# whatever dtype read_csv inferred — presumably ISO-formatted strings, for which
# the lexical maximum is also the latest date; confirm.
ref_date = train_df["last_scraped"].max()


In [24]:
# Precompute the auxiliary feature tables once; both are merged into the train
# and test frames in the next cell.
# NOTE(review): ref_date is the max of the training data's last_scraped column;
# how build_reviews_features uses it is defined in src.features — presumably as
# the "as of" date for recency-style features; confirm.
reviews_feat = build_reviews_features(reviews_df, ref_date)
cal_feat     = build_calendar_features(calendar_df)


In [25]:
# Attach the review-derived features to train first, then test.
train_df, test_df = (
    merge_reviews_features(frame, reviews_feat) for frame in (train_df, test_df)
)

# Then attach the calendar-derived features, again train first, then test.
train_df, test_df = (
    merge_calendar_features(frame, cal_feat) for frame in (train_df, test_df)
)


In [26]:
# Run the shared feature-engineering helper over both splits (train, then test)
# so they receive the same transformation.
train_df, test_df = (add_features(frame) for frame in (train_df, test_df))


In [27]:
# Encode room type on the training frame first; the helper returns the frame and
# the dummy columns it produced.
train_df, room_cols = add_room_type_dummies(train_df, dummy_cols=None)

# Re-use the training dummy columns for the test frame so both splits share the
# same encoded columns. The second return value is not needed afterwards; it is
# bound to a regular name rather than `_`, because assigning to `_` in a
# notebook rebinds IPython's "last output" variable.
test_df, test_room_cols = add_room_type_dummies(test_df, dummy_cols=room_cols)


In [28]:
# Add the regression target to the training frame only; the test frame is never
# passed through this helper. The next cell reads the result as "log_price".
# NOTE(review): the submission cell inverts predictions with np.expm1, so this
# presumably computes log1p(price) — confirm in src.features.
train_df = add_log_target(train_df)


In [29]:
# Keep only the training rows that have a target value.
train_model_df = train_df.dropna(subset=["log_price"]).copy()

# Feature list is derived from the training frame and reused for the test frame.
feature_cols = get_feature_columns(train_model_df)

X_train = train_model_df[feature_cols].copy()
y_train = train_model_df["log_price"].copy()

# reindex() below silently creates any feature column that is absent from
# test_df and fills it with 0. Report such columns so a broken feature pipeline
# is visible instead of being masked; nothing is printed when none are missing.
missing_in_test = [col for col in feature_cols if col not in test_df.columns]
if missing_in_test:
    print("Feature columns missing from test (filled with 0):", missing_in_test)

# Align the test frame to the training feature order.
X_test = test_df.reindex(columns=feature_cols, fill_value=0).copy()

# Holds by construction after the reindex; kept as a guard against later edits
# to how X_test is built.
assert list(X_train.columns) == list(X_test.columns)


In [30]:
# Impute on copies so the un-imputed matrices stay available.
X_train_filled, X_test_filled = X_train.copy(), X_test.copy()

num_cols = X_train_filled.select_dtypes(include=[np.number]).columns

# Each numeric column's median is taken from the training rows only and then
# used to fill the gaps in both the training and the test matrix.
for feature in num_cols:
    train_median = X_train_filled[feature].median()
    for frame in (X_train_filled, X_test_filled):
        frame[feature] = frame[feature].fillna(train_median)

# Sanity check: total count of remaining missing values per split.
for label, frame in (("NaNs train:", X_train_filled), ("NaNs test :", X_test_filled)):
    print(label, frame.isna().sum().sum())


NaNs train: 0
NaNs test : 0


In [31]:
# Baseline estimator and a shuffled 5-fold splitter, both seeded for repeatability.
model = Ridge(alpha=1.0, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer returns negated RMSE (higher is better), one value per fold.
# NOTE(review): the imputation medians were computed on the full training matrix
# before this split, so each validation fold has contributed to its own fill
# values.
scores = cross_val_score(
    estimator=model,
    X=X_train_filled,
    y=y_train,
    cv=cv,
    scoring="neg_root_mean_squared_error",
)

# Flip the sign back so the reported numbers are plain RMSE on the target scale.
fold_rmse = -scores
print("CV RMSE mean:", fold_rmse.mean())
print("CV RMSE std :", fold_rmse.std())


CV RMSE mean: 0.6072237425054208
CV RMSE std : 0.11471064747584236


In [32]:
# Refit on the full imputed training matrix (fit() returns the estimator itself)
# and predict the test rows; predictions are on the same scale as y_train.
test_pred_log = model.fit(X_train_filled, y_train).predict(X_test_filled)


In [33]:
# Build the submission frame. Predictions are on the log scale, so invert them
# with expm1 before writing.
sub = pd.DataFrame({
    "ID": test_df["id"],
    "TARGET": np.expm1(test_pred_log)
})

# expm1 of a negative log-prediction is negative; floor those at zero.
sub["TARGET"] = sub["TARGET"].clip(lower=0)

# Create the output folder if needed so a fresh checkout does not fail on the
# write below.
submission_path = "../submissions/submission_ridge.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path, index=False)

sub.head()


Unnamed: 0,ID,TARGET
0,536526,2310.152101
1,124137,4039.066545
2,164216,3555.400274
3,541629,3085.838446
4,572504,894.590576
