# Baseline Model

This notebook implements a baseline model for predicting Airbnb prices.

## Steps:
1. Load Data
2. Preprocessing (Clean price, Log transform, Impute, One-Hot Encoding)
3. Train Baseline Model (Ridge)
4. Evaluate (RMSE)
5. Generate Submission

In [1]:
import pandas as pd
import numpy as np

from src.features import (
    add_features, add_log_target, get_feature_columns,
    build_reviews_features, merge_reviews_features,
    build_calendar_features, merge_calendar_features,
    add_room_type_dummies
)

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score


In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df  = pd.read_csv("../data/test.csv")
reviews_df  = pd.read_csv("../data/reviews.csv")
calendar_df = pd.read_csv("../data/calendar.csv")

ref_date = train_df["last_scraped"].max()


In [3]:
reviews_feat = build_reviews_features(reviews_df, ref_date)
cal_feat     = build_calendar_features(calendar_df)


In [4]:
train_df = merge_reviews_features(train_df, reviews_feat)
test_df  = merge_reviews_features(test_df,  reviews_feat)

train_df = merge_calendar_features(train_df, cal_feat)
test_df  = merge_calendar_features(test_df,  cal_feat)


In [5]:
train_df = add_features(train_df)
test_df  = add_features(test_df)


In [6]:
train_df, room_cols = add_room_type_dummies(train_df, dummy_cols=None)
test_df, _          = add_room_type_dummies(test_df, dummy_cols=room_cols)


In [7]:
train_df = add_log_target(train_df)


In [8]:
train_model_df = train_df.dropna(subset=["log_price"]).copy()

feature_cols = get_feature_columns(train_model_df)

X_train = train_model_df[feature_cols].copy()
y_train = train_model_df["log_price"].copy()

X_test = test_df.reindex(columns=feature_cols, fill_value=0).copy()

assert list(X_train.columns) == list(X_test.columns)


In [9]:
X_train_filled = X_train.copy()
X_test_filled  = X_test.copy()

num_cols = X_train_filled.select_dtypes(include=[np.number]).columns

for col in num_cols:
    med = X_train_filled[col].median()
    X_train_filled[col] = X_train_filled[col].fillna(med)
    X_test_filled[col]  = X_test_filled[col].fillna(med)

print("NaNs train:", X_train_filled.isna().sum().sum())
print("NaNs test :", X_test_filled.isna().sum().sum())


NaNs train: 0
NaNs test : 0


In [11]:
model = Ridge(alpha=1.0, random_state=42)

cv = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    model,
    X_train_filled,
    y_train,
    cv=cv,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE mean:", (-scores).mean())
print("CV RMSE std :", (-scores).std())


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py", line 1254, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2919, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1314, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 940, in check_array
    array = array.astype(new_dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/generic.py", line 6665, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py", line 449, in astype
    return self.apply(
           ^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py", line 784, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ibrahimbancar/YZV311_2526_7/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py", line 133, in _astype_nansafe
    return arr.astype(dtype, copy=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Entire home/apt'


In [None]:
model.fit(X_train_filled, y_train)
test_pred_log = model.predict(X_test_filled)


In [None]:
sub = pd.DataFrame({
    "ID": test_df["id"],
    "TARGET": np.expm1(test_pred_log)
})

sub["TARGET"] = sub["TARGET"].clip(lower=0)
sub.to_csv("../submissions/submission_ridge.csv", index=False)

sub.head()
