# Baseline Model

This notebook implements a baseline model for predicting Airbnb prices.

## Steps:
1. Load Data
2. Preprocessing (clean price strings, log1p-transform the target, median-impute bedrooms/bathrooms, one-hot encode categoricals)
3. Train Baseline Model (Ridge)
4. Evaluate (RMSE on log-transformed price, using a 20% hold-out split)
5. Generate Submission

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import os

In [None]:
# Load the train and test splits (relative paths) into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Print (rows, columns) of each split as a quick sanity check
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

In [None]:
# Clean price column
def clean_price(price):
    """Convert a raw price value to a float.

    Strings have '$' and ',' removed (and surrounding whitespace stripped)
    before conversion, e.g. ``"$1,234.50"`` -> ``1234.5``.  A string that is
    empty once those characters are removed yields NaN instead of raising,
    so the row can be handled as a missing price.  Non-string values
    (numbers, NaN, None) are returned unchanged.

    Raises:
        ValueError: if a non-empty string still cannot be parsed as a float.
    """
    if not isinstance(price, str):
        return price

    cleaned = price.replace('$', '').replace(',', '').strip()
    if not cleaned:
        # float('') would raise; treat a blank price as missing instead
        return float('nan')
    return float(cleaned)

# Parse every raw price with clean_price
parsed_price = train_df['price'].apply(clean_price)

# Swap in the parsed prices, drop rows with missing price, then add the
# log1p-transformed target as a new 'log_price' column
train_df = (
    train_df.assign(price=parsed_price)
    .dropna(subset=['price'])
    .assign(log_price=lambda frame: np.log1p(frame['price']))
)

print("Price cleaned and log-transformed.")

In [None]:
# Fill gaps in the numeric columns with the median computed on the training
# data; the same value is applied to both train and test.
for col in ['bedrooms', 'bathrooms']:
    # Skip columns that the training data does not have
    if col not in train_df.columns:
        continue

    median_val = train_df[col].median()
    # The median is NaN when the column has no usable values; fall back to 0
    median_val = 0 if pd.isna(median_val) else median_val

    for frame in (train_df, test_df):
        frame[col] = frame[col].fillna(median_val)

print("Missing values imputed.")

In [None]:
# Columns used by the model
categorical_cols = ['room_type', 'neighbourhood_group_cleansed']
numerical_cols = ['bedrooms', 'bathrooms']

# One-hot encode train and test together so both end up with the same dummy
# columns; drop_first drops the first level of each categorical
all_data = pd.get_dummies(
    pd.concat([train_df, test_df], axis=0, sort=False),
    columns=categorical_cols,
    drop_first=True,
)

# Train rows were stacked first, so split the combined frame by position
n_train = len(train_df)
train_processed = all_data.iloc[:n_train]
test_processed = all_data.iloc[n_train:]

# Dummy columns are identified by the prefix get_dummies gives them
encoded_cols = [
    c
    for c in train_processed.columns
    if c.startswith(('room_type_', 'neighbourhood_group_cleansed_'))
]
feature_cols = numerical_cols + encoded_cols

# Feature matrices with any remaining NaNs replaced by 0, plus the log target
X = train_processed[feature_cols].fillna(0)
y = train_processed['log_price']
X_test = test_processed[feature_cols].fillna(0)

print(f"Training with {len(feature_cols)} features: {feature_cols}")

In [None]:
# Hold out 20% of the training rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the Ridge baseline; fit() returns the estimator itself
model = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the hold-out split: RMSE on the log_price target
y_pred_val = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE (log_price): {rmse}")

In [None]:
# Predict on test (the model outputs log1p-scale prices)
y_pred_test_log = model.predict(X_test)
# Invert the log1p transform. expm1 of a negative log-prediction is a
# negative price, which is invalid, so clip predictions at 0.
y_pred_test = np.clip(np.expm1(y_pred_test_log), 0, None)

# Save submission
submission = pd.DataFrame({'id': test_df['id'], 'price': y_pred_test})
# exist_ok=True creates the directory only when needed, without a separate
# (racy) os.path.exists check
os.makedirs('../submissions', exist_ok=True)
submission.to_csv('../submissions/baseline_submission.csv', index=False)
print("Submission saved to ../submissions/baseline_submission.csv")