# Advanced Model (XGBoost)

This notebook implements an advanced model using XGBoost with extensive feature engineering.

## Steps:
1. Load Data
2. Feature Engineering
    - Amenities (Count, Top items)
    - Text Features (TF-IDF on Name)
    - Bathrooms extraction
    - Categorical Encoding
3. Train XGBoost Model (CV)
4. Evaluate (RMSE)
5. Generate Submission

In [6]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl (2.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.2


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os

In [8]:
# Load data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (24153, 58)
Test shape: (4750, 56)


In [9]:
# Clean price
def clean_price(price):
    if isinstance(price, str):
        return float(price.replace('$', '').replace(',', ''))
    return price

train_df['price'] = train_df['price'].apply(clean_price)
train_df = train_df.dropna(subset=['price'])
train_df['log_price'] = np.log1p(train_df['price'])

# Combine for processing
train_len = len(train_df)
all_data = pd.concat([train_df.drop(['price', 'log_price'], axis=1), test_df], axis=0).reset_index(drop=True)

print("Data combined for processing.")

Data combined for processing.


In [10]:
# 1. Bathrooms
def extract_bathrooms(text):
    if pd.isna(text):
        return np.nan
    match = re.search(r'(\d+(\.\d+)?)', str(text))
    if match:
        return float(match.group(1))
    return np.nan

all_data['bathrooms_count'] = all_data['bathrooms_text'].apply(extract_bathrooms)
# Fill missing bathrooms with median
all_data['bathrooms_count'] = all_data['bathrooms_count'].fillna(all_data['bathrooms_count'].median())

# 2. Host Superhost
all_data['host_is_superhost'] = all_data['host_is_superhost'].map({'t': 1, 'f': 0}).fillna(0)

# 3. Amenities
all_data['amenities'] = all_data['amenities'].fillna('[]')
all_data['amenities_count'] = all_data['amenities'].apply(lambda x: len(str(x).split(',')))

# Top amenities
top_amenities = ['Wifi', 'Air conditioning', 'Kitchen', 'Heating', 'Washer', 'Dryer', 'Essentials', 
                 'Shampoo', 'Hangers', 'Iron', 'TV', 'Hot water', 'Hair dryer', 'Refrigerator', 
                 'Dishes and silverware', 'Cooking basics', 'Oven', 'Stove', 'Microwave', 'Coffee maker']

for amenity in top_amenities:
    all_data[f'amenity_{amenity}'] = all_data['amenities'].apply(lambda x: 1 if amenity in str(x) else 0)

print("Amenities processed.")

Amenities processed.


In [11]:
# 4. Text Features (TF-IDF)
print("Processing text features...")

# Initialize TF-IDF Vectorizer
# max_features=50: Keep only the top 50 most frequent words to avoid high dimensionality
# stop_words='english': Remove common English words (e.g., 'the', 'is') that don't add value
tfidf_name = TfidfVectorizer(max_features=50, stop_words='english')

# Fit and transform the 'name' column
# fillna('') handles missing values by replacing them with empty strings
name_features = tfidf_name.fit_transform(all_data['name'].fillna(''))

# Convert the resulting sparse matrix to a DataFrame
# Columns are named 'name_tfidf_0' to 'name_tfidf_49'
name_df = pd.DataFrame(name_features.toarray(), columns=[f'name_tfidf_{i}' for i in range(50)])

# Concatenate the new features with the original data
all_data = pd.concat([all_data, name_df], axis=1)

Processing text features...


In [12]:
# 5. Categorical Encoding
cat_cols = ['room_type', 'neighbourhood_group_cleansed', 'property_type']
# Simplify property type (keep top 10, others 'Other')
top_properties = all_data['property_type'].value_counts().head(10).index
all_data['property_type'] = all_data['property_type'].apply(lambda x: x if x in top_properties else 'Other')

all_data = pd.get_dummies(all_data, columns=cat_cols, drop_first=True)

# 6. Numerical Features
num_cols = ['accommodates', 'bedrooms', 'beds', 'minimum_nights', 'calculated_host_listings_count', 
            'review_scores_rating', 'bathrooms_count', 'amenities_count', 'host_is_superhost']

# Fill missing numericals
for col in num_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(all_data[col].median())

# Select features for model
feature_cols = num_cols + [c for c in all_data.columns if c.startswith('amenity_') or c.startswith('name_tfidf_') or c.startswith('room_type_') or c.startswith('neighbourhood_group_') or c.startswith('property_type_')]

print(f"Total features: {len(feature_cols)}")

Total features: 92


In [13]:
# Split back
X = all_data.iloc[:train_len][feature_cols]
y = train_df['log_price']
X_test = all_data.iloc[train_len:][feature_cols]

# Validation
print("Training XGBoost model...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse}")

print(f"Average RMSE: {np.mean(rmse_scores)}")

Training XGBoost model...
Fold 1 RMSE: 0.5475474477421488
Fold 2 RMSE: 0.5205557528544259
Fold 3 RMSE: 0.5621941390505845
Fold 4 RMSE: 0.5591110468535713
Fold 5 RMSE: 0.5298412096916764
Average RMSE: 0.5438499192384814


In [14]:
# Final prediction
print("Generating submission...")
final_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
final_model.fit(X, y)
y_pred_test_log = final_model.predict(X_test)
y_pred_test = np.expm1(y_pred_test_log)

submission = pd.DataFrame({'ID': test_df['id'], 'TARGET': y_pred_test})
if not os.path.exists('../submissions'):
    os.makedirs('../submissions')
submission.to_csv('../submissions/advanced_submission.csv', index=False)
print("Submission saved to ../submissions/advanced_submission.csv")

Generating submission...
Submission saved to ../submissions/advanced_submission.csv
