## Instructions:

- Put the parts of your code under the corresponding sections. (0.25/2 points will be taken off for not doing this.)
- Do not include any redundant/irrelevant code, text or comments. (0.5/2 points will be taken off for not doing this.)
- **Your code must run without any errors or runtime issues.** (Failure to meet this condition will result in a 0.)
- **Your code must return your Public Leaderboard score.** (Failure to meet this condition will result in a 0.)
- **Submit both your ipynb and your html file for grading purposes.**

## 1) Libraries

Put all the Python libraries and tools you imported here.

In [25]:
import numpy as np
import pandas as pd

# sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

## 2) Data

- This section is required to include the code that reads, cleans and preprocesses the datasets.
- Note that both the training and test datasets should undergo the same sequence of operations.

In [29]:
# datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [30]:
# clean price
train["price"] = train["price"].replace('[$/,]', '', regex=True).astype(float)

test_ids = test["id"]
X = train.drop(columns=["price", "id"])
X_test = test.drop(columns=["id"])

# merge
all_data = pd.concat([X, X_test], axis=0)

In [31]:
# extract first digits and converts to float
train['bathrooms'] = train['bathrooms_text'].str.extract(r'(\d+\.?\d*)').astype(float)

# create bools for important amenities and creates number of amenities column
train['n_amenities'] = train['amenities'].apply(lambda x: len(eval(x)) if pd.notnull(x) else 0)
train['has_wifi'] = train['amenities'].str.contains('wifi', case=False, na=False).astype(int)
train['has_kitchen'] = train['amenities'].str.contains('kitchen', case=False, na=False).astype(int)
train['has_ac'] = train['amenities'].str.contains('air conditioning', case=False, na=False).astype(int)

In [32]:
# change obs and bools to strings
for feature in all_data.select_dtypes(include=["object", "bool"]).columns:
    all_data[feature] = all_data[feature].astype(str)

# clean numeric strings
for feature in all_data.columns:
    if all_data[feature].dtype == "object":
        cleaned = (
            all_data[feature]
            .str.replace("$", "", regex=False)
            .str.replace(",", "", regex=False)
            .str.replace("%", "", regex=False)
        )
        try:
            all_data[feature] = pd.to_numeric(cleaned)
        except ValueError:
            all_data[feature] = cleaned

if "description" in all_data.columns:
    desc_lengths = all_data["description"].fillna("").map(len)
    all_data["desc_length"] = desc_lengths
    all_data.drop("description", axis=1, inplace=True)

# host_about
if "host_about" in all_data.columns:
    about_lengths = all_data["host_about"].fillna("").map(len)
    all_data["about_length"] = about_lengths
    all_data.drop("host_about", axis=1, inplace=True)

# host_since
if "host_since" in all_data.columns:
    ref_date = pd.to_datetime("2025-06-01")
    parsed_host_dates = pd.to_datetime(all_data["host_since"], errors="coerce")
    all_data["host_days_active"] = (ref_date - parsed_host_dates).dt.days
    all_data.drop("host_since", axis=1, inplace=True)

In [33]:
# time
if {"first_review", "last_review"}.issubset(all_data.columns):
    first = pd.to_datetime(all_data["first_review"], errors="coerce")
    last = pd.to_datetime(all_data["last_review"], errors="coerce")
    all_data["review_span"] = (last - first).dt.days
    all_data["days_since_last_review"] = (pd.to_datetime("2025-06-01") - last).dt.days
    all_data.drop(["first_review", "last_review"], axis=1, inplace=True)

# missing reveiw
score_columns = [c for c in all_data.columns if c.startswith("review_scores")]
all_data["missing_review_scores"] = all_data[score_columns].isnull().sum(axis=1)
numeric_columns = all_data.select_dtypes(include=["number"]).columns
skew_vals = all_data[numeric_columns].skew().abs()
to_transform = skew_vals[skew_vals > 1].index
all_data[to_transform] = np.log1p(all_data[to_transform])

y = train["price"]
y_log = np.log1p(y)

# split train and test
X = all_data.iloc[:len(y_log), :]
X_test = all_data.iloc[len(y_log):, :]

## 3) Machine Learning Model

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

import os
import re
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

os.environ["OMP_NUM_THREADS"] = "1"
n_jobs = int(os.getenv("SLURM_NPROCS", 1)) if "SLURM_NPROCS" in os.environ else -1

In [35]:
# num and cat columns
num_cols = X.select_dtypes(include=["number"]).columns.to_list()
cat_cols = X.select_dtypes(include="object").columns.to_list()

num_pipeline = Pipeline([
    ("fillna_num", SimpleImputer(strategy="median")), ("normalize", StandardScaler())])
cat_pipeline = Pipeline([
    ("fillna_cat", SimpleImputer(strategy="most_frequent")), ("encode", OneHotEncoder(handle_unknown="ignore"))])
feature_processor = ColumnTransformer([("numerics", num_pipeline, num_cols),
                                       ("categories", cat_pipeline, cat_cols)])
reg = XGBRegressor(objective="reg:squarederror", n_jobs=n_jobs, verbosity=0)

In [36]:
# pipeline
full_pipeline = Pipeline([("features", feature_processor), ("regressor", reg)])

# hyperparam grid
tuning_grid = {
    "regressor__n_estimators": [675, 725],
    "regressor__max_depth": [7, 8],
    "regressor__learning_rate": [0.1],
    "regressor__subsample": [0.8, 0.9],
    "regressor__colsample_bytree": [0.8, 1],
    "regressor__reg_lambda": [1]
}

X_train, X_holdout, y_train_log, y_holdout_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [37]:
# gridsearchCV
search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=tuning_grid,
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_jobs=n_jobs
)
search.fit(X_train, y_train_log)

# best pipeline on HO
best_model = search.best_estimator_
holdout_log_preds = best_model.predict(X_holdout)
holdout_preds = np.expm1(holdout_log_preds)
actual_holdout = np.expm1(y_holdout_log)
mae_score = mean_absolute_error(actual_holdout, holdout_preds)
holdout_results = []

for setting in search.cv_results_["params"]:
    temp_model = search.estimator.set_params(**setting)
    temp_model.fit(X_train, y_train_log)
    preds_log = temp_model.predict(X_holdout)
    preds = np.expm1(preds_log)
    temp_mae = mean_absolute_error(actual_holdout, preds)
    holdout_results.append((temp_mae, setting))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [38]:
# best combo
lowest_mae, top_config = min(holdout_results, key=lambda x: x[0])
print("best mae:", lowest_mae)

# final model
final_model = search.estimator.set_params(**top_config)
final_model.fit(X, y_log)

best mae: 102.25281480470956


## 4) Exporting the Predictions

In [40]:
# export
y_pred_log = final_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)

submission = pd.DataFrame({
    "id": test_ids,
    "predicted": y_pred
})
submission.to_csv("final_submission.csv", index=False)

print('My Leaderboard Score is 93.33')

My Leaderboard Score is 93.33
