## 1. Imports


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import streamlit as st
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

## 2. Load Dataset


In [10]:
# Update path if needed
df = pd.read_csv('C:\\Users\\LEGION\\Downloads\\hackathon_ribo\\hackathon\\notebooks\\Data_Handeling\\transformed_data\\final_data_with_changes.csv')
df.head()

Unnamed: 0,price_total,room_type,is_shared_room,is_private_room,max_guests,is_superhost,is_multi_listing,is_business_listing,cleanliness_score,guest_satisfaction_score,...,Safety_Index,Monthly_Average_Net_salary,Meal_at_Inexpensive_Restaurant,Taxi_price_per_Km,Monthly_Basic_Utilities,Monthly_Rent_One_Bedroom_CC,Monthly_Rent_One_Bedroom_OCC,Monthly_Rent_Three_Bedroom_CC,Monthly_Rent_Three_Bedroom_OCC,proximity_index
0,93.788179,Private room,0,1,2.0,1,1,0,10.0,93.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,0.225448
1,822.169053,Private room,0,1,4.0,1,0,0,8.0,85.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.113955
2,104.351833,Private room,0,1,2.0,1,0,1,9.0,87.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,0.184291
3,1031.986037,Private room,0,1,4.0,1,0,1,9.0,90.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.110275
4,1078.983564,Private room,0,1,2.0,0,0,0,10.0,98.0,...,74.3,4502.1,20.5,3.0,262.0,2178.68,1701.26,3725.5,2785.29,1.036462


## 3. Feature Engineering


In [11]:
df['guests_per_room'] = df['max_guests'] / (df['num_bedrooms'] + 1)
df['salary_distance'] = df['Monthly_Average_Net_salary'] * df['distance_city_center']

# Log-transform target
df['price_log'] = np.log1p(df['price_total'])

## 4. Train-Test Split


In [12]:
X = df.drop(['price_total', 'price_log'], axis=1)
y = df['price_log']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## 5. Preprocessing Pipeline


In [13]:
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features)
])

## 6. Model & Hyperparameter Tuning


In [14]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        objective='reg:squarederror',
        random_state=42
    ))
])

param_grid = {
    'regressor__n_estimators': [200, 400, 600],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.03, 0.05, 0.1],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0]
}

search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

## 7. Evaluation


In [15]:
y_pred_log = best_model.predict(X_test)
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print('R2:', r2_score(y_test, y_pred_log))
print('MAE:', mean_absolute_error(y_test_actual, y_pred_actual))
print('RMSE:', np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)))

R2: 0.9020281472529307
MAE: 76.37331789460879
RMSE: 291.3559798130721


In [16]:
joblib.dump(search.best_estimator_, 'listing_model.pkl')


['listing_model.pkl']

## Conclusion

- Feature engineering improved signal
- Log target stabilized training
- Tuned XGBoost achieved best performance
- Pipeline prevents data leakage
