## 1. Data Preparation

### 1.1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import time

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV , KFold
from sklearn.metrics import mean_squared_error 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge , ElasticNet , Lasso
from sklearn.ensemble import RandomForestRegressor  , AdaBoostRegressor 
from sklearn.preprocessing import MinMaxScaler

- Import data, check for comlumns and variables
### 1.2. Load and Clean Data
Load the dataset, inspect its structure, and perform initial cleaning. We drop the original 'Price' column (in Toman) and use 'Price(USD)' for our analysis, renaming it to 'Price'.

In [2]:
houses = pd.read_csv("./data/housePrice.csv")
houses = houses.drop('Price', axis=1)
houses = houses.rename(columns={'Price(USD)': 'Price'})
houses.info()
houses.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3474 entries, 0 to 3473
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3474 non-null   object 
 1   Room       3474 non-null   int64  
 2   Parking    3474 non-null   bool   
 3   Warehouse  3474 non-null   bool   
 4   Elevator   3474 non-null   bool   
 5   Address    3451 non-null   object 
 6   Price      3474 non-null   float64
dtypes: bool(3), float64(1), int64(1), object(2)
memory usage: 118.9+ KB


Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
0,420,4,True,True,True,Zaferanieh,3080000.0
1,705,5,True,True,False,Abazar,3033333.33
2,400,5,True,True,False,Lavasan,2833333.33
3,680,5,True,True,False,Ekhtiarieh,2720000.0
4,350,4,True,True,True,Niavaran,2683333.33


- Since the Address column has some missing data in certain rows, we will drop these rows.

In [3]:
houses = houses.dropna(subset=['Address'])

- Check the header again

In [4]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3451 entries, 0 to 3473
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3451 non-null   object 
 1   Room       3451 non-null   int64  
 2   Parking    3451 non-null   bool   
 3   Warehouse  3451 non-null   bool   
 4   Elevator   3451 non-null   bool   
 5   Address    3451 non-null   object 
 6   Price      3451 non-null   float64
dtypes: bool(3), float64(1), int64(1), object(2)
memory usage: 144.9+ KB


- Check the correlation between variables and the price variable before converting them into categories (actually this is feature engineering xD)

In [5]:
# houses['Price'] = np.log1p(houses['Price'])
# houses['Area'] = np.log1p(pd.to_numeric(houses['Area'], errors='coerce'))
houses['Area'] = pd.to_numeric(houses['Area'], errors='coerce')
houses = houses.dropna(subset=['Area'])
houses['Room'] = houses['Room'].astype('int64')
houses['Area_per_Room'] = houses['Area'] / (houses['Room']+1)
houses['Area_per_Room'] = houses['Area_per_Room'].replace([np.inf, -np.inf], np.nan)
address_avg_price = houses.groupby('Address')['Price'].mean()
# houses['Avg_Price_by_Address'] = houses['Address'].map(address_avg_price)
houses['Price_per_m2'] = houses['Price'] / houses['Area']
address_avg_price_per_m2 = houses.groupby('Address')['Price_per_m2'].mean()
houses['Avg_Price_per_m2_by_Address'] = houses['Address'].map(address_avg_price_per_m2)
houses = houses.drop('Price_per_m2', axis=1)

# houses['Parking_Warehouse'] = (houses['Parking'] == True) & (houses['Warehouse'] == True)
# houses['Parking_Elevator'] = (houses['Parking'] == True) & (houses['Elevator'] == True)
# houses['Elevator_Warehouse'] = (houses['Elevator'] == True) & (houses['Warehouse'] == True)
# houses['All_Amenities'] = (houses['Parking'] == True) & (houses['Warehouse'] == True) & (houses['Elevator'] == True)

numeric_houses = houses.select_dtypes(include=[np.number, bool]) 
corr_matrix = numeric_houses.corr()
corr_matrix["Price"].sort_values(ascending=False)

Price                          1.000000
Area                           0.722864
Avg_Price_per_m2_by_Address    0.656375
Room                           0.567802
Area_per_Room                  0.522454
Parking                        0.190935
Elevator                       0.111443
Warehouse                      0.109224
Name: Price, dtype: float64

- That said, the data is now complete. HOWEVER, we have not yet categorized the data into the two main types: Numerical and Categorical. Now, we will do that.

In [6]:
# Convert specified columns to categorical data type
houses['Room'] = houses['Room'].astype('category')
houses['Parking'] = houses['Parking'].astype('category')
houses['Warehouse'] = houses['Warehouse'].astype('category')
houses['Elevator'] = houses['Elevator'].astype('category')
# Verify the changes
print("Data types after conversion:")
print(houses.dtypes)
houses.info()

Data types after conversion:
Area                            float64
Room                           category
Parking                        category
Warehouse                      category
Elevator                       category
Address                          object
Price                           float64
Area_per_Room                   float64
Avg_Price_per_m2_by_Address     float64
dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 3449 entries, 0 to 3473
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Area                         3449 non-null   float64 
 1   Room                         3449 non-null   category
 2   Parking                      3449 non-null   category
 3   Warehouse                    3449 non-null   category
 4   Elevator                     3449 non-null   category
 5   Address                      3449 non-null   object  
 6   Price              

### 1.3. Outlier Removal
To prevent extreme values from skewing the analysis, we remove the top and bottom 1% of properties based on price.

In [7]:
# Calculate 1st and 99th percentiles
lower_percentile = houses['Price'].quantile(0.01)
upper_percentile = houses['Price'].quantile(0.95) # !!

price_min = lower_percentile
price_max = upper_percentile

print(price_max, price_min)

print(f"1st percentile (1% lowest): ${lower_percentile:.2f}")
print(f"99th percentile (1% highest): ${upper_percentile:.2f}")

# Remove outliers
houses_before = len(houses)
houses = houses[(houses['Price'] >= lower_percentile) & (houses['Price'] <= upper_percentile)]
houses_after = len(houses)

print(f"\nDataset size before removing outliers: {houses_before}")
print(f"Dataset size after removing outliers: {houses_after}")
print(f"Removed {houses_before - houses_after} outliers ({((houses_before - houses_after) / houses_before * 100):.1f}%)")

595626.667999999 9865.3316
1st percentile (1% lowest): $9865.33
99th percentile (1% highest): $595626.67

Dataset size before removing outliers: 3449
Dataset size after removing outliers: 3241
Removed 208 outliers (6.0%)


## 2. Data Preparation


- first 10 rows

In [8]:
houses.head(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Area_per_Room,Avg_Price_per_m2_by_Address
173,188.0,3,True,True,True,Ozgol,589066.67,47.0,2004.03038
174,165.0,3,True,True,True,Niavaran,588500.0,41.25,3426.020637
175,160.0,3,True,True,True,Farmanieh,586666.67,40.0,3410.992898
176,135.0,2,True,True,True,Zaferanieh,583333.33,45.0,3790.462594
177,160.0,3,True,True,True,Niavaran,583333.33,40.0,3426.020637
178,175.0,3,True,True,True,Elahieh,583333.33,43.75,3740.419497
179,195.0,3,True,True,True,Farmanieh,576000.0,48.75,3410.992898
180,138.0,2,True,True,True,Zaferanieh,575000.0,46.0,3790.462594
181,165.0,3,True,True,True,Niavaran,573333.33,41.25,3426.020637
182,155.0,3,True,True,True,Niavaran,568333.33,38.75,3426.020637


- This part was already carried out in quite detail in the file `houses_price.ipynb`; you can refer to that notebook for more information.

- One-hot encoding

In [9]:
# One-hot encoding for Address column
address_encoded = pd.get_dummies(houses['Address'], prefix='Address')

# Concatenate the one-hot encoded columns with the original dataframe
houses_encoded = pd.concat([houses, address_encoded], axis=1)

# Drop the original Address column since we now have encoded versions
houses_encoded = houses_encoded.drop('Address', axis=1)

print(f"Original dataset shape: {houses.shape}")
print(f"Dataset shape after one-hot encoding: {houses_encoded.shape}")
print(f"Added {address_encoded.shape[1]} new columns for Address categories")

# Display the new column names
print(f"\nNew Address columns created:")
print(list(address_encoded.columns))

# Update our main dataframe
houses = houses_encoded
houses.head()

Original dataset shape: (3241, 9)
Dataset shape after one-hot encoding: (3241, 197)
Added 189 new columns for Address categories

New Address columns created:
['Address_Abazar', 'Address_Abbasabad', 'Address_Absard', 'Address_Abuzar', 'Address_Afsarieh', 'Address_Ahang', 'Address_Air force', 'Address_Ajudaniye', 'Address_Alborz Complex', 'Address_Aliabad South', 'Address_Amir Bahador', 'Address_Amirabad', 'Address_Amirieh', 'Address_Andisheh', 'Address_Aqdasieh', 'Address_Araj', 'Address_Argentina', 'Address_Atabak', 'Address_Azadshahr', 'Address_Azarbaijan', 'Address_Azari', 'Address_Baghestan', 'Address_Bahar', 'Address_Baqershahr', 'Address_Beryanak', 'Address_Boloorsazi', 'Address_Central Janatabad', 'Address_Chahardangeh', 'Address_Chardangeh', 'Address_Chardivari', 'Address_Chidz', 'Address_Damavand', 'Address_Darabad', 'Address_Darakeh', 'Address_Darband', 'Address_Daryan No', 'Address_Dehkade Olampic', 'Address_Dezashib', 'Address_Dolatabad', 'Address_Dorous', 'Address_East Fer

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Area_per_Room,Avg_Price_per_m2_by_Address,Address_Abazar,Address_Abbasabad,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
173,188.0,3,True,True,True,589066.67,47.0,2004.03038,False,False,...,False,False,False,False,False,False,False,False,False,False
174,165.0,3,True,True,True,588500.0,41.25,3426.020637,False,False,...,False,False,False,False,False,False,False,False,False,False
175,160.0,3,True,True,True,586666.67,40.0,3410.992898,False,False,...,False,False,False,False,False,False,False,False,False,False
176,135.0,2,True,True,True,583333.33,45.0,3790.462594,False,False,...,False,False,False,False,False,False,False,True,False,False
177,160.0,3,True,True,True,583333.33,40.0,3426.020637,False,False,...,False,False,False,False,False,False,False,False,False,False


- Transform cols

In [10]:
scaler = MinMaxScaler()
houses[houses.columns] = scaler.fit_transform(houses)

houses.head(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Area_per_Room,Avg_Price_per_m2_by_Address,Address_Abazar,Address_Abbasabad,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
173,0.181609,0.6,1.0,1.0,1.0,1.0,0.123786,0.452699,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174,0.155172,0.6,1.0,1.0,1.0,0.999022,0.102852,0.799646,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,0.149425,0.6,1.0,1.0,1.0,0.995856,0.098301,0.795979,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,0.12069,0.4,1.0,1.0,1.0,0.990101,0.116505,0.888565,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
177,0.149425,0.6,1.0,1.0,1.0,0.990101,0.098301,0.799646,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,0.166667,0.6,1.0,1.0,1.0,0.990101,0.111954,0.876355,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.189655,0.6,1.0,1.0,1.0,0.977439,0.130158,0.795979,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.124138,0.4,1.0,1.0,1.0,0.975712,0.120146,0.888565,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
181,0.155172,0.6,1.0,1.0,1.0,0.972835,0.102852,0.799646,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.143678,0.6,1.0,1.0,1.0,0.964201,0.09375,0.799646,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Model training

- Split into test_set and train_set

In [11]:
X = houses.drop('Price', axis=1)  
y = houses['Price']              
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns:")
print(list(X.columns))


X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,     
    random_state=42     
)

print(f"\nTraining set:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

print(f"\nTesting set:")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

print(f"\nData split summary:")
print(f"Total samples: {len(houses)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(houses)*100:.1f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/len(houses)*100:.1f}%)")

Features shape: (3241, 196)
Target shape: (3241,)

Feature columns:
['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Area_per_Room', 'Avg_Price_per_m2_by_Address', 'Address_Abazar', 'Address_Abbasabad', 'Address_Absard', 'Address_Abuzar', 'Address_Afsarieh', 'Address_Ahang', 'Address_Air force', 'Address_Ajudaniye', 'Address_Alborz Complex', 'Address_Aliabad South', 'Address_Amir Bahador', 'Address_Amirabad', 'Address_Amirieh', 'Address_Andisheh', 'Address_Aqdasieh', 'Address_Araj', 'Address_Argentina', 'Address_Atabak', 'Address_Azadshahr', 'Address_Azarbaijan', 'Address_Azari', 'Address_Baghestan', 'Address_Bahar', 'Address_Baqershahr', 'Address_Beryanak', 'Address_Boloorsazi', 'Address_Central Janatabad', 'Address_Chahardangeh', 'Address_Chardangeh', 'Address_Chardivari', 'Address_Chidz', 'Address_Damavand', 'Address_Darabad', 'Address_Darakeh', 'Address_Darband', 'Address_Daryan No', 'Address_Dehkade Olampic', 'Address_Dezashib', 'Address_Dolatabad', 'Address_Dorous', 'Address

- try with models

- just convert function

In [12]:

def convert_scaled_to_original_price(scaled_value, price_min=None, price_max=None):
    """
    Convert scaled price value back to original USD price
    
    Parameters:
    scaled_value: float (0-1) - giá trị đã được MinMaxScaler
    price_min: float - giá trị min của Price sau khi log1p 
    price_max: float - giá trị max của Price sau khi log1p 
    
    Returns:
    float - giá trị USD gốc
    """
    
    # Bước 1: Unscale từ [0,1] về log scale
    log_price = scaled_value * (price_max - price_min) + price_min
    
    # Bước 2: Inverse log transform (expm1 là ngược lại của log1p)
    # original_price = np.expm1(log_price)
    original_price = log_price
    
    return original_price


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import pandas as pd

models = {
    'Ridge Regression': Ridge(random_state=42),
    'Lasso Regression': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'K-Neighbors': KNeighborsRegressor(),
    'AdaBoost': AdaBoostRegressor(random_state=42)
}

# Store results
results = {}

print("Running K-Fold Cross Validation (k=10) for each model...")
print("=" * 60)

# Test each model with k-fold cross validation
for model_name, model in models.items():
    print(f"\nTesting {model_name}...")
    
    # Perform 10-fold cross validation
    # Using neg_root_mean_squared_error (negative RMSE)
    cv_scores = -cross_val_score(
        model, X_train, y_train,
        scoring="neg_root_mean_squared_error",
        cv=10,
        n_jobs=-1  # Use all available processors
    )
    
    # Store results
    results[model_name] = cv_scores
    
    # Display statistics
    score_stats = pd.Series(cv_scores).describe()
    print(f"Cross-validation RMSE scores for {model_name}:")
    print(score_stats)
    print(f"Mean RMSE: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.6f})")
    
    # Convert to original USD
    mean_rmse_usd = convert_scaled_to_original_price(cv_scores.mean(), price_min, price_max)
    print(f"Mean RMSE in original USD: ${mean_rmse_usd:,.2f}")

print("\n" + "=" * 60)
print("SUMMARY OF ALL MODELS")
print("=" * 60)

# Create summary comparison
summary_data = []
for model_name, scores in results.items():
    mean_rmse = scores.mean()
    mean_rmse_usd = convert_scaled_to_original_price(mean_rmse, price_min, price_max)
    
    summary_data.append({
        'Model': model_name,
        'Mean RMSE': mean_rmse,
        'Mean RMSE (USD)': mean_rmse_usd,
        'Std RMSE': scores.std(),
        'Min RMSE': scores.min(),
        'Max RMSE': scores.max()
    })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('Mean RMSE')

# Format the USD column for better display
summary_df_display = summary_df.copy()
summary_df_display['Mean RMSE (USD)'] = summary_df_display['Mean RMSE (USD)'].apply(lambda x: f"${x:,.2f}")

print(summary_df_display[['Model', 'Mean RMSE', 'Mean RMSE (USD)', 'Std RMSE']].to_string(index=False))

# Find the best model
best_model_name = summary_df.iloc[0]['Model']
best_rmse = summary_df.iloc[0]['Mean RMSE']
best_rmse_usd = summary_df.iloc[0]['Mean RMSE (USD)']

print(f"\nBest performing model: {best_model_name}")
print(f"Best RMSE: {best_rmse:.6f}")
print(f"Best RMSE in original USD: ${best_rmse_usd:,.2f}")

Running K-Fold Cross Validation (k=10) for each model...

Testing Ridge Regression...
Cross-validation RMSE scores for Ridge Regression:
count    10.000000
mean      0.093487
std       0.013143
min       0.070023
25%       0.086731
50%       0.094349
75%       0.100478
max       0.115486
dtype: float64
Mean RMSE: 0.093487 (+/- 0.024938)
Mean RMSE in original USD: $64,626.45

Testing Lasso Regression...
Cross-validation RMSE scores for Lasso Regression:
count    10.000000
mean      0.204120
std       0.016059
min       0.173553
25%       0.200680
50%       0.205945
75%       0.208352
max       0.235700
dtype: float64
Mean RMSE: 0.204120 (+/- 0.030470)
Mean RMSE in original USD: $129,430.72

Testing ElasticNet...
Cross-validation RMSE scores for ElasticNet:
count    10.000000
mean      0.204120
std       0.016059
min       0.173553
25%       0.200680
50%       0.205945
75%       0.208352
max       0.235700
dtype: float64
Mean RMSE: 0.204120 (+/- 0.030470)
Mean RMSE in original USD: $129,

In [14]:
best_model = RandomForestRegressor(random_state=42)
best_model.fit(X_train, y_train)

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

rmse_train_usd = convert_scaled_to_original_price(rmse_train, price_min, price_max)
rmse_test_usd = convert_scaled_to_original_price(rmse_test, price_min, price_max)

print("Random Forest Model Performance:")
print("=" * 50)
print(f"Training Set:")
print(f"  R² Score: {r2_train:.6f}")
print(f"  RMSE: {rmse_train:.6f}")
print(f"  RMSE (USD): ${rmse_train_usd:,.2f}")

print(f"\nTest Set:")
print(f"  R² Score: {r2_test:.6f}")
print(f"  RMSE: {rmse_test:.6f}")
print(f"  RMSE (USD): ${rmse_test_usd:,.2f}")

print(f"\nModel Comparison:")
print(f"  Difference in R² (Train - Test): {r2_train - r2_test:.6f}")
print(f"  Difference in RMSE (Test - Train): {rmse_test - rmse_train:.6f}")

if r2_train - r2_test > 0.1:
    print("Potential overfitting detected (R² difference > 0.1)")
else:
    print("Good generalization performance")

Random Forest Model Performance:
Training Set:
  R² Score: 0.972887
  RMSE: 0.033687
  RMSE (USD): $29,597.59

Test Set:
  R² Score: 0.888211
  RMSE: 0.074093
  RMSE (USD): $53,266.27

Model Comparison:
  Difference in R² (Train - Test): 0.084676
  Difference in RMSE (Test - Train): 0.040407
Good generalization performance


In [16]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for RandomizedSearchCV
param_distribs = {
    'n_estimators': randint(low=50, high=300),
    'max_features': ['sqrt', 'log2', None],
    'max_depth': randint(low=3, high=20),
    'min_samples_split': randint(low=2, high=20),
    'min_samples_leaf': randint(low=1, high=10),
    'bootstrap': [True, False]
}

# Create RandomizedSearchCV
rnd_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_distribs,
    n_iter=100,  # Number of parameter settings sampled
    cv=5,        # 5-fold cross validation
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1    # Use all available processors
)

print("Starting Randomized Search CV...")
start_time = time.time()

# Fit the randomized search
rnd_search.fit(X_train, y_train)

end_time = time.time()
print(f"Randomized Search completed in {end_time - start_time:.2f} seconds")

# Get the best parameters and score
print("\nBest parameters found:")
print(rnd_search.best_params_)
print(f"\nBest cross-validation RMSE: {-rnd_search.best_score_:.6f}")
print(f"Best RMSE in USD: ${convert_scaled_to_original_price(-rnd_search.best_score_, price_min, price_max):,.2f}")

# Train the best model on full training set and evaluate
best_rf_model = rnd_search.best_estimator_
y_train_pred_best = best_rf_model.predict(X_train)
y_test_pred_best = best_rf_model.predict(X_test)

# Calculate performance metrics
r2_train_best = r2_score(y_train, y_train_pred_best)
r2_test_best = r2_score(y_test, y_test_pred_best)
rmse_train_best = np.sqrt(mean_squared_error(y_train, y_train_pred_best))
rmse_test_best = np.sqrt(mean_squared_error(y_test, y_test_pred_best))

print(f"\nOptimized Random Forest Performance:")
print(f"Training R²: {r2_train_best:.6f}")
print(f"Test R²: {r2_test_best:.6f}")
print(f"Training RMSE: {rmse_train_best:.6f} (${convert_scaled_to_original_price(rmse_train_best, price_min, price_max):,.2f})")
print(f"Test RMSE: {rmse_test_best:.6f} (${convert_scaled_to_original_price(rmse_test_best, price_min, price_max):,.2f})")

# Compare with original model
print(f"\nComparison with original Random Forest:")
print(f"Original Test RMSE: {rmse_test:.6f} vs Optimized: {rmse_test_best:.6f}")
print(f"Improvement: {((rmse_test - rmse_test_best) / rmse_test * 100):.2f}%")

Starting Randomized Search CV...
Randomized Search completed in 114.33 seconds

Best parameters found:
{'bootstrap': True, 'max_depth': 18, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 15, 'n_estimators': 207}

Best cross-validation RMSE: 0.066953
Best RMSE in USD: $49,083.56

Optimized Random Forest Performance:
Training R²: 0.934371
Test R²: 0.900425
Training RMSE: 0.052410 ($40,565.24)
Test RMSE: 0.069928 ($50,826.65)

Comparison with original Random Forest:
Original Test RMSE: 0.074093 vs Optimized: 0.069928
Improvement: 5.62%
