In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# STEP 1: Load Dataset
url = 'https://raw.githubusercontent.com/ppitchaporn/DADS5001-Condo/main/data_cleaned.csv'
df = pd.read_csv(url)

# Strip any comma separators from the price text (e.g. "12,000") before casting to float.
df['rent_cd_price'] = df['rent_cd_price'].astype(str).str.replace(',', '', regex=False).astype(float)

# Keep only listings priced at or below 100,000. The explicit .copy() makes `df` an
# independent frame, so adding a column below does not trigger SettingWithCopyWarning.
df = df[df['rent_cd_price'] <= 100000].copy()

# Create derived column: rent price per square metre of floor area.
# A floor area of 0 would yield +/-inf, which dropna() does not remove and which the
# scaler later rejects, so map infinities to NaN and let dropna() discard those rows.
df['price_per_sqm'] = (
    df['rent_cd_price'] / df['rent_cd_floorarea']
).replace([np.inf, -np.inf], np.nan)

# Drop every row with a missing value in any column (non-inplace so the cell is re-runnable).
df = df.dropna()

# RandomForest

In [62]:
# STEP 2: Feature Selection
# NOTE(review): 'price_per_sqm' is computed as rent_cd_price / rent_cd_floorarea, and
# 'rent_cd_floorarea' is also a feature, so the target can be reconstructed from the
# inputs (target leakage). The reported scores are likely optimistic -- confirm whether
# this feature should really be available at prediction time.
features = [
    'rent_cd_bed',
    'rent_cd_bath',
    'rent_cd_floorarea',
    'near_rail_meter',
    'price_per_sqm',
]
X, y = df[features], df['rent_cd_price']

# STEP 3: Split data BEFORE standardization (80% train / 20% test, fixed seed)
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# STEP 4: Fit scaler ONLY on training set, then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [68]:
# STEP 5: Define model and parameter grid
# The constructor values below are only the starting point; GridSearchCV overrides
# n_estimators, max_depth and min_samples_split with each grid combination.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
param_grid = dict(
    n_estimators=[5, 10, 50],
    max_depth=[None, 5, 8, 10, 20],
    min_samples_split=[2, 5],
)

# STEP 6: GridSearchCV -- 3-fold cross-validation over every combination, ranked by R²
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_scaled, y_train)

# STEP 7: Best model and evaluation on the held-out test split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n📊 Model Evaluation (Best Estimator):")
print(f"Best Parameters: {grid.best_params_}")
print(f"MAE: {mae:,.2f} THB")
print(f"RMSE: {rmse:,.2f} THB")
print(f"R² Score: {r2:.3f}")

Fitting 3 folds for each of 30 candidates, totalling 90 fits

📊 Model Evaluation (Best Estimator):
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
MAE: 398.50 THB
RMSE: 1,178.41 THB
R² Score: 0.989


In [69]:
# STEP 8: User input
def _ask_number(prompt, cast):
    """Prompt until the reply can be parsed by `cast`, then return the parsed value.

    Args:
        prompt: Text shown to the user.
        cast: Callable (e.g. int or float) that converts the typed text and raises
            ValueError on invalid input.

    Returns:
        The value produced by `cast`.
    """
    while True:
        try:
            return cast(input(prompt))
        except ValueError:
            # Re-ask instead of continuing: otherwise the user_* variables would be left
            # undefined (or stale from an earlier run) and later cells would misbehave.
            print("Please enter valid inputs.")


user_bed = _ask_number("Enter number of bedrooms: ", int)
user_bath = _ask_number("Enter number of bathrooms: ", int)
user_floorarea = _ask_number("Enter floor area (m²): ", int)
user_near_rail = _ask_number("Max distance to rail station (meters): ", float)
user_price_per_sqm = _ask_number("Enter estimated price per sqm (THB/m²): ", float)

Enter number of bedrooms: 1
Enter number of bathrooms: 1
Enter floor area (m²): 30
Max distance to rail station (meters): 800
Enter estimated price per sqm (THB/m²): 100000


In [70]:
# STEP 9: Build the model input from the user's answers
# `user_scaled` was previously only present as leftover kernel state; define it here so
# the notebook runs from a fresh kernel. Values are keyed by column name and then ordered
# by `features`, so they line up with the columns the scaler and model were fitted on.
user_values = {
    'rent_cd_bed': user_bed,
    'rent_cd_bath': user_bath,
    'rent_cd_floorarea': user_floorarea,
    'near_rail_meter': user_near_rail,
    'price_per_sqm': user_price_per_sqm,
}
user_input = pd.DataFrame([user_values])[features]
user_scaled = scaler.transform(user_input)

# STEP 10: Predict
predicted_price = best_model.predict(user_scaled)[0]
print(f"\n💰 Predicted Price: {predicted_price:,.0f} THB")


💰 Predicted Price: 30,840 THB


# Filter Condos: Top 5 Cheapest and Most Expensive Condos Based on User Input

In [66]:
# STEP 11: Filter condos similar to user input
# Exact match on bedrooms, bathrooms and floor area ...
same_unit = (
    (df['rent_cd_bed'] == user_bed) &
    (df['rent_cd_bath'] == user_bath) &
    (df['rent_cd_floorarea'] == user_floorarea)
)
# ... and rail distance within ±100 m of the value the user entered.
near_rail = df['near_rail_meter'].between(user_near_rail - 100, user_near_rail + 100)

similar_condos = df[same_unit & near_rail].copy()

# If nothing matches, drop only the rail-distance condition and keep the
# bed / bath / floor-area match.
# (The earlier fallback mis-placed a bracket -- df['rent_cd_floorarea' == x] -- which
# indexed the frame with False and raised KeyError whenever this branch ran.)
if similar_condos.empty:
    print("⚠️ No exact match found within 100 meters. Relaxing the filter...")
    similar_condos = df[same_unit].copy()

# STEP 12: Get Top 5 Cheapest Condos
top_5_cheapest = similar_condos.sort_values(by='rent_cd_price').head(5)
top_5_cheapest['label'] = 'CHEAPEST'

# STEP 13: Get Top 5 Most Expensive Condos
# NOTE(review): when fewer than 10 condos match, the same listing appears under both
# labels -- confirm whether the two lists should be mutually exclusive.
top_5_expensive = similar_condos.sort_values(by='rent_cd_price', ascending=False).head(5)
top_5_expensive['label'] = 'EXPENSIVE'

# STEP 14: Combine into one table (cheapest rows first, then most expensive)
recommended = pd.concat([top_5_cheapest, top_5_expensive], ignore_index=True)

In [67]:
# Columns shown in the recommendation table, in display order.
cols = [
    'label',
    'new_condo_name',
    'rent_cd_price',
    'rent_cd_bed',
    'rent_cd_bath',
    'rent_cd_floorarea',
    'near_rail_meter',
    'star',
    'rent_cd_agent',
    'rent_cd_tel',
]

# Repeat the model's prediction above the table so both can be read together.
print(
    f"\n💰 Predicted Price: {predicted_price:,.0f} THB",
    "📊 Top 5 Cheapest and Most Expensive Condos Based on Your Input:",
    sep="\n",
)
display(recommended.loc[:, cols])


💰 Predicted Price: 30,840 THB
📊 Top 5 Cheapest and Most Expensive Condos Based on Your Input:


Unnamed: 0,label,new_condo_name,rent_cd_price,rent_cd_bed,rent_cd_bath,rent_cd_floorarea,near_rail_meter,star,rent_cd_agent,rent_cd_tel
0,CHEAPEST,Lumpini Ville Phahol-Suthisarn,12000.0,1,1,30,770.0,4.3,Agentbkk,+66 95 896 6656
1,CHEAPEST,Lumpini Place Phahol-Saphankhwai,12000.0,1,1,30,880.0,4.2,Connex Property,+66 99 019 9900
2,CHEAPEST,Lumpini Place Phahol-Saphankhwai,12000.0,1,1,30,880.0,4.2,ต้องตา ทิพปภานาท,+66 81 988 2491
3,CHEAPEST,Ivy Ampio,17000.0,1,1,30,770.0,4.4,Kittithorn Khongkhaluang,+66 81 508 6066
4,CHEAPEST,Ivy Ampio,18000.0,1,1,30,770.0,4.4,Prakoonvit Manakorkiert,+66 62 536 0108
5,EXPENSIVE,Lumpini Place Rama 9-Ratchada,21000.0,1,1,30,880.0,4.4,Narattawan Wangsan,+66 80 641 1019
6,EXPENSIVE,Ivy Ampio,18000.0,1,1,30,770.0,4.4,Prakoonvit Manakorkiert,+66 62 536 0108
7,EXPENSIVE,Ivy Ampio,17000.0,1,1,30,770.0,4.4,Kittithorn Khongkhaluang,+66 81 508 6066
8,EXPENSIVE,Lumpini Ville Phahol-Suthisarn,12000.0,1,1,30,770.0,4.3,Agentbkk,+66 95 896 6656
9,EXPENSIVE,Lumpini Place Phahol-Saphankhwai,12000.0,1,1,30,880.0,4.2,Connex Property,+66 99 019 9900
