## 1. Data Preparation

### 1.1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import time

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV , KFold
from sklearn.metrics import mean_squared_error 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge , ElasticNet , Lasso
from sklearn.ensemble import RandomForestRegressor  , AdaBoostRegressor 

- Import data, check for comlumns and variables
### 1.2. Load and Clean Data
Load the dataset, inspect its structure, and perform initial cleaning. We drop the original 'Price' column (in Toman) and use 'Price(USD)' for our analysis, renaming it to 'Price'.

In [2]:
houses = pd.read_csv("./data/housePrice.csv")
houses = houses.drop('Price', axis=1)
houses = houses.rename(columns={'Price(USD)': 'Price'})
houses.info()
houses.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3479 non-null   object 
 1   Room       3479 non-null   int64  
 2   Parking    3479 non-null   bool   
 3   Warehouse  3479 non-null   bool   
 4   Elevator   3479 non-null   bool   
 5   Address    3456 non-null   object 
 6   Price      3479 non-null   float64
dtypes: bool(3), float64(1), int64(1), object(2)
memory usage: 119.0+ KB


Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
0,63,1,True,True,True,Shahran,61666.67
1,60,1,True,True,True,Shahran,61666.67
2,79,2,True,True,True,Pardis,18333.33
3,95,2,True,True,True,Shahrake Qods,30083.33
4,123,2,True,True,True,Shahrake Gharb,233333.33


- Since the Address column has some missing data in certain rows, we will drop these rows.

In [3]:
houses = houses.dropna(subset=['Address'])

- Check the header again

In [4]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3456 entries, 0 to 3478
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3456 non-null   object 
 1   Room       3456 non-null   int64  
 2   Parking    3456 non-null   bool   
 3   Warehouse  3456 non-null   bool   
 4   Elevator   3456 non-null   bool   
 5   Address    3456 non-null   object 
 6   Price      3456 non-null   float64
dtypes: bool(3), float64(1), int64(1), object(2)
memory usage: 145.1+ KB


- That said, the data is now complete. HOWEVER, we have not yet categorized the data into the two main types: Numerical and Categorical. Now, we will do that.

In [5]:
# Convert specified columns to categorical data type
houses['Room'] = houses['Room'].astype('category')
houses['Parking'] = houses['Parking'].astype('category')
houses['Warehouse'] = houses['Warehouse'].astype('category')
houses['Elevator'] = houses['Elevator'].astype('category')

# Fix 'Area' datatype
houses['Area'] = pd.to_numeric(houses['Area'], errors='coerce')
houses = houses.dropna(subset=['Area'])
# Verify the changes
print("Data types after conversion:")
print(houses.dtypes)
houses.info()

Data types after conversion:
Area          float64
Room         category
Parking      category
Warehouse    category
Elevator     category
Address        object
Price         float64
dtype: object


<class 'pandas.core.frame.DataFrame'>
Index: 3450 entries, 0 to 3478
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Area       3450 non-null   float64 
 1   Room       3450 non-null   category
 2   Parking    3450 non-null   category
 3   Warehouse  3450 non-null   category
 4   Elevator   3450 non-null   category
 5   Address    3450 non-null   object  
 6   Price      3450 non-null   float64 
dtypes: category(4), float64(2), object(1)
memory usage: 121.7+ KB


### 1.3. Outlier Removal
To prevent extreme values from skewing the analysis, we remove the top and bottom 1% of properties based on price.

In [6]:
# Calculate 1st and 99th percentiles
lower_percentile = houses['Price'].quantile(0.01)
upper_percentile = houses['Price'].quantile(0.99)

print(f"1st percentile (1% lowest): ${lower_percentile:.2f}")
print(f"99th percentile (1% highest): ${upper_percentile:.2f}")

# Remove outliers
houses_before = len(houses)
houses = houses[(houses['Price'] >= lower_percentile) & (houses['Price'] <= upper_percentile)]
houses_after = len(houses)

print(f"\nDataset size before removing outliers: {houses_before}")
print(f"Dataset size after removing outliers: {houses_after}")
print(f"Removed {houses_before - houses_after} outliers ({((houses_before - houses_after) / houses_before * 100):.1f}%)")

1st percentile (1% lowest): $9833.33
99th percentile (1% highest): $1333333.33

Dataset size before removing outliers: 3450
Dataset size after removing outliers: 3389
Removed 61 outliers (1.8%)


## 2. Feature Engineering & Data Preprocessing

- Convert the True/False values in the amenities to 0/1.

In [7]:
# Convert True/False values to 1/0 for utility features
houses['Parking'] = houses['Parking'].astype(int)
houses['Warehouse'] = houses['Warehouse'].astype(int)
houses['Elevator'] = houses['Elevator'].astype(int)

    # Display the updated dataframe
houses.tail()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
3474,86.0,2,1,1,1,Southern Janatabad,116666.67
3475,83.0,2,1,1,1,Niavaran,226666.67
3476,75.0,2,0,0,0,Parand,12166.67
3477,105.0,2,1,1,1,Dorous,186666.67
3478,82.0,2,0,1,1,Parand,12000.0


In [8]:
# Create categorical features using get_dummies for Address, Area_Group, and Price_Group
categorical_features = pd.get_dummies(houses[["Address"]])
houses = houses.merge(categorical_features, left_index=True, right_index=True)
houses.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Address_Abazar,Address_Abbasabad,Address_Absard,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
0,63.0,1,1,1,1,Shahran,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,60.0,1,1,1,1,Shahran,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,79.0,2,1,1,1,Pardis,18333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,95.0,2,1,1,1,Shahrake Qods,30083.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,123.0,2,1,1,1,Shahrake Gharb,233333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
prepared_dataset = houses.drop(["Address"], axis=1)
prepared_dataset.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Address_Abazar,Address_Abbasabad,Address_Absard,Address_Abuzar,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
0,63.0,1,1,1,1,61666.67,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,60.0,1,1,1,1,61666.67,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,79.0,2,1,1,1,18333.33,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,95.0,2,1,1,1,30083.33,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,123.0,2,1,1,1,233333.33,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 3. Model training

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Define features (X) and target (y)
X = prepared_dataset.drop(["Price"], axis=1)
y = prepared_dataset["Price"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Initialize the MinMaxScaler to scale features to a 0-1 range
scaler = MinMaxScaler()

# Fit on training data and transform both train and test data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Data split and scaled with MinMaxScaler. Ready for training.")

Data split and scaled with MinMaxScaler. Ready for training.


### 3.1. Feature Scaling

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

- Train shape

In [12]:
print("X Train and Test shape:")
print(f'X_train : {X_train.shape}')
print(f'X_test  : {X_test.shape}')
print("---" * 30)
print("y Train and Test shape:")
print(f'y_train : {y_train.shape}')
print(f'y_test  : {y_test.shape}')
print("---" * 30)

# Also print the feature count and sample count
print("Dataset Summary:")
print(f"Total samples: {len(X_train) + len(X_test)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/(len(X_train) + len(X_test)):.2%})")
print(f"Testing samples: {len(X_test)} ({len(X_test)/(len(X_train) + len(X_test)):.2%})")
print(f"Number of features: {X_train.shape[1]}")

X Train and Test shape:
X_train : (2372, 195)
X_test  : (1017, 195)
------------------------------------------------------------------------------------------
y Train and Test shape:
y_train : (2372,)
y_test  : (1017,)
------------------------------------------------------------------------------------------
Dataset Summary:
Total samples: 3389
Training samples: 2372 (69.99%)
Testing samples: 1017 (30.01%)
Number of features: 195


- Save the index and performance of each type of model on the dataset.

In [13]:
Model_Name  = []
Train_Score = []
Test_Score  = []
MSE_Score   = []
RMSE_Score  = []
Time_Score  = []

In [14]:
def model_prediction(model):
    
    start_time = time.time()
    model.fit(X_train, y_train)
    y_prediction = model.predict(X_test)
    y_prediction = [i if i>0 else 0 for i in y_prediction]
    TestScore = r2_score(y_test, y_prediction)
    TrainScore = model.score(X_train, y_train)
    # TestScore = model.score(X_test, y_test) 
    MSEscore = mean_squared_error(y_test, y_prediction)
    RMSEscore = np.sqrt(MSEscore)
    end_time = time.time()
    Timescore = end_time - start_time
    
    Name = str(model).split('(')[0]
    Model_Name.append(Name)
    Train_Score.append(TrainScore)
    Test_Score.append(TestScore)
    MSE_Score.append(MSEscore)
    RMSE_Score.append(RMSEscore)
    Time_Score.append(Timescore)

    print(f'Model: {Name}')
    print('=' * 50)
    print(f'Training R² Score: {TrainScore:.4f} ({TrainScore*100:.2f}%)')
    print(f'Testing R² Score:  {TestScore:.4f} ({TestScore*100:.2f}%)')
    print('=' * 50)
    print(f'Mean Squared Error:  ${MSEscore:,.0f}')
    print(f'Root Mean Squared Error: ${RMSEscore:,.0f}')
    print('=' * 50)
    print(f'Training Time: {Timescore:.3f} seconds')
    print('\n')


1.  Regression Model

In [15]:
model_prediction(ElasticNet())

Model: ElasticNet
Training R² Score: 0.7100 (71.00%)
Testing R² Score:  0.6815 (68.15%)
Mean Squared Error:  $13,252,965,674
Root Mean Squared Error: $115,122
Training Time: 0.160 seconds




In [16]:
model_prediction(Lasso())

Model: Lasso
Training R² Score: 0.7582 (75.82%)
Testing R² Score:  0.7496 (74.96%)
Mean Squared Error:  $10,418,575,305
Root Mean Squared Error: $102,071
Training Time: 2.742 seconds




  model = cd_fast.enet_coordinate_descent(


In [17]:
model_prediction(Ridge())

Model: Ridge
Training R² Score: 0.7582 (75.82%)
Testing R² Score:  0.7488 (74.88%)
Mean Squared Error:  $10,452,821,726
Root Mean Squared Error: $102,239
Training Time: 0.128 seconds




In [18]:
model_prediction(LinearRegression())
model_prediction(KNeighborsRegressor())
model_prediction(DecisionTreeRegressor())
model_prediction(RandomForestRegressor())
model_prediction(AdaBoostRegressor())

Model: LinearRegression
Training R² Score: 0.7578 (75.78%)
Testing R² Score:  0.7646 (76.46%)
Mean Squared Error:  $9,795,978,243
Root Mean Squared Error: $98,975
Training Time: 0.139 seconds


Model: KNeighborsRegressor
Training R² Score: 0.8615 (86.15%)
Testing R² Score:  0.7783 (77.83%)
Mean Squared Error:  $9,223,426,539
Root Mean Squared Error: $96,039
Training Time: 0.931 seconds


Model: DecisionTreeRegressor
Training R² Score: 0.9840 (98.40%)
Testing R² Score:  0.7955 (79.55%)
Mean Squared Error:  $8,507,477,510
Root Mean Squared Error: $92,236
Training Time: 0.112 seconds


Model: RandomForestRegressor
Training R² Score: 0.9618 (96.18%)
Testing R² Score:  0.8312 (83.12%)
Mean Squared Error:  $7,021,690,345
Root Mean Squared Error: $83,796
Training Time: 2.136 seconds


Model: AdaBoostRegressor
Training R² Score: 0.7130 (71.30%)
Testing R² Score:  0.7136 (71.36%)
Mean Squared Error:  $11,915,117,384
Root Mean Squared Error: $109,156
Training Time: 0.376 seconds




In [19]:
from sklearn.model_selection import cross_val_score
import warnings

# Perform cross-validation for all models
warnings.filterwarnings('ignore')

# Define models to test
models = [
    ElasticNet(),
    Lasso(),
    Ridge(),
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor()
]

# Store cross-validation results
cv_results = {
    'Model': [],
    'Mean R² Score': [],
    'Std R² Score': [],
    'Mean RMSE': [],
    'Mean Training Time (s)': []
}

print("Cross-Validation Results:")
print("=" * 80)

for model in models:
    model_name = str(model).split('(')[0]
    print(f"\nEvaluating {model_name}...")
    
    # Perform 5-fold cross-validation for R² score
    start_time = time.time()
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    
    # Perform cross-validation for RMSE
    cv_rmse_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse_scores = np.sqrt(-cv_rmse_scores)  # Convert to positive RMSE
    
    end_time = time.time()
    training_time = (end_time - start_time) / 5  # Average time per fold
    
    # Store results
    cv_results['Model'].append(model_name)
    cv_results['Mean R² Score'].append(cv_scores.mean())
    cv_results['Std R² Score'].append(cv_scores.std())
    cv_results['Mean RMSE'].append(cv_rmse_scores.mean())
    cv_results['Mean Training Time (s)'].append(training_time)
    
    print(f"Mean R² Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    print(f"Mean RMSE: ${cv_rmse_scores.mean():,.0f} (±${cv_rmse_scores.std():,.0f})")
    print(f"Average Training Time: {training_time:.3f} seconds")

# Convert to DataFrame for better visualization
cv_df = pd.DataFrame(cv_results)
cv_df['Mean R² Score'] = cv_df['Mean R² Score'].apply(lambda x: f"{x:.4f} ({x*100:.2f}%)")
cv_df['Mean RMSE'] = cv_df['Mean RMSE'].apply(lambda x: f"${x:,.0f}")

# Sort by R² score (convert back to float for sorting)
cv_df_sorted = cv_df.copy()
cv_df_sorted['R2_numeric'] = [float(score.split(' ')[0]) for score in cv_df['Mean R² Score']]
cv_df_sorted = cv_df_sorted.sort_values('R2_numeric', ascending=False).drop('R2_numeric', axis=1)

print("\n" + "=" * 80)
print("CROSS-VALIDATION SUMMARY (Sorted by R² Score):")
print("=" * 80)
print(cv_df_sorted.to_string(index=False))

Cross-Validation Results:

Evaluating ElasticNet...
Mean R² Score: 0.6800 (±0.0132)
Mean RMSE: $107,923 (±$6,467)
Average Training Time: 0.043 seconds

Evaluating Lasso...
Mean R² Score: 0.7099 (±0.0401)
Mean RMSE: $102,271 (±$4,533)
Average Training Time: 1.444 seconds

Evaluating Ridge...
Mean R² Score: 0.7095 (±0.0400)
Mean RMSE: $102,346 (±$4,502)
Average Training Time: 0.036 seconds

Evaluating LinearRegression...
Mean R² Score: -24764840845909345438533681152.0000 (±25883646277259507520870809600.0000)
Mean RMSE: $25,837,418,820,381,515,776 (±$16,364,701,857,607,477,248)
Average Training Time: 0.073 seconds

Evaluating KNeighborsRegressor...
Mean R² Score: 0.7418 (±0.0516)
Mean RMSE: $96,098 (±$5,158)
Average Training Time: 0.167 seconds

Evaluating DecisionTreeRegressor...
Mean R² Score: 0.6972 (±0.0473)
Mean RMSE: $106,132 (±$11,116)
Average Training Time: 0.578 seconds

Evaluating RandomForestRegressor...
Mean R² Score: 0.7949 (±0.0428)
Mean RMSE: $85,939 (±$9,696)
Average Train

In [20]:
from scipy import stats

# Perform statistical comparison between RandomForest and other models
print("Statistical Comparison: RandomForestRegressor vs Other Models")
print("=" * 70)

# Get RandomForest cross-validation scores
rf_model = RandomForestRegressor(random_state=42)
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='r2')

print(f"RandomForest CV Scores: {rf_cv_scores}")
print(f"RandomForest Mean R²: {rf_cv_scores.mean():.4f} (±{rf_cv_scores.std():.4f})")
print()

# Compare with other top models
comparison_models = [
    ('KNeighborsRegressor', KNeighborsRegressor()),
    ('Lasso', Lasso(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42))
]

for model_name, model in comparison_models:
    # Get CV scores for comparison model
    model_cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    
    # Perform paired t-test
    t_stat, p_value = stats.ttest_rel(rf_cv_scores, model_cv_scores)
    
    # Calculate effect size (Cohen's d)
    pooled_std = np.sqrt((rf_cv_scores.var() + model_cv_scores.var()) / 2)
    cohens_d = (rf_cv_scores.mean() - model_cv_scores.mean()) / pooled_std
    
    print(f"RandomForest vs {model_name}:")
    print(f"  {model_name} Mean R²: {model_cv_scores.mean():.4f} (±{model_cv_scores.std():.4f})")
    print(f"  Difference: {rf_cv_scores.mean() - model_cv_scores.mean():.4f}")
    print(f"  T-statistic: {t_stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    print(f"  Cohen's d: {cohens_d:.4f}")
    
    # Interpret results
    if p_value < 0.01:
        significance = "highly significant (p < 0.01)"
    elif p_value < 0.05:
        significance = "significant (p < 0.05)"
    else:
        significance = "not significant (p >= 0.05)"
    
    if abs(cohens_d) < 0.2:
        effect_size = "small"
    elif abs(cohens_d) < 0.5:
        effect_size = "medium"
    else:
        effect_size = "large"
    
    print(f"  Result: RandomForest is {significance} better with {effect_size} effect size")
    print("-" * 50)

# Overall conclusion
print("\nCONCLUSION:")
print("=" * 70)
best_competitor = max(comparison_models, key=lambda x: cross_val_score(x[1], X_train, y_train, cv=5, scoring='r2').mean())
competitor_scores = cross_val_score(best_competitor[1], X_train, y_train, cv=5, scoring='r2')
final_t_stat, final_p_value = stats.ttest_rel(rf_cv_scores, competitor_scores)

if final_p_value < 0.05:
    print(f"✅ RandomForestRegressor significantly outperforms all other models")
    print(f"   Best competitor: {best_competitor[0]} (Mean R²: {competitor_scores.mean():.4f})")
    print(f"   Statistical significance: p = {final_p_value:.4f}")
else:
    print(f"❌ RandomForestRegressor does not significantly outperform all models")
    print(f"   Best competitor: {best_competitor[0]} performs similarly")

Statistical Comparison: RandomForestRegressor vs Other Models
RandomForest CV Scores: [0.82083379 0.80961879 0.74577877 0.77163777 0.84299266]
RandomForest Mean R²: 0.7982 (±0.0349)

RandomForest vs KNeighborsRegressor:
  KNeighborsRegressor Mean R²: 0.7418 (±0.0516)
  Difference: 0.0564
  T-statistic: 1.6705
  P-value: 0.1701
  Cohen's d: 1.2782
  Result: RandomForest is not significant (p >= 0.05) better with large effect size
--------------------------------------------------
RandomForest vs Lasso:
  Lasso Mean R²: 0.7099 (±0.0401)
  Difference: 0.0883
  T-statistic: 3.0874
  P-value: 0.0367
  Cohen's d: 2.3488
  Result: RandomForest is significant (p < 0.05) better with large effect size
--------------------------------------------------
RandomForest vs Ridge:
  Ridge Mean R²: 0.7095 (±0.0400)
  Difference: 0.0887
  T-statistic: 3.1017
  P-value: 0.0362
  Cohen's d: 2.3621
  Result: RandomForest is significant (p < 0.05) better with large effect size
-------------------------------