In [1]:
#!pip install xgboost

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
base_url = "https://raw.githubusercontent.com/pranta-iitp/Real-Estate-Property-Price-Prediction-Project/main/gurgaon_properties_post_feature_selection%20(1).csv"
df = pd.read_csv(base_url)

In [4]:
df.shape

(3499, 13)

In [5]:
df.sample(3)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,furnishing_type,luxury_category,floor_category
414,flat,sector 65,2.35,3,3,2,Relatively New,1650.0,1,1,0,High,Mid Floor
2317,flat,sector 82a,0.62,1,1,0,Relatively New,650.0,0,0,0,Low,Mid Floor
3225,house,sector 14,4.15,5,4,2,Old Property,2250.0,0,0,1,Medium,Low Floor


In [6]:
X = df.drop(columns=['price'])
y = df['price']
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

#### We will use three encoding technique *(One Hot Encoding, Ordinal Encoding and Target Encoding)* to convert categorical column to numerical columns and then we will train multiple model. Once training done, we will choose the best model.

### Ordinal Encoding

In [7]:
# Define columns for encoding (verify these exist in your DataFrame)
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Define numerical columns (adjust based on your actual column names)
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room']

# Create robust preprocessor
preprocessor_OE = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), columns_to_encode)
    ],
    remainder='passthrough'
)

In [8]:
def scorer(model_name, model, preprocessor, X, y_transformed):
    """
    Enhanced model evaluation function with comprehensive metrics
    """
    import time

    output = []
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation with R2
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    # CV R2 mean and standard deviation
    output.append(scores.mean())
    output.append(scores.std())

    # Train-test split for additional metrics
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )

    # Measure training time
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Convert back from log scale to original price scale
    y_pred_original = np.expm1(y_pred)
    y_test_original = np.expm1(y_test)

    # # Test set R2 score
    # test_r2 = r2_score(y_test_original, y_pred_original)
    # output.append(test_r2)

    # Mean Absolute Error (in crores)
    mae = mean_absolute_error(y_test_original, y_pred_original)
    output.append(mae)

    # Root Mean Squared Error (in crores) - FIXED VERSION
    rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
    output.append(rmse)

    # # Mean Absolute Percentage Error
    # mape = np.mean(np.abs((y_test_original - y_pred_original) / y_test_original)) * 100
    # output.append(mape)

    # # Training time (seconds)
    # output.append(training_time)

    return output


In [9]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [10]:
# CORRECTED: Include all required parameters
model_output = []
for model_name, model in model_dict.items():
    result = scorer(model_name, model, preprocessor_OE, X, y_transformed)
    model_output.append(result)

# Convert to DataFrame for analysis
columns = ['Model', 'CV_R2_Mean', 'CV_R2_Std', 'MAE_Crores', 'RMSE_Crores']
results_df = pd.DataFrame(model_output, columns=columns)



In [11]:
results_df.sort_values(['MAE_Crores'])

Unnamed: 0,Model,CV_R2_Mean,CV_R2_Std,MAE_Crores,RMSE_Crores
6,extra trees,0.893363,0.019066,0.413509,0.95081
10,xgboost,0.901101,0.017442,0.424302,0.867886
9,mlp,0.889452,0.014526,0.440066,0.950229
5,random forest,0.884826,0.022312,0.445565,0.974155
1,svr,0.896979,0.014095,0.476935,1.076749
7,gradient boosting,0.859214,0.015172,0.527171,1.020052
4,decision tree,0.812231,0.026798,0.556428,1.320859
2,ridge,0.850301,0.01537,0.568205,1.474227
0,linear_reg,0.845999,0.016818,0.571927,1.489647
8,adaboost,0.709358,0.027362,0.8117,1.450096


#### OneHotEncoding

In [12]:
df.sample(3)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,furnishing_type,luxury_category,floor_category
2456,flat,sector 14,2.2,3,4,3+,Old Property,1956.0,0,0,0,Low,Mid Floor
2044,flat,sector 37d,1.25,3,3,3,Under Construction,2283.0,0,0,1,Medium,Mid Floor
2948,house,sector 3,0.9,3,1,0,Old Property,1008.0,0,0,0,Low,Low Floor


In [13]:
# Define columns for encoding (verify these exist in your DataFrame)
columns_to_encode_OE = ['balcony', 'luxury_category', 'floor_category','furnishing_type']
columns_to_encode_OHE = ['property_type','sector','agePossession']
# Define numerical columns (adjust based on your actual column names)
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room']

# Creating a column transformer for preprocessing
preprocessor_OHE = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(), columns_to_encode_OE),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',),columns_to_encode_OHE)
    ],
    remainder='passthrough'
)

In [14]:
# CORRECTED: Include all required parameters
model_output = []
for model_name, model in model_dict.items():
    result = scorer(model_name, model, preprocessor_OHE, X, y_transformed)
    model_output.append(result)

# Convert to DataFrame for analysis
columns = ['Model', 'CV_R2_Mean', 'CV_R2_Std', 'MAE_Crores', 'RMSE_Crores']
results_df = pd.DataFrame(model_output, columns=columns)




In [15]:
results_df.sort_values(['MAE_Crores'])

Unnamed: 0,Model,CV_R2_Mean,CV_R2_Std,MAE_Crores,RMSE_Crores
6,extra trees,0.896041,0.020086,0.404567,0.94394
10,xgboost,0.899935,0.020593,0.4081,0.863213
9,mlp,0.88804,0.017039,0.445304,0.943305
5,random forest,0.883063,0.024033,0.451315,1.007883
1,svr,0.887856,0.016007,0.463816,1.003561
7,gradient boosting,0.859357,0.015866,0.518829,1.012886
4,decision tree,0.814628,0.032862,0.561954,1.334069
2,ridge,0.84616,0.018058,0.573462,1.47645
0,linear_reg,0.842747,0.01765,0.573827,1.472754
8,adaboost,0.721491,0.027479,0.811204,1.441649


### Target Encoder

In [16]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [17]:
import category_encoders as ce

# Define columns for encoding (verify these exist in your DataFrame)
columns_to_encode_OE = ['balcony', 'luxury_category', 'floor_category','furnishing_type']
columns_to_encode_OHE = ['property_type','agePossession']
# Define numerical columns (adjust based on your actual column names)
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room']

# Creating a column transformer for preprocessing
preprocessor_target_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(), columns_to_encode_OE),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',),columns_to_encode_OHE),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [18]:
# CORRECTED: Include all required parameters
model_output = []
for model_name, model in model_dict.items():
    result = scorer(model_name, model, preprocessor_target_encoding, X, y_transformed)
    model_output.append(result)

# Convert to DataFrame for analysis
columns = ['Model', 'CV_R2_Mean', 'CV_R2_Std', 'MAE_Crores', 'RMSE_Crores']
results_df = pd.DataFrame(model_output, columns=columns)

In [19]:
results_df.sort_values(['MAE_Crores'])

Unnamed: 0,Model,CV_R2_Mean,CV_R2_Std,MAE_Crores,RMSE_Crores
10,xgboost,0.91109,0.012912,0.430498,0.973963
6,extra trees,0.900864,0.013161,0.438835,1.021589
5,random forest,0.905644,0.012406,0.438908,0.975069
7,gradient boosting,0.888008,0.013556,0.481218,0.980557
1,svr,0.868739,0.014558,0.500788,1.04925
9,mlp,0.858184,0.020671,0.521806,1.064605
4,decision tree,0.828808,0.029092,0.569089,1.426326
0,linear_reg,0.824153,0.024262,0.618354,1.486588
2,ridge,0.824176,0.024237,0.618636,1.487523
8,adaboost,0.815651,0.018756,0.65524,1.283651


#### Hyperparameter Tuning of Random Forest

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
# Fixed parameter grid
param_grid_fixed = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
    'regressor__max_features': ['sqrt', 'log2', 0.3],  # NO 'auto'
    'regressor__max_samples': [0.8, 0.9, 1.0]
}


In [37]:
import category_encoders as ce

# Define columns for encoding (verify these exist in your DataFrame)
columns_to_encode_OE = ['balcony', 'luxury_category', 'floor_category','furnishing_type']
columns_to_encode_OHE = ['property_type','agePossession']
# Define numerical columns (adjust based on your actual column names)
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room']

# Creating a column transformer for preprocessing
preprocessor_target_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(), columns_to_encode_OE),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',),columns_to_encode_OHE),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [40]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_target_encoding),
    ('regressor', RandomForestRegressor())
])

In [41]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [42]:
search = GridSearchCV(
    pipeline,
    param_grid_fixed,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    error_score='raise'  # This will help debug any remaining issues
)

In [43]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


In [44]:
final_pipe = search.best_estimator_

In [45]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 0.3,
 'regressor__max_samples': 1.0,
 'regressor__min_samples_leaf': 1,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 200}

In [46]:
search.best_score_

np.float64(0.9021976839322849)

#### Exporting the model

In [47]:
# Define columns for encoding (verify these exist in your DataFrame)
columns_to_encode_OE = ['balcony', 'luxury_category', 'floor_category','furnishing_type']
columns_to_encode_OHE = ['property_type','agePossession']
# Define numerical columns (adjust based on your actual column names)
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room']

# Creating a column transformer for preprocessing
preprocessor_target_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(), columns_to_encode_OE),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',),columns_to_encode_OHE),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [48]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_target_encoding),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [49]:
pipeline.fit(X,y_transformed)

In [50]:
import pickle

with open('real_estate_model_RF.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [51]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)