In [67]:
## import library
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder,TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
import re
from xgboost import XGBRegressor
from gc import callbacks
from skopt.callbacks import DeltaYStopper

In [62]:
warnings.filterwarnings("ignore")
# Load and preprocess data
df = pd.read_csv('/content/gurgaon_property_real_estate_data_before_ordinal_encoding')

# Clean and preprocess
df = df.drop(columns=['Study Room', 'Pooja Room', 'Others'])
df['furnish_type'] = df['furnish_type'].replace({0: 'unfurnished', 1: 'semifurnished', 2: 'furnished'})
df['Property_type'] = df['Property_type'].str.strip().str.lower()
df['agePossession'] = df['agePossession'].str.strip().str.lower()
df.columns = df.columns.str.strip().str.lower()
df.rename(columns={'servant room': 'servant_room', 'store room': 'store_room'}, inplace=True)

# Handle sector inconsistencies
df['sector'] = df['sector'].str.strip().str.lower()
df['sector'] = df['sector'].replace({
    'sohna road road': 'sohna road',
    'sector 37c': 'sector 37',
    'sector 3 phase 3 extension gurgaon': 'sector 3',
    'sector 3 phase 3 extension': 'sector 3',
    'sector 3 phase 2': 'sector 3'
})
df['sector']=df['sector'].apply(lambda x: re.sub(r'(sector \d+)[a-zA-Z]*', r'\1', x))
# Define target and features
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target

## `Ordinal encoding`

In [None]:
# Separate feature types
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns_ordinal = ['property_type', 'balcony', 'agepossession', 'luxury_category', 'floor_category','sector', 'furnish_type']

# Define preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),
    ('ordinal', OrdinalEncoder(), categorical_columns_ordinal)
], remainder='passthrough')

# Define pipeline
def build_pipeline(model):
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

# Define scoring function
def evaluate_model(model_name, model):
    pipeline = build_pipeline(model)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_actual = np.expm1(y_pred)  # Transform predictions back
    y_test_actual = np.expm1(y_test)  # Transform target back
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
    r2 = r2_score(y_test_actual, y_pred_actual)
    return {
        'Model': model_name,
        'CV_R2': cv_scores.mean(),
        'MAE': mae,
        'RMSE': rmse,
        'Test_R2': r2
    }

# Define models
models = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()}

# Evaluate all models
results = []
for model_name, model in models.items():
    results.append(evaluate_model(model_name, model))

# Display results
results_df1= pd.DataFrame(results).sort_values(by='Test_R2', ascending=False)
print(results_df1)

               Model     CV_R2       MAE      RMSE   Test_R2
7  gradient boosting  0.864014  0.541362  1.111811  0.797786
5      random forest  0.869802  0.499792  1.196707  0.765726
6        extra trees  0.845654  0.550140  1.236815  0.749759
9                mlp  0.792187  0.674362  1.359879  0.697484
8           adaboost  0.749229  0.768249  1.448944  0.656559
4      decision tree  0.771018  0.616495  1.471553  0.645758
1                svr  0.744865  0.782754  1.670724  0.543377
0         linear_reg  0.662923  0.844837  1.682557  0.536886
2              ridge  0.662948  0.844826  1.682585  0.536870
3              LASSO  0.051466  1.350791  2.460519  0.009620


In [None]:
# Define feature columns
numerical_features = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_features = ['property_type', 'balcony', 'agepossession', 'luxury_category', 'floor_category', 'sector', 'furnish_type']

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scale numerical features
        ('cat', OrdinalEncoder(), categorical_features)  # Encode categorical features
    ]
)

# Load and preprocess data
X = df.drop(columns=['price'])  # Feature matrix
y = np.log1p(df['price'])  # Log-transform target variable for better scaling

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Manually preprocess data
X_train = preprocessor.fit_transform(X_train)  # Apply preprocessing
X_test = preprocessor.transform(X_test)  # Apply preprocessing to test data

# Fit XGBoost model
xgb_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Reverse the log-transform for predictions and actual values
y_pred_actual = np.expm1(y_pred)
y_test_actual = np.expm1(y_test)

# Calculate metrics
mae = mean_absolute_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
r2 = r2_score(y_test_actual, y_pred_actual)

# Cross-validation
# cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='r2')
# cv = cv_scores.mean()

# Compile results
results1 = {
    'Model': 'XGBRegressor',
    'MAE': mae,
    'RMSE': rmse,
    'Test_R2': r2
}

# Display result
print(results1)


{'Model': 'XGBRegressor', 'MAE': 0.48234742821186355, 'RMSE': 1.079170409811177, 'Test_R2': 0.809485227748641}


In [None]:
# Display results
results1= pd.DataFrame([results1])
print(results)

[{'Model': 'linear_reg', 'CV_R2': 0.6629231831883418, 'MAE': 0.8448370542897369, 'RMSE': 1.682556649439279, 'Test_R2': 0.5368859157866723}, {'Model': 'svr', 'CV_R2': 0.7448645746433813, 'MAE': 0.7827543277786966, 'RMSE': 1.6707240521287825, 'Test_R2': 0.543376721245472}, {'Model': 'ridge', 'CV_R2': 0.6629476556531605, 'MAE': 0.8448264835832731, 'RMSE': 1.6825847308551036, 'Test_R2': 0.5368704571598242}, {'Model': 'LASSO', 'CV_R2': 0.05146581700586837, 'MAE': 1.3507912110030271, 'RMSE': 2.4605190163934756, 'Test_R2': 0.009619569733788746}, {'Model': 'decision tree', 'CV_R2': 0.7710181825032002, 'MAE': 0.6164949546750109, 'RMSE': 1.4715527098024117, 'Test_R2': 0.6457578427159789}, {'Model': 'random forest', 'CV_R2': 0.8698020596247197, 'MAE': 0.4997918080054338, 'RMSE': 1.1967069270534325, 'Test_R2': 0.7657259376934398}, {'Model': 'extra trees', 'CV_R2': 0.8456537501007251, 'MAE': 0.5501398260883019, 'RMSE': 1.2368146184609592, 'Test_R2': 0.749759373960156}, {'Model': 'gradient boosting'

In [None]:
result_df=pd.concat([results_df1,results1])
result_df.sort_values(by=['MAE','Test_R2'],ascending=[True,False])

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
0,XGBRegressor,,0.482347,1.07917,0.809485
5,random forest,0.869802,0.499792,1.196707,0.765726
7,gradient boosting,0.864014,0.541362,1.111811,0.797786
6,extra trees,0.845654,0.55014,1.236815,0.749759
4,decision tree,0.771018,0.616495,1.471553,0.645758
9,mlp,0.792187,0.674362,1.359879,0.697484
8,adaboost,0.749229,0.768249,1.448944,0.656559
1,svr,0.744865,0.782754,1.670724,0.543377
2,ridge,0.662948,0.844826,1.682585,0.53687
0,linear_reg,0.662923,0.844837,1.682557,0.536886


#`One hot encoding`

In [None]:
df.columns

Index(['property_type', 'sector', 'price', 'bedroom', 'bathroom', 'balcony',
       'agepossession', 'built_up_area', 'servant_room', 'store_room',
       'furnish_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [None]:
# Define feature columns
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns = ['agepossession', 'sector', 'furnish_type']
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']
# Preprocessor
preprocessor1 = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1', OneHotEncoder(drop='first'), categorical_columns)
])

In [None]:
pipeline1=Pipeline([
        ('preprocessor', preprocessor1),
        ('regressor', LinearRegression())
    ])

In [None]:
kfold1 = KFold(n_splits=10, shuffle=True, random_state=42)
# Perform cross-validation and calculate the scores
cv_scores = cross_val_score(pipeline1,X,y,cv=kfold1, scoring='r2',error_score='raise')
print('training_data',np.mean(cv_scores)*100)
print('training_data',np.std(cv_scores))

training_data 81.61510993367388
training_data 0.07851525816515205


In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
pipeline1.fit(X_train,y_train)

In [None]:
y_pred=pipeline1.predict(X_test)

In [None]:
y_pred=np.expm1(y_pred)

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.5969193256511214

In [None]:
# Define feature columns
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns = ['agepossession', 'sector', 'furnish_type']
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target
# Preprocessor
preprocessor1 = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1', OneHotEncoder(drop='first',sparse_output=False), categorical_columns)
])
# Define pipeline
def build_pipeline(model):
    return Pipeline([
        ('preprocessor', preprocessor1),
        ('regressor', model)
    ])

# Define scoring function
def evaluate_model(model_name, model):
    pipeline = build_pipeline(model)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2',error_score='raise')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_actual = np.expm1(y_pred)  # Transform predictions back
    y_test_actual = np.expm1(y_test)  # Transform target back
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
    r2 = r2_score(y_test_actual, y_pred_actual)
    return {
        'Model': model_name,
        'CV_R2': cv_scores.mean(),
        'MAE': mae,
        'RMSE': rmse,
        'Test_R2': r2
    }

# Define models
models = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()}

# Evaluate all models
results = []
for model_name, model in models.items():
    results.append(evaluate_model(model_name, model))
# Display results
results_df = pd.DataFrame(results).sort_values(by='Test_R2', ascending=False)
print(results_df)

               Model     CV_R2       MAE      RMSE   Test_R2
7  gradient boosting  0.869965  0.540975  1.099114  0.802379
5      random forest  0.885381  0.462431  1.101504  0.801518
6        extra trees  0.884647  0.432984  1.103032  0.800967
9                mlp  0.869334  0.529401  1.112463  0.797549
4      decision tree  0.799500  0.593396  1.383254  0.686994
2              ridge  0.816177  0.595959  1.446800  0.657575
0         linear_reg  0.816109  0.597019  1.453080  0.654596
8           adaboost  0.735574  0.821813  1.454289  0.654021
1                svr  0.751302  0.769208  1.654180  0.552375
3              LASSO  0.051466  1.350791  2.460519  0.009620


In [None]:
results_df = pd.DataFrame(results).sort_values(by=['MAE','Test_R2'], ascending=[True,False])
results_df

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
6,extra trees,0.884647,0.432984,1.103032,0.800967
5,random forest,0.885381,0.462431,1.101504,0.801518
9,mlp,0.869334,0.529401,1.112463,0.797549
7,gradient boosting,0.869965,0.540975,1.099114,0.802379
4,decision tree,0.7995,0.593396,1.383254,0.686994
2,ridge,0.816177,0.595959,1.4468,0.657575
0,linear_reg,0.816109,0.597019,1.45308,0.654596
1,svr,0.751302,0.769208,1.65418,0.552375
8,adaboost,0.735574,0.821813,1.454289,0.654021
3,LASSO,0.051466,1.350791,2.460519,0.00962


In [None]:
# Define feature columns
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns = ['agepossession', 'sector', 'furnish_type']
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']
# Preprocessor
preprocessor2 = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1', OneHotEncoder(drop='first'), categorical_columns)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Manually preprocess data
X_train = preprocessor2.fit_transform(X_train)  # Apply preprocessing
X_test = preprocessor2.transform(X_test)  # Apply preprocessing to test data

# Fit XGBoost model
xgb_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Calculate metrics
y_pred_actual = np.expm1(y_pred)  # In case the target was log-transformed
y_test_actual = np.expm1(y_test)  # Same for the test target

mae = mean_absolute_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
r2 = r2_score(y_test_actual, y_pred_actual)


# cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='r2')
# cv = cv_scores.mean()

# Result
results = {
    'Model': 'XGBRegressor',
    'MAE': mae,
    'RMSE': rmse,
    'Test_R2': r2
}

# Display result
print(results)
results=pd.DataFrame([results])

{'Model': 'XGBRegressor', 'MAE': 0.49635366737926506, 'RMSE': 1.1684181966812206, 'Test_R2': 0.7766709480330387}


In [None]:
result_df=pd.concat([results_df,results])
result_df.sort_values(by=['MAE','Test_R2'],ascending=[True,False])

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
6,extra trees,0.884647,0.432984,1.103032,0.800967
5,random forest,0.885381,0.462431,1.101504,0.801518
0,XGBRegressor,,0.496354,1.168418,0.776671
9,mlp,0.869334,0.529401,1.112463,0.797549
7,gradient boosting,0.869965,0.540975,1.099114,0.802379
4,decision tree,0.7995,0.593396,1.383254,0.686994
2,ridge,0.816177,0.595959,1.4468,0.657575
0,linear_reg,0.816109,0.597019,1.45308,0.654596
1,svr,0.751302,0.769208,1.65418,0.552375
8,adaboost,0.735574,0.821813,1.454289,0.654021


##`One hot encoding with pca`

In [None]:
# !pip install --upgrade scikit-learn xgboost

In [None]:
from sklearn.decomposition import PCA
# Define feature columns
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns = ['agepossession', 'sector', 'furnish_type']
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']
# Preprocessor
preprocessor3= ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1', OneHotEncoder(drop='first', sparse_output=False), categorical_columns)
])
# Define pipeline
def build_pipeline(model):
    return Pipeline([
        ('preprocessor', preprocessor3),('feature_extraction',PCA(n_components=0.95, svd_solver='auto')),
        ('regressor', model)
    ])

# Define scoring function
def evaluate_model(model_name, model):
    pipeline = build_pipeline(model)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_actual = np.expm1(y_pred)  # Transform predictions back
    y_test_actual = np.expm1(y_test)  # Transform target back
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
    r2 = r2_score(y_test_actual, y_pred_actual)
    return {
        'Model': model_name,
        'CV_R2': cv_scores.mean(),
        'MAE': mae,
        'RMSE': rmse,
        'Test_R2': r2
    }

# Define models
models = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()}

# Evaluate all models
results2 = []
for model_name, model in models.items():
    results2.append(evaluate_model(model_name, model))
# Display results
results_df2= pd.DataFrame(results2).sort_values(by='Test_R2', ascending=False)
print(results_df2)

               Model     CV_R2       MAE      RMSE   Test_R2
5      random forest  0.749324  0.651467  1.360176  0.697351
6        extra trees  0.720272  0.704488  1.499273  0.632286
4      decision tree  0.673052  0.732094  1.580702  0.591259
7  gradient boosting  0.607078  0.886277  1.663272  0.547441
8           adaboost  0.284387  1.285805  2.206923  0.203249
9                mlp  0.205087  1.284736  2.319871  0.119608
1                svr  0.214199  1.211117  2.378071  0.074880
0         linear_reg  0.055262  1.354705  2.452730  0.015880
2              ridge  0.055262  1.354705  2.452730  0.015880
3              LASSO  0.051685  1.350699  2.460330  0.009772


In [None]:
# Define feature columns
numerical_columns = ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']
categorical_columns = ['agepossession', 'sector', 'furnish_type']
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']
# Preprocessor
preprocessor2 = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1', OneHotEncoder(drop='first'), categorical_columns)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Manually preprocess data
X_train = preprocessor3.fit_transform(X_train)  # Apply preprocessing
X_test = preprocessor3.transform(X_test)  # Apply preprocessing to test data

# Fit XGBoost model
xgb_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Calculate metrics
y_pred_actual = np.expm1(y_pred)  # In case the target was log-transformed
y_test_actual = np.expm1(y_test)  # Same for the test target

mae = mean_absolute_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
r2 = r2_score(y_test_actual, y_pred_actual)


# cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='r2')
# cv = cv_scores.mean()

# Result
results2 = {
    'Model': 'XGBRegressor',
    'MAE': mae,
    'RMSE': rmse,
    'Test_R2': r2
}

# Display result
print(results2)
results1=pd.DataFrame([results2])

{'Model': 'XGBRegressor', 'MAE': 0.4948926882458404, 'RMSE': 1.1981857682273673, 'Test_R2': 0.7651465674426877}


In [None]:
result_df=pd.concat([results_df2,results1])
result_df.sort_values(by=['MAE','Test_R2'],ascending=[True,False])

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
0,XGBRegressor,,0.494893,1.198186,0.765147
5,random forest,0.749324,0.651467,1.360176,0.697351
6,extra trees,0.720272,0.704488,1.499273,0.632286
4,decision tree,0.673052,0.732094,1.580702,0.591259
7,gradient boosting,0.607078,0.886277,1.663272,0.547441
1,svr,0.214199,1.211117,2.378071,0.07488
9,mlp,0.205087,1.284736,2.319871,0.119608
8,adaboost,0.284387,1.285805,2.206923,0.203249
3,LASSO,0.051685,1.350699,2.46033,0.009772
2,ridge,0.055262,1.354705,2.45273,0.01588


##`Target Encoding`

In [None]:
y.name

'price'

In [None]:
# Columns to encode
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor4 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agepossession'])
    ],
    remainder='passthrough'
)
# Assuming X and y are your features and target
# Apply manual Target Encoding on 'sector' column
sector_encoding =df.groupby('sector')[y.name].mean()
df['sector_encoded'] =df['sector'].map(sector_encoding)
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor4),
    ('regressor', LinearRegression())
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2', error_score='raise')

# Print mean and standard deviation of R2 scores
print(scores.mean(), scores.std())

0.7777993930120749 0.08578677273764702


In [None]:
# Columns to encode
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor4 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agepossession'])
    ],
    remainder='passthrough'
)
# Assuming X and y are your features and target
# Apply manual Target Encoding on 'sector' column
sector_encoding =df.groupby('sector')[y.name].mean()
df['sector_encoded'] =df['sector'].map(sector_encoding)
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target

# Creating a pipeline
def build_pipeline(model):
  return Pipeline([
    ('preprocessor', preprocessor4),
    ('regressor',model)
])
# Define scoring function
def evaluate_model(model_name, model):
    pipeline =build_pipeline(model)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_actual = np.expm1(y_pred)  # Transform predictions back
    y_test_actual = np.expm1(y_test)  # Transform target back
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
    r2 = r2_score(y_test_actual, y_pred_actual)
    return {
        'Model': model_name,
        'CV_R2': cv_scores.mean(),
        'MAE': mae,
        'RMSE': rmse,
        'Test_R2': r2
    }

# Define models
models = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()}

# Evaluate all models
results3 = []
for model_name, model in models.items():
    results3.append(evaluate_model(model_name, model))
# Display results
results_df3 = pd.DataFrame(results3).sort_values(by=['MAE','Test_R2'], ascending=[True,False])
results_df3

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
5,random forest,0.894661,0.431525,1.041363,0.8226
6,extra trees,0.889855,0.446103,1.117718,0.795632
7,gradient boosting,0.884545,0.476376,0.996774,0.837467
9,mlp,0.850786,0.53906,1.133706,0.789744
4,decision tree,0.800481,0.590527,1.450831,0.655664
1,svr,0.807145,0.676372,1.825936,0.454595
8,adaboost,0.810488,0.687565,1.252516,0.743366
2,ridge,0.777818,0.702843,2.227278,0.188483
0,linear_reg,0.777799,0.702875,2.228118,0.187871
3,LASSO,0.051466,1.350791,2.460519,0.00962


In [None]:
import category_encoders as ce
# Columns to encode
columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor4 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agepossession'])
    ],
    remainder='passthrough'
)
# Assuming X and y are your features and target
# Apply manual Target Encoding on 'sector' column
sector_encoding =df.groupby('sector')[y.name].mean()
df['sector_encoded'] =df['sector'].map(sector_encoding)
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Manually preprocess data
X_train = preprocessor4.fit_transform(X_train)  # Apply preprocessing
X_test = preprocessor4.transform(X_test)  # Apply preprocessing to test data

# Fit XGBoost model
xgb_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Calculate metrics
y_pred_actual = np.expm1(y_pred)  # In case the target was log-transformed
y_test_actual = np.expm1(y_test)  # Same for the test target

mae = mean_absolute_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
r2 = r2_score(y_test_actual, y_pred_actual)


# cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='r2')
# cv = cv_scores.mean()

# Result
results = {
    'Model': 'XGBRegressor',
    'MAE': mae,
    'RMSE': rmse,
    'Test_R2': r2
}

# Display result
print(results)
results2=pd.DataFrame([results])

{'Model': 'XGBRegressor', 'MAE': 0.43973966351865046, 'RMSE': 1.051307993553215, 'Test_R2': 0.8191957929966034}


In [None]:
result_df=pd.concat([results_df3,results2])
result_df.sort_values(by=['MAE','Test_R2'],ascending=[True,False])

Unnamed: 0,Model,CV_R2,MAE,RMSE,Test_R2
5,random forest,0.894661,0.431525,1.041363,0.8226
0,XGBRegressor,,0.43974,1.051308,0.819196
6,extra trees,0.889855,0.446103,1.117718,0.795632
7,gradient boosting,0.884545,0.476376,0.996774,0.837467
9,mlp,0.850786,0.53906,1.133706,0.789744
4,decision tree,0.800481,0.590527,1.450831,0.655664
1,svr,0.807145,0.676372,1.825936,0.454595
8,adaboost,0.810488,0.687565,1.252516,0.743366
2,ridge,0.777818,0.702843,2.227278,0.188483
0,linear_reg,0.777799,0.702875,2.228118,0.187871


#`Random forest and xgboost is ml algo. which are consistently good`

#`HyperParameter Tuning`

In [None]:
param_grid = {
    'regressor__n_estimators': [100, 150, 200],  # Number of trees in the forest
    'regressor__max_depth': [20, 30, None],  # Depth of each tree
    'regressor__max_samples': [0.25, 0.5, 1.0],  # Proportion of samples to use for each tree
    'regressor__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'regressor__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'regressor__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'regressor__criterion': ['squared_error', 'absolute_error'],  # Function to measure the quality of a split
}

In [17]:
# # Columns to encode
# columns_to_encode = ['property_type','sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']

# # Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']),
#         ('cat', OrdinalEncoder(), columns_to_encode),
#         ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agepossession'])
#     ],
#     remainder='passthrough'
# )

In [4]:
pipeline=Pipeline([('preprocessor',preprocessor),('regressor',RandomForestRegressor())])

In [3]:
# Assuming X and y are your features and target
# Apply manual Target Encoding on 'sector' column
sector_encoding =df.groupby('sector')[y.name].mean()
df['sector_encoded'] =df['sector'].map(sector_encoding)
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform target

In [4]:
X.head()

Unnamed: 0,property_type,sector,bedroom,bathroom,balcony,agepossession,built_up_area,servant_room,store_room,furnish_type,luxury_category,floor_category,sector_encoded
0,flat,sector 7,2.0,2.0,2,relatively new property,1013.0,0,0,semifurnished,Low,Medium,1.757941
1,flat,sector 3,2.0,2.0,2,old property,731.0,0,0,semifurnished,Low,Low,0.83525
2,flat,sohna road,2.0,2.0,1,new property,669.0,0,0,unfurnished,Low,High rise,0.656946
3,flat,sector 61,2.0,2.0,1,new property,1350.0,0,0,unfurnished,moderate,Low,2.088293
4,flat,sector 92,2.0,2.0,3+,under construction,1210.0,0,0,unfurnished,Low,Medium,0.929604


In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=216 , # Number of random samples to try
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=4
)

In [None]:
random_search.fit(X,y)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits


In [None]:
random_search.best_score_

0.892996556781599

In [None]:
random_search.best_params_

{'regressor__n_estimators': 150,
 'regressor__min_samples_split': 2,
 'regressor__min_samples_leaf': 1,
 'regressor__max_samples': 1.0,
 'regressor__max_features': 'sqrt',
 'regressor__max_depth': 20,
 'regressor__criterion': 'absolute_error'}

In [None]:
pipeline.fit(X,y)

In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
print(cv_scores)

[0.89970126 0.87978996 0.9187024  0.90829117 0.85890981 0.92439778
 0.89726175 0.90673653 0.87945707 0.87816681]


In [None]:
print(np.mean(cv_scores))

0.8951414538996852


In [8]:
# ! pip install scikit-optimize

In [43]:
# ! pip install --upgrade scikit-optimize

##`bayersion opt.`

In [50]:
%%time
from skopt.callbacks import DeltaYStopper
early_stopping = DeltaYStopper(delta=0.01, n_best=10)
# Define the hyperparameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 500, 1000],  # Specific values for number of trees
    'regressor__max_features': [0.5, 0.6, 0.7, 0.8, 1.0],  # Specific values for max features
    'regressor__min_samples_split': [2, 5, 10, 15],  # Minimum number of samples required to split an internal node
    'regressor__min_samples_leaf': [1, 2, 5, 10],  # Minimum number of samples required to be at a leaf node
    'regressor__max_leaf_nodes': [10, 20, 50, 100],  # Maximum number of leaf nodes in each tree
    'regressor__max_depth': [5, 10, 20, 50],  # Maximum depth of the tree
    'regressor__criterion': ['squared_error', 'absolute_error']  # The function to measure the quality of a split
}
# Create a pipeline with target encoding
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

Best Hyperparameters score : -0.1775987773940358
Best Hyperparameters: OrderedDict([('regressor__criterion', 'absolute_error'), ('regressor__max_depth', 50), ('regressor__max_features', 0.7), ('regressor__max_leaf_nodes', 100), ('regressor__min_samples_leaf', 2), ('regressor__min_samples_split', 10), ('regressor__n_estimators', 1000)])
R-squared Score on Test Data: 0.9122
CPU times: user 3min 21s, sys: 10.2 s, total: 3min 31s
Wall time: 39min 39s


In [51]:
Param_grid = {
    'regressor__n_estimators': (50, 100, 200, 300, 400, 500, 600),
    'regressor__max_features': (0.5, 0.7, 0.8, 0.9, 1.0),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.5, 0.6, 0.7, 0.8, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 20, 50),
    'regressor__min_samples_leaf': (1, 2, 4, 8, 10),
    'regressor__min_weight_fraction_leaf': (0.0, 0.1, 0.2, 0.3, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 50, 100, 150),
    'regressor__min_impurity_decrease': (0.0, 0.1, 0.2, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.1, 0.2, 0.5, 1.0)
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

Best Hyperparameters score : -0.17742617290645496
Best Hyperparameters: OrderedDict([('regressor__criterion', 'absolute_error'), ('regressor__max_depth', 50), ('regressor__max_features', 0.6), ('regressor__max_leaf_nodes', 100), ('regressor__min_samples_leaf', 5), ('regressor__min_samples_split', 10), ('regressor__n_estimators', 500)])
R-squared Score on Test Data: 0.9048


In [53]:
param_grid = {
    'regressor__n_estimators': (50, 100, 150, 200, 300, 400, 500, 600, 700, 800,900 ,1000,1100),
    'regressor__max_features': (0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.4, 0.5, 0.6, 0.7, 0.8, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 50),
    'regressor__min_samples_leaf': (1, 2, 4, 6, 8, 10, 15),
    'regressor__min_weight_fraction_leaf': (0.0, 0.05, 0.1, 0.2, 0.3, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 30, 50, 100, 150, 200),
    'regressor__min_impurity_decrease': (0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0)
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

Best Hyperparameters score : -0.2386742697752447
Best Hyperparameters: OrderedDict([('regressor__bootstrap', True), ('regressor__ccp_alpha', 0.01), ('regressor__max_features', 0.7), ('regressor__max_leaf_nodes', 100), ('regressor__max_samples', 0.4), ('regressor__min_impurity_decrease', 0.0), ('regressor__min_samples_leaf', 8), ('regressor__min_samples_split', 2), ('regressor__min_weight_fraction_leaf', 0.0), ('regressor__n_estimators', 1100)])
R-squared Score on Test Data: 0.7791


In [54]:
param_grid = {
    'regressor__n_estimators': (50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1500),
    'regressor__max_features': (0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 'sqrt', 'log2'),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 40, 50, 100),
    'regressor__min_samples_leaf': (1, 2, 4, 6, 8, 10, 12, 15, 20),
    'regressor__min_weight_fraction_leaf': (0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 30, 50, 100, 150, 200, 250, 300),
    'regressor__min_impurity_decrease': (0.0, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0),
    'regressor__max_depth': (None, 10, 20, 30, 40, 50),
    'regressor__criterion': ['squared_error', 'absolute_error', 'friedman_mse']
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

Best Hyperparameters score : -0.186517618363216
Best Hyperparameters: OrderedDict([('regressor__bootstrap', True), ('regressor__ccp_alpha', 0.0), ('regressor__criterion', 'friedman_mse'), ('regressor__max_depth', 20), ('regressor__max_features', 0.5), ('regressor__max_leaf_nodes', 300), ('regressor__max_samples', 0.8), ('regressor__min_impurity_decrease', 0.1), ('regressor__min_samples_leaf', 15), ('regressor__min_samples_split', 5), ('regressor__min_weight_fraction_leaf', 0.0), ('regressor__n_estimators', 800)])
R-squared Score on Test Data: 0.8901


In [55]:
param_grid = {
    'regressor__n_estimators': (50, 100, 200, 300, 400, 500, 600, 800, 1000, 1200),
    'regressor__max_features': (0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.5, 0.6, 0.7, 0.8, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 20, 30, 40, 50),
    'regressor__min_samples_leaf': (1, 2, 3, 4, 5, 8, 10),
    'regressor__min_weight_fraction_leaf': (0.0, 0.05, 0.1, 0.2, 0.3, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 50, 100, 150, 200, 300),
    'regressor__min_impurity_decrease': (0.0, 0.01, 0.1, 0.2, 0.3, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0),
    'regressor__max_depth': (None, 5, 10, 20, 30, 40, 50),  # Limiting depth of the tree
    'regressor__criterion': ['squared_error', 'absolute_error', 'friedman_mse']  # Criterion for splitting
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

Best Hyperparameters score : -0.22562387392698996
Best Hyperparameters: OrderedDict([('regressor__bootstrap', True), ('regressor__ccp_alpha', 0.01), ('regressor__criterion', 'absolute_error'), ('regressor__max_depth', 40), ('regressor__max_features', 0.8), ('regressor__max_leaf_nodes', 300), ('regressor__max_samples', 0.6), ('regressor__min_impurity_decrease', 0.01), ('regressor__min_samples_leaf', 5), ('regressor__min_samples_split', 2), ('regressor__min_weight_fraction_leaf', 0.0), ('regressor__n_estimators', 500)])
R-squared Score on Test Data: 0.7916


In [57]:
param_grid = {
    'regressor__n_estimators': (50, 100, 200, 300, 400, 500, 600, 800, 1000, 1200),
    'regressor__max_features': (0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,'sqrt', 'log2'),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.5, 0.6, 0.7, 0.8, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 20, 30, 40, 50),
    'regressor__min_samples_leaf': (1, 2, 3, 4, 5, 8, 10),
    'regressor__min_weight_fraction_leaf': (0.0, 0.05, 0.1, 0.2, 0.3, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 50, 100, 150, 200, 300),
    'regressor__min_impurity_decrease': (0.0, 0.01, 0.1, 0.2, 0.3, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0),
    'regressor__max_depth': (None, 5, 10, 20, 30, 40, 50),  # Limiting depth of the tree
    'regressor__criterion': ['squared_error', 'absolute_error', 'friedman_mse']  # Criterion for splitting
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,  # Number of iterations
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
# Fit the model
bayes_search.fit(X, y,callback=[early_stopping])
# Evaluate on the test set
best_model=bayes_search.best_estimator_
y_pred =best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Best Hyperparameters score : {bayes_search.best_score_}")
print(f"Best Hyperparameters: {bayes_search.best_params_}")
print(f"R-squared Score on Test Data: {r2:.4f}")

In [58]:
%%time
from skopt import BayesSearchCV
Param_grid = {
    'regressor__n_estimators': (50, 100, 200, 300, 400, 500, 600,700,800,900,1000),
    'regressor__max_features': (0.3,0.4,0.5, 0.7, 0.8, 0.9, 1.0,'sqrt','log2'),
    'regressor__bootstrap': [True],
    'regressor__max_samples': (0.5, 0.6, 0.7, 0.8,0.9, 1.0),
    'regressor__min_samples_split': (2, 5, 10, 20, 50),
    'regressor__min_samples_leaf': (1, 2, 4, 8, 10),
    'regressor__min_weight_fraction_leaf': (0.0, 0.1, 0.2, 0.3, 0.5),
    'regressor__max_leaf_nodes': (10, 20, 50, 100, 150),
    'regressor__min_impurity_decrease': (0.0, 0.1, 0.2, 0.5, 1.0),
    'regressor__ccp_alpha': (0.0, 0.1, 0.2, 0.5, 1.0),
    'regressor__max_depth': (None, 5, 10, 20, 30, 40, 50),  # Limiting depth of the tree
    'regressor__criterion': ['squared_error', 'absolute_error', 'friedman_mse']  # Criterion for splitting
}
# Use BayesSearchCV to tune the hyperparameters
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=Param_grid,
    n_iter=125,  # Number of iterations
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,verbose=1
)
# Manually split the data
train_size = int(0.8 * len(X))  # 80% for training
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
# Fit the model
bayes_search.fit(X, y)

# Get the best parameters and model
best_params = bayes_search.best_params_
best_model = bayes_search.best_estimator_

# Evaluate on the test set
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"R-squared Score on Test Data: {r2:.4f}")

In [1]:
## best paramters
#('regressor__bootstrap', True), ('regressor__ccp_alpha', 0.0),
#('regressor__criterion', 'friedman_mse'), ('regressor__max_depth', 50),
#('regressor__max_features', 0.7), ('regressor__max_leaf_nodes', 150),
#('regressor__max_samples', 1.0), ('regressor__min_impurity_decrease', 0.1), ('regressor__min_samples_leaf', 2),
#('regressor__min_samples_split', 2), ('regressor__min_weight_fraction_leaf', 0.0), ('regressor__n_estimators', 1000)

##`Final model after model selection`

In [63]:

# Columns to encode
columns_to_encode = ['property_type', 'sector', 'balcony', 'agepossession', 'furnish_type', 'luxury_category', 'floor_category']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant_room', 'store_room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['agepossession']),
        ('target', TargetEncoder(target_type='continuous'),['sector'])

    ],
    remainder='passthrough'
)

In [64]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        bootstrap=True,
        ccp_alpha=0.0,
        criterion='friedman_mse',
        max_depth=50,
        max_features=0.7,
        max_leaf_nodes=150,
        max_samples=1.0,
        min_impurity_decrease=0.1,
        min_samples_leaf=2,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=1000
    ))
])

In [65]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [66]:
pipeline.fit(X,y)

In [32]:
y_pred=pipeline.predict(X_test)
print(r2_score(y_test,y_pred))

0.8807758739620891


In [33]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_train = cross_val_score(
    pipeline, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error'
)
print("Mean Absolute Error (Train):", -np.mean(cv_scores_train))

cv_scores_test = cross_val_score(
    pipeline, X_test, y_test, cv=kfold, scoring='neg_mean_absolute_error'
)
print("Mean Absolute Error (Test):", -np.mean(cv_scores_test))

Mean Absolute Error (Train): 0.12915412329900797
Mean Absolute Error (Test): 0.143116568971925


In [82]:
cv_scores_test = cross_val_score(
    pipeline, X, y, cv=kfold, scoring='neg_mean_absolute_error'
)
print("Mean Absolute Error (Test):", -np.mean(cv_scores_test))

Mean Absolute Error (Test): 0.12596045631972236


In [34]:
kfold = KFold(n_splits=25, shuffle=True, random_state=42)
cv_scores_train = cross_val_score(
    pipeline, X_train, y_train, cv=kfold, scoring='r2'
)
print("r2 (Train):",np.mean(cv_scores_train))

cv_scores_test = cross_val_score(
    pipeline, X_test, y_test, cv=kfold, scoring='r2'
)
print("r2 (Test):", np.mean(cv_scores_test))

r2 (Train): 0.8811929942369247
r2 (Test): 0.8094400318286322


In [68]:
with open('model(random_forest).pikle','wb') as file:
  pickle.dump(pipeline,file)

In [69]:
with open('data.df','wb') as file:
  pickle.dump(X,file)

In [70]:
X.columns

Index(['property_type', 'sector', 'bedroom', 'bathroom', 'balcony',
       'agepossession', 'built_up_area', 'servant_room', 'store_room',
       'furnish_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [79]:
X['sector'].unique()

array(['sector 7', 'sector 3', 'sohna road', 'sector 61', 'sector 92',
       'sector 36', 'dwarka expressway', 'sector 104', 'sector 88',
       'sector 78', 'sector 81', 'sector 79', 'sector 33', 'sector 90',
       'sector 108', 'sector 62', 'sector 102', 'sector 89', 'sector 113',
       'sector 65', 'sector 48', 'sector 37', 'sector 110', 'sector 43',
       'sector 68', 'sector 109', 'sector 106', 'sector 82', 'sector 85',
       'sector 28', 'sector 10', 'manesar', 'sector 84', 'sector 71',
       'sector 77', 'sector 67', 'sector 57', 'sector 95', 'sector 99',
       'sector 103', 'sector 49', 'sector 30', 'sector 86', 'sector 66',
       'sector 4', 'sector 22', 'sector 63', 'sector 52', 'sector 107',
       'sector 12', 'sector 2', 'sector 91', 'sector 70', 'gwal pahari',
       'sector 47', 'sector 54', 'sector 111', 'sector 41', 'sector 73',
       'sector 56', 'sector 83', 'sector 53', 'sector 72', 'sector 69',
       'sector 9', 'sector 14', 'sector 50', 'sector 25', 'sec

In [80]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedroom', 'bathroom', 'balcony',
       'agepossession', 'built_up_area', 'servant_room', 'store_room',
       'furnish_type', 'luxury_category', 'floor_category']
# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedroom,bathroom,balcony,agepossession,built_up_area,servant_room,store_room,furnish_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [81]:
np.expm1(pipeline.predict(one_df))

array([2.37243812])