In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Load dataset
file_path = "Makaan_Properties_No_Duplicates.csv"
data = pd.read_csv(file_path)

# Data Cleaning
# The three columns below are processed through the `.str` accessor, so they
# are expected to arrive from the CSV as text.

# Size: strip "," separators, keep the first run of digits, store as float.
data["Size"] = (
    data["Size"]
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+)", expand=False)
    .astype(float)
)

# No_of_BHK: keep the first run of digits. The cast is to float (not int)
# because `str.extract` yields NaN when a value is missing or has no digits,
# and NaN cannot be represented in an integer column -- an int cast would
# abort the whole script on the first such row instead of leaving the gap
# for the downstream imputation step.
data["No_of_BHK"] = (
    data["No_of_BHK"].str.extract(r"(\d+)", expand=False).astype(float)
)

# Price: strip "," separators and parse the remainder as a float.
data["Price"] = data["Price"].str.replace(",", "", regex=False).astype(float)

# Log transformation to reduce skewness.
# NOTE: from here on "Price" holds log1p(price); anything trained or scored
# against this column works in log space (invert with np.expm1).
data["Price"] = np.log1p(data["Price"])

# Feature Engineering
# Two column groups drive the preprocessing:
#   numeric     -> impute (mean) -> degree-2 polynomial expansion -> standardize
#   categorical -> impute (most frequent value) -> one-hot encode
numerical_cols = ["Size", "No_of_BHK"]
categorical_cols = ["City_name", "Property_type"]

# Model inputs are the numeric columns followed by the categorical ones.
features = numerical_cols + categorical_cols
target = "Price"

# Handle missing values
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# One-Hot Encoding for categorical variables; handle_unknown="ignore" keeps
# transform from raising on categories that were not present during fit.
encoder = OneHotEncoder(handle_unknown="ignore")

# Scaling numerical features
scaler = StandardScaler()

# Polynomial Features (degree=2 adds squares and the pairwise interaction;
# include_bias=False leaves out the constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)

# Per-group pipelines, named so the ColumnTransformer below reads cleanly.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", num_imputer),
        ("poly", poly),
        ("scaler", scaler),
    ]
)
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", cat_imputer),
        ("encoder", encoder),
    ]
)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Split data: hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define models
# Each candidate regressor is built on its own line-group, then collected in
# `models` keyed by the display name used for the results table. Insertion
# order is the order in which the models are trained and reported.
random_forest = RandomForestRegressor(
    criterion="squared_error",
    bootstrap=True,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
gradient_boosting = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)
extra_trees = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=30,
    random_state=42,
    n_jobs=-1,
)
knn = KNeighborsRegressor(n_neighbors=7, weights='distance')
svr = SVR(kernel="rbf", C=150, epsilon=0.05)

models = {
    "Random Forest": random_forest,
    "Gradient Boosting": gradient_boosting,
    "Extra Trees": extra_trees,
    "KNN": knn,
    "SVR": svr,
}

# Train models and evaluate performance
# Every model is wrapped in a pipeline with the shared preprocessor, fitted on
# the training split and scored on the held-out split.
# NOTE: the target column was log1p-transformed during cleaning, so y_test and
# y_pred -- and therefore every metric below -- are in log space.
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Compute metrics (MSE once; RMSE is its square root)
    mse = mean_squared_error(y_test, y_pred)
    results[model_name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R²": r2_score(y_test, y_pred),
        "MAPE": mean_absolute_percentage_error(y_test, y_pred),
    }

# Display results: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Before Hyperparameter Tuning:\n", results_df)

# Hyperparameter tuning for Random Forest
# Candidate values per RandomForestRegressor argument.
rf_param_space = {
    "n_estimators": [400, 800, 1200, 1500],
    "max_depth": [10, 30, 50, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

# The "model__" prefix addresses the pipeline step named "model".
param_grid_rf = {
    f"model__{arg_name}": candidates
    for arg_name, candidates in rf_param_space.items()
}

# Pipeline under search: shared preprocessor + a fresh random forest.
rf_search_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(n_jobs=-1, random_state=42)),
    ]
)

# Randomized search: 30 sampled parameter combinations, 5-fold CV, scored by R².
random_search_rf = RandomizedSearchCV(
    estimator=rf_search_pipeline,
    param_distributions=param_grid_rf,
    n_iter=30,
    cv=5,
    scoring="r2",
    random_state=42,
    n_jobs=-1,
)

# Train the tuned Random Forest model
random_search_rf.fit(X_train, y_train)
tuned_rf = random_search_rf.best_estimator_

# Predict with tuned model on the held-out split
y_pred_rf = tuned_rf.predict(X_test)

# Update results with tuned Random Forest model.
# Same metric keys as the existing columns of results_df; MSE is computed
# once and reused for RMSE.
mse_rf = mean_squared_error(y_test, y_pred_rf)
tuned_metrics = {
    "MAE": mean_absolute_error(y_test, y_pred_rf),
    "MSE": mse_rf,
    "RMSE": np.sqrt(mse_rf),
    "R²": r2_score(y_test, y_pred_rf),
    "MAPE": mean_absolute_percentage_error(y_test, y_pred_rf),
}
results_df.loc["Tuned Random Forest"] = tuned_metrics

print("\nFinal Results After Hyperparameter Tuning:\n", results_df)

# Accuracy Comparison
# Bar chart of the R² column of results_df, one bar per model (row label).
plt.figure(figsize=(8, 5))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 and scheduled
# for removal, so the x variable is also supplied as `hue`. `dodge=False`
# keeps one full-width bar per model. The legend this may create only repeats
# the x tick labels, so it is removed if present (done via the Axes rather
# than `legend=False`, which older seaborn releases do not accept).
ax = sns.barplot(
    x=results_df.index,
    y=results_df["R²"],
    hue=results_df.index,
    dodge=False,
    palette="viridis",
)
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()

plt.ylim(0, 1)
plt.ylabel("R² Score (Accuracy)")
plt.title("Model Accuracy Comparison (R² Score)")
plt.xticks(rotation=30)
plt.show()


Model Performance Before Hyperparameter Tuning:
                         MAE       MSE      RMSE        R²      MAPE
Random Forest      0.400469  0.305992  0.553166  0.778040  0.025780
Gradient Boosting  0.407396  0.300341  0.548033  0.782140  0.026280
Extra Trees        0.411447  0.330871  0.575214  0.759994  0.026473
KNN                0.413308  0.326580  0.571472  0.763107  0.026595
SVR                0.407495  0.329957  0.574419  0.760657  0.026358
