In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load dataset
file_path = "Makaan_Properties_No_Duplicates.csv"
data = pd.read_csv(file_path)

# Data Cleaning
# The .str accessors below assume these columns are loaded as text.
# Size: drop thousands separators, then keep the first number in the cell.
# The pattern also captures a decimal part so fractional sizes are not
# truncated to their integer prefix.
data["Size"] = (
    data["Size"]
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    .astype(float)
)
# No_of_BHK: keep the first run of digits in the cell.
data["No_of_BHK"] = data["No_of_BHK"].str.extract(r"(\d+)", expand=False).astype(float)
# Price: the comma is a literal character, so no regex is needed.
data["Price"] = data["Price"].str.replace(",", "", regex=False).astype(float)

# Convert categorical columns to string to prevent encoding issues.
# Only non-null cells are converted: a blanket astype(str) would turn NaN into
# the literal string "nan", which hides missing values from the categorical
# imputer and makes the encoder treat "nan" as a real category.
for col in ["City_name", "Property_type"]:
    data[col] = data[col].where(data[col].isna(), data[col].astype(str))

# Remove outliers using IQR
# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous filters. Rows with NaN in
# any of these columns are dropped as well, because comparisons against NaN
# evaluate to False.
for col in ["Size", "No_of_BHK", "Price"]:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    data = data[(data[col] >= Q1 - 1.5 * IQR) & (data[col] <= Q3 + 1.5 * IQR)]

# Column groups: each group gets its own preprocessing branch below
numerical_cols = ["Size", "No_of_BHK"]
categorical_cols = ["City_name", "Property_type"]

# Model inputs (numeric columns first, then categorical) and regression target
features = numerical_cols + categorical_cols
target = "Price"

# Building blocks of the preprocessing branches:
# median / most-frequent imputation for missing values,
# standard scaling for numeric columns,
# one-hot encoding that ignores categories unseen during fit.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")
encoder = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

# Numeric branch: impute, then scale
numeric_branch = Pipeline(steps=[("imputer", num_imputer), ("scaler", scaler)])

# Categorical branch: impute, then one-hot encode
categorical_branch = Pipeline(steps=[("imputer", cat_imputer), ("encoder", encoder)])

# Preprocessor: route each column group through its branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_branch, numerical_cols),
        ("cat", categorical_branch, categorical_cols),
    ]
)

# Hold out 20% of the rows for the final evaluation (fixed seed)
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# KNN regressor chained after the preprocessing step; the "model" step name
# is what the "model__" prefix in the parameter grid refers to.
knn = KNeighborsRegressor()
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", knn)])

# Search space: even neighbour counts from 2 to 14, both weighting schemes,
# and two distance metrics.
param_grid = {
    "model__n_neighbors": [2, 4, 6, 8, 10, 12, 14],
    "model__weights": ["uniform", "distance"],
    "model__metric": ["euclidean", "manhattan"],
}

# Shuffled 5-fold cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by R², using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kf,
    scoring="r2",
    n_jobs=-1,
)

# Fit on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Best pipeline found by the search
tuned_knn = grid_search.best_estimator_

# Predictions of the tuned pipeline on the held-out test split
y_pred = tuned_knn.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the selected hyperparameters, then each metric to two decimals
print("Best Hyperparameters:", grid_search.best_params_)
for report_line in (
    f"MAE: {mae:.2f}",
    f"MSE: {mse:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
):
    print(report_line)

# Actual-vs-predicted scatter plot for the test split
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)

# Dashed red diagonal marks a perfect prediction; it spans the price range
# of the full target column `y`.
price_span = [y.min(), y.max()]
ax.plot(price_span, price_span, "r--")

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Property Prices")
plt.show()