In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score, confusion_matrix

# Train and evaluate a k-nearest-neighbours price regressor on the Karachi
# real-estate CSV, then summarise its errors as a three-class (Low / Medium /
# High) confusion matrix and an accuracy figure derived from that matrix.

# Load the dataset
data = pd.read_csv('karachi_real_estate_data.csv')

# Display the first rows to check that the data loaded correctly
print(data.head())

# Encode the categorical columns (Location and Property Type) as integer codes.
# One encoder is kept per column so that each mapping stays available.
# NOTE(review): integer codes give the categories an arbitrary order and
# spacing, which a distance-based model such as KNN will treat as meaningful.
# One-hot encoding is the usual alternative -- confirm before switching, as it
# changes the feature columns that the rest of the notebook may rely on.
label_encoder_location = LabelEncoder()
data['Location'] = label_encoder_location.fit_transform(data['Location'])

label_encoder_property = LabelEncoder()
data['Property Type'] = label_encoder_property.fit_transform(data['Property Type'])

# Separate features and target variable. 'House ID' is dropped along with the
# target so that it is not used as a feature.
X = data.drop(columns=['House ID', 'Price ($)'])  # Features
y = data['Price ($)']  # Target variable

# Split the dataset into training (80%) and test (20%) sets; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features (important for KNN, which compares raw distances).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the KNN regressor model
knn_model = KNeighborsRegressor(n_neighbors=5)
# Train the model on the training data
knn_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_model.predict(X_test)

# Evaluate the model's performance with regression metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)

# Create a confusion matrix by binning actual and predicted prices into
# categories: Low (<= 400000), Medium (400000, 800000], High (> 800000).
# The outer edges are open-ended on purpose: pd.cut yields NaN for any value
# that falls outside the bin edges, so with a closed range such as
# [0, ..., 1200000] a price above the top edge would turn into a missing label
# and break (or silently drop out of) the confusion matrix below.
bins = [-np.inf, 400000, 800000, np.inf]
labels = ['Low', 'Medium', 'High']
y_test_binned = pd.cut(y_test, bins=bins, labels=labels)
y_pred_binned = pd.cut(y_pred, bins=bins, labels=labels)

# Generate and display the confusion matrix (rows: actual, columns: predicted,
# both in the order given by `labels`)
conf_matrix = confusion_matrix(y_test_binned, y_pred_binned, labels=labels)
print("Confusion Matrix:\n", conf_matrix)

# Calculate accuracy from the confusion matrix (on a scale from 0 to 1):
# the diagonal holds the samples whose predicted bin equals the actual bin.
correct_predictions = np.diagonal(conf_matrix).sum()
total_predictions = conf_matrix.sum()

accuracy = correct_predictions / total_predictions

# Print the accuracy (on a scale of 0 to 1)
print(f"Accuracy: {accuracy:.4f}")

   House ID         Location  Size (sq ft)  Bedrooms  Bathrooms  Price ($)  \
0      4001          Clifton          1200         2          1     500000   
1      4002      DHA Phase 5          2000         3          2     750000   
2      4003  Gulshan-e-Iqbal          1500         2          2     400000   
3      4004  North Nazimabad          1800         3          2     650000   
4      4005    PECHS Block 2          1000         1          1     300000   

   Year Built  Condition Rating  Distance to City Center (km)  Property Type  
0        2010                 4                             5      Apartment  
1        2015                 5                            10  Single Family  
2        2008                 3                             8      Townhouse  
3        2012                 4                             7  Single Family  
4        2005                 2                             6      Apartment  
Mean Absolute Error: 170909.0909090909
R2 Score: -0.25936