In [29]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = "Dataset/cleaned_dataset.csv"
df = pd.read_csv(file_path)

# Drop the "Property_ID" column as it's an identifier not useful for modeling.
df = df.drop(columns=["Property_ID"], errors='ignore')

# Define the numerical features which we will use for analysis and predictions.
num_features = ["Area", "Price"]

# These columns will keep the original, unscaled values to display later in the recommendations.
df["Original_Area"] = df["Area"]
df["Original_Price"] = df["Price"]

# A preprocessing pipeline is created using a StandardScaler to normalize the 'Area' and 'Price' columns for consistent scaling of the data.
preprocessor = Pipeline([
    ('scaler', StandardScaler())  # This scales the features to have zero mean and unit variance.
])

# Normalize the numerical features (Area and Price).
df[num_features] = preprocessor.fit_transform(df[num_features])

# Generate similarity labels for training, This function assigns a 'Similarity_Label' to properties based on their proximity to each other in price and area.
# If a property is similar to others, it gets a label of 1, otherwise, it stays 0.
df["Similarity_Label"] = 0  # Default label is "Not similar".

def assign_similarity_labels(df, threshold=0.1):
    for i, row in df.iterrows():
        area, price = row["Area"], row["Price"]
        # Calculate the Euclidean distance between this property and all others.
        distances = np.sqrt((df["Area"] - area) ** 2 + (df["Price"] - price) ** 2)
        closest_indices = distances.nsmallest(6).index  # Get the closest 5 properties plus itself.
        df.loc[closest_indices, "Similarity_Label"] = 1  # Mark these properties as similar.

assign_similarity_labels(df)

# We will split the data into a training set (80%) and a testing set (20%) for model evaluation.
X = df[num_features]
y = df["Similarity_Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# We will train several models, including KNN, Random Forest, and Gradient Boosting.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

# Train each model and evaluate it using accuracy and classification report.
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model on the training set.
    y_pred = model.predict(X_test)  # Make predictions on the test set.
    
    # Output the accuracy score and classification report for each model with clear separation
    print(f"\n{'-'*120}")
    print(f" {name} Performance Metrics:")
    print(f"{'-'*120}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{'-'*120}")

# This function will return a list of recommended properties based on the price and area specified by the user.
def recommend_properties(price, area, model_name, top_n=5):
    # Create a DataFrame with the appropriate columns and pass it to the scaler
    input_data = pd.DataFrame([[area, price]], columns=num_features)
    
    # Preprocess the input data (price and area) using the same scaler as the training data.
    input_data = preprocessor.transform(input_data)
    
    # Get the model
    model = models.get(model_name)
    
    # Calculate the similarity score for all properties in the dataset using the model's predictions.
    probabilities = model.predict_proba(df[num_features])[:, 1]
    df["Similarity_Score"] = probabilities
    
    # Get the top 5 properties with the highest similarity score.
    recommended = df.nlargest(top_n, "Similarity_Score").copy()
    
    # Rename the columns to display in Arabic for the output.
    recommended = recommended.rename(columns={
        "Property Type": "نوع العقار",
        "Location": "الموقع",
        "District": "الحي",
        "Bedrooms": "الغرف",
        "Bathrooms": "دورات المياة",
        "Original_Area": "المساحة",
        "Original_Price": "السعر",
        "Agency_Name": "الوكالة"
    })
    
    # Prepare the headers and results for displaying.
    headers = ["نوع العقار", "الموقع", "الحي", "الغرف", "دورات المياة", "المساحة", "السعر", "الوكالة"]
    results = recommended[headers]
    
    # Print the headers and the corresponding recommended properties in a clean format.
    print("\n" + " | ".join(headers))
    print("-" * 120)  
    for index, row in results.iterrows():
        print(" | ".join(str(x) for x in row.values))
    print("-" * 120)

    return results

# Example usage of the recommendation function:
price_input = 980000
area_input = 300

# Loop through all models and print recommendations for each one.
for model_name in models.keys():
    print(f"\n توصيات العقارات المتوافقة مع الميزانية والمساحة المطلوبة باستخدام مودل {model_name}:\n")
    recommend_properties(price_input, area_input, model_name=model_name)



------------------------------------------------------------------------------------------------------------------------
 KNN Performance Metrics:
------------------------------------------------------------------------------------------------------------------------
Accuracy: 0.9545

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.87      0.79        15
           1       0.99      0.96      0.97       139

    accuracy                           0.95       154
   macro avg       0.85      0.92      0.88       154
weighted avg       0.96      0.95      0.96       154

------------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------
 Random Forest Performance Metrics:
------------------------------------------------------------------------------------