In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, classification_report


In [2]:
# Read the CSV file (path is resolved relative to the current working directory).
data = pd.read_csv(filepath_or_buffer="real_estate_data.csv")

# Preview the top five rows as the cell's rich output.
data.head(n=5)


Unnamed: 0,customer_id,total_payments,successful_payments,session_start_time,session_end_time,time_on_site,pages_viewed,income,age,property_type,target_column
0,1,10,8,0,60,45,12,60000,35,Apartment,1
1,2,5,5,0,30,25,8,45000,40,House,0
2,3,20,19,0,120,90,25,80000,28,Villa,1
3,4,15,13,0,90,70,18,75000,50,Apartment,0
4,5,8,7,0,45,35,10,50000,32,House,1


In [3]:
# Derived columns, added to `data` in this order so the resulting column layout is stable:
#   session_duration = session_end_time - session_start_time
#   payment_failures = total_payments - successful_payments
#   engagement_score = time_on_site * pages_viewed
data['session_duration'] = data['session_end_time'].sub(data['session_start_time'])
data['payment_failures'] = data['total_payments'].sub(data['successful_payments'])
data['engagement_score'] = data['time_on_site'].mul(data['pages_viewed'])

# Separate the label from the model inputs. The identifier, the two raw session
# timestamps (already summarised by session_duration) and the label itself are
# excluded from the feature matrix.
target = data["target_column"]
features = data.drop(
    columns=[
        "customer_id",
        "session_start_time",
        "session_end_time",
        "target_column",
    ]
)

# Preview the engineered feature matrix.
features.head()


Unnamed: 0,total_payments,successful_payments,time_on_site,pages_viewed,income,age,property_type,session_duration,payment_failures,engagement_score
0,10,8,45,12,60000,35,Apartment,60,2,540
1,5,5,25,8,45000,40,House,30,0,200
2,20,19,90,25,80000,28,Villa,120,1,2250
3,15,13,70,18,75000,50,Apartment,90,2,1260
4,8,7,35,10,50000,32,House,45,1,350


In [4]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = features.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Candidate classifiers, keyed by the display name used when reporting results.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# For every candidate: chain the shared preprocessor with the classifier, fit on
# the training split, and score on the held-out split.
results = {}
for model_name, model in models.items():
    pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)

    # Hard class predictions.
    y_pred = pipeline.predict(X_test)

    # Positive-class probabilities (column 1) and ROC-AUC are only computed when
    # the classifier exposes predict_proba; otherwise both stay None.
    if hasattr(pipeline.named_steps['classifier'], 'predict_proba'):
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_prob)
    else:
        y_pred_prob = None
        roc_auc = None

    f1 = f1_score(y_test, y_pred, average='binary')
    report = classification_report(y_test, y_pred, output_dict=True)

    results[model_name] = {
        'ROC-AUC': roc_auc,
        'F1-Score': f1,
        'Classification Report': report,
    }


In [7]:
# Print the metrics collected in `results`, one section per model.
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")

    # Test against None explicitly: ROC-AUC is stored as None when it could not
    # be computed, but a genuine score of 0.0 is falsy and a plain truthiness
    # check would misreport it as "Not Available".
    roc_auc_value = metrics['ROC-AUC']
    if roc_auc_value is not None:
        print(f"ROC-AUC: {roc_auc_value:.2f}")
    else:
        print("ROC-AUC: Not Available")

    print(f"F1-Score: {metrics['F1-Score']:.2f}")

    # The report was stored as a dict (output_dict=True); transpose so that each
    # class / average is a row and each metric a column.
    print("Classification Report:")
    print(pd.DataFrame(metrics['Classification Report']).transpose())



Model: Logistic Regression
ROC-AUC: 1.00
F1-Score: 1.00
Classification Report:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0      2.0
1                   1.0     1.0       1.0      2.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0      4.0
weighted avg        1.0     1.0       1.0      4.0

Model: Random Forest
ROC-AUC: 1.00
F1-Score: 1.00
Classification Report:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0      2.0
1                   1.0     1.0       1.0      2.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0      4.0
weighted avg        1.0     1.0       1.0      4.0

Model: Gradient Boosting
ROC-AUC: 1.00
F1-Score: 1.00
Classification Report:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0      2.0
1                   1.0     1.0       1.0      2.0
acc