### Import Libraries and Data

In [1]:
# Basic imports
import sys
from pathlib import Path
import os
import warnings

# Silence all warnings so cell output stays readable.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings("ignore")


def find_project_root(start, marker="scripts"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook imports from the ``scripts`` package, so the directory holding
    ``scripts/`` is treated as the project root. Searching upward (instead of
    always taking the parent of the working directory) keeps this cell
    idempotent: it gives the same answer whether the kernel's working directory
    is the notebook folder or the project root itself, e.g. after a later
    ``os.chdir``.

    Args:
        start: Absolute ``pathlib.Path`` to begin the search from.
        marker: Name of a sub-directory that identifies the project root.

    Returns:
        The first directory among ``start`` and its ancestors that contains
        ``marker``; falls back to ``start.parent`` (the previous behaviour)
        when no such directory is found.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


project_root = find_project_root(Path().resolve())

# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
# Work from the project root so relative locations such as "data/" resolve on
# any machine, instead of a path that exists only on the author's computer.
# NOTE(review): assumes `project_root` (computed in the first cell) is the
# repository root, i.e. the notebook sits one level below it — confirm.
os.chdir(project_root)
print(os.getcwd())  # Verify the change

/Users/riyanshibohra/Documents/GitHub/metropolitan-climate-profiling


In [3]:
# Import libraries
import pandas as pd
from scripts.data_loader import load_processed_dataset
from scripts.modeling import (
    prepare_data, train_model, evaluate_model, save_model
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [4]:
# Locations of the processed per-city CSV files, all under <working dir>/data
data_folder = Path(".").resolve().joinpath("data")

dallas_path, arlington_path, denton_path = (
    data_folder.joinpath(csv_name)
    for csv_name in (
        "enhanced_dallas_with_uhi.csv",
        "enhanced_arlington_with_uhi.csv",
        "enhanced_denton_with_uhi.csv",
    )
)

In [5]:
# Read the three processed city files, in order, with the shared project loader
dallas, arlington, denton = (
    load_processed_dataset(csv_path)
    for csv_path in (dallas_path, arlington_path, denton_path)
)

In [6]:
# Stack the three city tables into one frame for unified modeling;
# ignore_index=True replaces the per-city indices with a single fresh one
city_frames = [dallas, arlington, denton]
combined_data = pd.concat(objs=city_frames, ignore_index=True)

### Data Preparation

In [7]:
# Build the train/test split and the label encoder for the 'UHI Intensity' target
split = prepare_data(combined_data, target_column='UHI Intensity')
X_train, X_test, y_train, y_test, label_encoder = split

# Report the size of each feature matrix and the encoder's class labels
for caption, features in (
    ("Training Features Shape:", X_train),
    ("Test Features Shape:", X_test),
):
    print(caption, features.shape)
print("Classes:", label_encoder.classes_)

Features shape: (32248, 11)
Features dtypes:
 HourlyDryBulbTemperature     float64
HourlyWetBulbTemperature     float64
HourlyDewPointTemperature    float64
HourlyRelativeHumidity       float64
HourlyPrecipitation          float64
HourlySeaLevelPressure       float64
HourlyStationPressure        float64
HourlyWindSpeed              float64
HourlyWindDirection          float64
Hour                         float64
Month                        float64
dtype: object
Target unique values: ['Medium' 'Low' 'High']
Training Features Shape: (22573, 11)
Test Features Shape: (9675, 11)
Classes: ['High' 'Low' 'Medium']


### Model Training and Evaluation

In [9]:
# Initialize models
# One shared seed keeps the three estimators reproducible and comparable.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used when reporting results.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost argument that newer releases no longer use.
    # NOTE(review): assumes prepare_data already encodes the target as
    # integers (it returns a label encoder) — confirm.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_STATE),
}


=== Random Forest ===
Classification Report:
               precision    recall  f1-score   support

        High       0.94      0.84      0.89       180
         Low       0.97      0.97      0.97      2530
      Medium       0.98      0.99      0.99      6965

    accuracy                           0.98      9675
   macro avg       0.97      0.93      0.95      9675
weighted avg       0.98      0.98      0.98      9675


Confusion Matrix:
 [[ 151    0   29]
 [   0 2452   78]
 [   9   84 6872]]

=== Gradient Boosting ===
Classification Report:
               precision    recall  f1-score   support

        High       0.92      0.82      0.87       180
         Low       0.95      0.98      0.96      2530
      Medium       0.99      0.98      0.98      6965

    accuracy                           0.98      9675
   macro avg       0.95      0.92      0.94      9675
weighted avg       0.98      0.98      0.98      9675


Confusion Matrix:
 [[ 147    0   33]
 [   1 2475   54]
 [  11  1

In [None]:
# Train and evaluate each model
# Keep every fitted estimator, keyed by its display name, so the preferred
# model can be inspected or saved afterwards; previously each one was
# overwritten by the next loop iteration.
trained_models = {}
for model_name, model in models.items():
    print(f"\n=== {model_name} ===")
    trained_model = train_model(model, X_train, y_train)
    evaluate_model(trained_model, X_test, y_test, label_encoder)
    trained_models[model_name] = trained_model

### Model Performance Analysis

After evaluating three different models (Random Forest, Gradient Boosting, and XGBoost) on the UHI Intensity classification task, here's a comparative analysis:

#### XGBoost (Best Performing Model)
- Highest overall accuracy at 99%
- Most balanced performance across all classes
- Best performance for 'High' UHI intensity (93% recall) - crucial for identifying severe urban heat conditions
- Excellent precision and recall for all classes (97% macro average)
- Lowest misclassification rate between classes

#### Random Forest (Second Best)
- Good overall accuracy at 98%
- Strong performance on 'Low' and 'Medium' classes
- Slightly lower performance on 'High' class (84% recall)
- Good balance between precision and recall (macro averages: 97% precision, 93% recall, 95% F1-score)

#### Gradient Boosting
- Similar overall accuracy to Random Forest (98%)
- Lowest performance on 'High' class (82% recall)
- More misclassifications between classes compared to XGBoost
- Slightly lower macro-average F1-score (94%, versus 95% for Random Forest)

#### Conclusion
XGBoost is recommended as the best model for this task because:
1. It shows the most balanced performance across all classes
2. It has the highest recall for the 'High' UHI intensity class, which matters most for identifying severe urban heat conditions
3. It shows the least confusion between different intensity levels
4. It demonstrates the best overall metrics (precision, recall, and F1-score)

In [10]:
# Final cell: prints a fixed marker so the output shows this point was reached.
print("Modeling Complete!")

Modeling Complete!
