In [16]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error
from sklearn.preprocessing import StandardScaler

# In a Jupyter notebook __file__ is not defined — use the current working directory instead
data_dir = Path.cwd() / 'data' / 'processed'
df = pd.read_csv(data_dir / 'GaN2024_enriched.csv', on_bad_lines='skip')

In [2]:
# Convert LightPollutionIndex to numeric (removes 'Out of Range' entries)
df['LightPollutionIndex'] = pd.to_numeric(df['LightPollutionIndex'], errors='coerce')

# Convert CloudCover to numeric
df['CloudCover'] = pd.to_numeric(df['CloudCover'], errors='coerce')

# Remove rows with NaN values in these columns
df = df.dropna(subset=['LightPollutionIndex', 'CloudCover', 'Elevation(m)', 'LimitingMag'])

In [3]:
# Split data into train, validation, and test sets
X = df[['Elevation(m)', 'CloudCover', 'SQMReading', 'LightPollutionIndex']]
y = df['LimitingMag']

# First split: 80% train+val, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
# RandomForest hyperparameter search
rf_param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 25, None],
    'min_samples_split': [2, 10],
}

print("Optimizing RandomForest...")
rf_search = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=-1),
    param_distributions=rf_param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
rf_search.fit(X_train, y_train)
rf_test_pred = rf_search.best_estimator_.predict(X_test)

Optimizing RandomForest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [19]:
# Evaluate both models on validation set
print("\n" + "=" * 60)
print("HYPERPARAMETER OPTIMIZATION RESULTS")
print("=" * 60)
print("\nRandomForest Best Parameters:")
print(rf_search.best_params_)
print(f"Test R²: {r2_score(y_test, rf_test_pred):.4f}")


HYPERPARAMETER OPTIMIZATION RESULTS

RandomForest Best Parameters:
{'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 15}
Test R²: 0.5327


In [20]:
# RandomForest Feature Importance
print("\nFeature Importance (RandomForest):")
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_search.best_estimator_.feature_importances_
}).sort_values('Importance', ascending=False)
print(feature_importance.to_string(index=False))


Feature Importance (RandomForest):
            Feature  Importance
         SQMReading    0.524272
       Elevation(m)    0.245879
         CloudCover    0.131318
LightPollutionIndex    0.098531


In [21]:
import joblib

# Save the final optimized model and scaler
model_path = Path.cwd() / 'best_model.joblib'

joblib.dump(rf_search.best_estimator_, model_path)

print(f"✓ Model saved to: {model_path}")

✓ Model saved to: c:\Users\tolar\code\AI-for-Star-viewing\best_model.joblib
