In [13]:
import os
import rasterio
import numpy as np
import pandas as pd
import re

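This notebook computes, for each cluster folder, the annual median nighttime lights (NTL) intensity from its monthly VIIRS images, then bins the clusters into three brightness classes (0 = dark, 1 = dim, 2 = bright) and saves the labels to a CSV file.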

In [14]:
# ==========================================
# 1. SETUP
# ==========================================
# Root folder holding one sub-folder of VIIRS GeoTIFFs per cluster.
# Set the VIIRS_BASE_DIR environment variable to run the notebook on another
# machine without editing it; the original local path remains the default.
base_dir = os.environ.get(
    "VIIRS_BASE_DIR",
    "/Users/ruben/Desktop/Thesis/TrainingData/VIIRS-PhilSA-50Sample",
)

# CSV that receives one row per cluster (path is relative to the working directory).
output_csv = "viirs_ntl_labels_sample50.csv"

# Accumulator for the per-cluster records that later cells turn into a DataFrame.
results = []

print("Scanning folders...")

Scanning folders...


In [None]:
# ==========================================
# 2. LOOP WITH EXTRACTION
# ==========================================
# Folder names end in "VIIRS_PH2022" followed by a zero-padded cluster number;
# the capture group is that number with its leading zeros stripped.
CLUSTER_ID_PATTERN = re.compile(r'VIIRS_PH20220*(\d+)$')

# Matched case-insensitively so ".TIF" / ".tiff" files are not silently ignored.
TIF_EXTENSIONS = (".tif", ".tiff")


def _chip_mean(file_path):
    """Return the mean of the valid band-1 pixels of one raster, or None.

    Pixels covered by the dataset's nodata mask and non-finite pixels (NaN/inf)
    are excluded, so a single bad pixel cannot distort or discard the whole
    image. Returns None when no valid pixel remains.

    Raises whatever rasterio raises when the file cannot be opened or read.
    """
    with rasterio.open(file_path) as src:
        # masked=True applies the dataset's nodata value / mask to the array.
        band = src.read(1, masked=True)

    valid_pixels = np.ma.masked_invalid(band).compressed()
    if valid_pixels.size == 0:
        return None
    # Works for any chip size (e.g. 1x1 or 5x5 pixels).
    return valid_pixels.mean()


# Start from an empty list so re-running this cell cannot append duplicate rows.
results = []

# sorted() makes the processing (and therefore output) order reproducible;
# os.listdir returns entries in arbitrary, filesystem-dependent order.
for folder_name in sorted(os.listdir(base_dir)):
    cluster_path = os.path.join(base_dir, folder_name)

    # Skip if it's not a folder (e.g., .DS_Store)
    if not os.path.isdir(cluster_path):
        continue

    # Extract the cluster ID from the folder name using regex
    match = CLUSTER_ID_PATTERN.search(folder_name)
    if not match:
        print(f"Skipping {folder_name}: No ID found at end.")
        continue
    cluster_id = int(match.group(1))

    # ==========================================
    # 3. CALCULATE MEDIAN
    # ==========================================
    # One mean value per readable image in this cluster folder.
    cluster_values = []

    for file in sorted(os.listdir(cluster_path)):
        if not file.lower().endswith(TIF_EXTENSIONS):
            continue

        file_path = os.path.join(cluster_path, file)
        try:
            avg_val = _chip_mean(file_path)
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run.
            print(f"Error reading {file}: {e}")
            continue

        # Filter out error values (negative numbers are common noise in VIIRS)
        if avg_val is not None and avg_val >= 0:
            cluster_values.append(avg_val)

    if cluster_values:
        # We use Median to avoid outliers (like temporary fires or stray light)
        annual_median = np.median(cluster_values)

        results.append({
            'DHSCLUST': cluster_id,
            'NTL_Value': annual_median
        })
    else:
        # Report dropped clusters instead of letting them vanish silently.
        print(f"Skipping {folder_name}: no valid image values found.")

In [None]:
# ==========================================
# 4. GENERATE CLASSES & SAVE
# ==========================================
if results:
    # Sort by cluster id so the CSV row order (and the tie-breaking in the
    # fallback below) does not depend on the order folders were listed in.
    df = (
        pd.DataFrame(results)
        .sort_values('DHSCLUST', kind='stable')
        .reset_index(drop=True)
    )

    # Create 3 Classes (0=Dark, 1=Dim, 2=Bright)
    # qcut ensures we have roughly equal training data for each class
    class_labels = [0, 1, 2]
    try:
        df['NTL_Class'] = pd.qcut(df['NTL_Value'], q=3, labels=class_labels)
    except ValueError:
        # qcut raises "Bin edges must be unique" when many clusters share the
        # same value (e.g. several at 0.0) and two tercile edges coincide.
        # Binning the ranks instead always yields distinct edges for two or
        # more clusters; tied values are split in row order. A single cluster
        # still cannot be split into terciles and will raise here.
        print("Tied NTL values produced duplicate bin edges; binning by rank instead.")
        df['NTL_Class'] = pd.qcut(
            df['NTL_Value'].rank(method='first'), q=3, labels=class_labels
        )

    # Save to CSV
    df.to_csv(output_csv, index=False)

    print("Label generation complete.")
    print(f"Total Clusters Processed: {len(df)}")
    print("Class Distribution:")
    print(df['NTL_Class'].value_counts())
    print(f"Labels saved to: {output_csv}")

else:
    print("No data found. Please check your 'base_dir' path.")

------------------------------
SUCCESS: Label generation complete.
Total Clusters Processed: 43
Class Distribution:
NTL_Class
2    15
0    14
1    14
Name: count, dtype: int64
Labels saved to: viirs_ntl_labels_sample50.csv
------------------------------
