This script selects 50 random fires from Anna's good polygon IDs and ensures that data exists for each of them, so I can check the 30-day NDSI composites. I will need to run predictions with the combined old, combined monthly, and combined NDSI models on these fires.

In [19]:
import geopandas as gpd
import pandas as pd
import os
import random

# Number of fires wanted in the final shapefile (required IDs count toward it).
TARGET_FIRE_COUNT = 50
# Number of candidate fires gathered from each half before checking for data.
CANDIDATES_PER_HALF = 100
# Only the N largest polygons (by area) are considered as candidates.
LARGEST_FIRES_KEPT = 500


def extract_fire_id(file_name):
    """Return the integer fire ID embedded in a composite file name.

    The ID is the text between the first underscore and the following dot,
    e.g. ``'ndsi_1679.tif'`` -> ``1679``.

    Args:
        file_name: Base name of a file in the composites directory.

    Returns:
        The parsed ID, or ``None`` when the name does not follow the pattern.
    """
    try:
        return int(file_name.split('_')[1].split('.')[0])
    except (ValueError, IndexError):
        return None


def sample_without_required(pool_ids, required_ids, sample_size):
    """Randomly draw IDs from ``pool_ids`` while skipping the required ones.

    Args:
        pool_ids: Candidate IDs to draw from.
        required_ids: IDs that are handled separately and must not be drawn.
        sample_size: Desired number of IDs.

    Returns:
        A list of at most ``sample_size`` IDs. The size is clamped to the
        number of eligible IDs so ``random.sample`` cannot raise ValueError
        when the pool is smaller than requested.
    """
    excluded = set(required_ids)
    eligible = [id_ for id_ in pool_ids if id_ not in excluded]
    return random.sample(eligible, max(0, min(sample_size, len(eligible))))


def select_final_ids(available_ids, required_ids, total):
    """Build the final ID list: every required ID plus a random fill-in.

    Args:
        available_ids: IDs that have data and may be used as fill-in
            (duplicates are ignored).
        required_ids: IDs that are always part of the result.
        total: Desired length of the result.

    Returns:
        A list of unique IDs containing all ``required_ids``. Its length is
        ``total`` whenever enough fill-in IDs exist; it is shorter when the
        pool runs out, and longer only if ``required_ids`` alone exceeds
        ``total``.
    """
    final_ids = list(dict.fromkeys(required_ids))
    already_chosen = set(final_ids)
    fill_pool = [
        id_ for id_ in dict.fromkeys(available_ids) if id_ not in already_chosen
    ]
    fill_count = max(0, total - len(final_ids))
    final_ids.extend(random.sample(fill_pool, min(fill_count, len(fill_pool))))
    return final_ids


# The guard keeps the I/O below from running on import; when executed as a
# script or notebook cell every name assigned here is still a module global.
if __name__ == "__main__":
    # Step 1: Read the shapefile
    shapefile_path = '/explore/nobackup/people/spotter5/cnn_mapping/Russia/good_polys_anna.shp'
    gdf = gpd.read_file(shapefile_path).to_crs(epsg=3413)

    # Step 2: Calculate the area in square meters
    gdf['area_m2'] = gdf.geometry.area

    # Step 3: Keep the largest polygons, largest first.
    # .copy() so the column added in Step 4 is written to an independent frame
    # rather than to a slice of gdf.
    gdf_sorted = (
        gdf.sort_values(by='area_m2', ascending=False)
        .head(LARGEST_FIRES_KEPT)
        .copy()
    )

    # Step 4: Split the GeoDataFrame into northern and southern regions
    # NOTE(review): centroid_y is the projected y coordinate in EPSG:3413, not
    # a geographic latitude, so this median splits along the projection's
    # y axis - confirm that this is the intended north/south split.
    gdf_sorted['centroid_y'] = gdf_sorted.geometry.centroid.y
    median_latitude = gdf_sorted['centroid_y'].median()

    northern_half = gdf_sorted[gdf_sorted['centroid_y'] >= median_latitude]
    southern_half = gdf_sorted[gdf_sorted['centroid_y'] < median_latitude]

    # Step 5: IDs that must be included
    required_ids = [1679, 3138, 3336, 3381, 4518, 5311, 15063, 15255, 15868, 15894, 16086]

    # Materialise the ID columns once instead of once per comprehension element.
    north_ids = northern_half['ID'].tolist()
    south_ids = southern_half['ID'].tolist()
    north_id_set = set(north_ids)
    south_id_set = set(south_ids)

    # Required IDs that fall in each half (a required ID outside the largest
    # polygons is in neither list).
    required_north_ids = [id_ for id_ in required_ids if id_ in north_id_set]
    required_south_ids = [id_ for id_ in required_ids if id_ in south_id_set]

    # Adjust the number of remaining samples to take from each half
    num_south_samples = CANDIDATES_PER_HALF - len(required_south_ids)
    num_north_samples = CANDIDATES_PER_HALF - len(required_north_ids)

    sampled_south_ids = sample_without_required(south_ids, required_ids, num_south_samples)
    sampled_north_ids = sample_without_required(north_ids, required_ids, num_north_samples)

    # Combine the sampled IDs with the required IDs
    sampled_ids_100 = sampled_south_ids + sampled_north_ids + required_south_ids + required_north_ids

    # Step 6: Keep only the candidate fires that have a composite on disk
    ndsi_path = '/explore/nobackup/people/spotter5/cnn_mapping/Russia/anna_ndsi_composites'
    all_tif_files = os.listdir(ndsi_path)

    # Files whose name does not match the pattern parse to None and are dropped.
    candidate_id_set = set(sampled_ids_100)
    selected_tif_files = [
        f for f in all_tif_files if extract_fire_id(f) in candidate_id_set
    ]

    # Required fires are always kept, so flag the ones that have no composite.
    ids_with_data = {extract_fire_id(f) for f in all_tif_files}
    required_without_data = [id_ for id_ in required_ids if id_ not in ids_with_data]
    if required_without_data:
        print(f'Warning: required IDs without an NDSI composite: {required_without_data}')

    # Step 7: Required IDs first, then random fill-in up to the target count.
    # Sampling IDs (not files) and reserving room for the required IDs keeps
    # the result at TARGET_FIRE_COUNT unique fires instead of overshooting.
    ids_with_selected_files = [extract_fire_id(f) for f in selected_tif_files]
    sampled_ids_50 = select_final_ids(ids_with_selected_files, required_ids, TARGET_FIRE_COUNT)

    # Composite files that belong to the final fires.
    final_id_set = set(sampled_ids_50)
    sampled_tif_files_50 = [
        f for f in selected_tif_files if extract_fire_id(f) in final_id_set
    ]

    # Step 8: Filter the shapefile to include only the selected IDs
    filtered_gdf = gdf[gdf['ID'].isin(sampled_ids_50)]
    if len(filtered_gdf) != TARGET_FIRE_COUNT:
        print(f'Warning: selected {len(filtered_gdf)} fires, expected {TARGET_FIRE_COUNT}')

    # Save the new shapefile
    output_shapefile_path = '/explore/nobackup/people/spotter5/cnn_mapping/Russia/anna_good_sampled.shp'
    filtered_gdf.to_file(output_shapefile_path)

In [4]:
filtered_gdf.shape

(50, 4)