# Compiling Wildfire Data

This notebook takes the individual JSONs, each representing a single "feature" (i.e., a wildfire) produced by the Step 1 Notebook, and compiles all relevant fields and calculations into individual CSV files so that they can later be used to produce visualizations.

In [10]:
#This defines all of the packages and constants to be used in the notebook

import os, json, time
from pyproj import Transformer, Geod
from wildfire.Reader import Reader as WFReader
import pandas as pd
import geojson

# Input folder (the per-year feature JSONs from the Step 1 notebook) and the
# output folder that receives the CSVs produced by this notebook.
feature_list_folder = "feature_list/"
output_folder = "common_analysis_data/"

# Clovis coordinates according to Wikipedia: 34.4048° N, 103.2052° W.
# 'latlon' is [latitude, longitude] in decimal degrees (west longitude is negative).
CITY_LOCATIONS = dict(
    clovis=dict(
        city='Clovis',
        latlon=[34.4048, -103.2052],
    ),
)

In [11]:
#To make things easier, I decided to compile the dataset into three separate CSVs.
#I tried to break them up so each one would have a roughly similar amount of JSONs,
#checking the sizes using File Explorer's file counts. The step 1 notebook breaks up
#the data into year-based subfolders, so I used years as my criteria for breaking
#the data up into chunks. The decision to break up the data was mostly to make the individual
#cells' runtimes more manageable.
#
#The first chunk starts in 1963 because the directions in the assignment asked us
#to only look at wildfires that happened between 1963 and 2023.
#
#NOTE: range() excludes its upper bound, so each chunk covers [start, stop - 1].
#The third chunk therefore has to stop at 2024 for the year 2023 to be included.
#Year folders that do not exist are skipped by the cells that consume these ranges.

feature_list_first_chunk = range(1963,2003)   # 1963-2002

feature_list_second_chunk = range(2003,2013)  # 2003-2012

feature_list_third_chunk = range(2013,2024)   # 2013-2023

The below cell was provided by my 512 Professor, David MacDonald, as part of the example code for the assignment (licensed under the Creative Commons licence). It contains helper functions that take in a latitude-longitude point and ring data from a wildfire, and output either the distance between the closest part of the wildfire and that point, or the average distance between the entire wildfire and that point. These calculations will be stored in the CSVs made by this notebook for use in visualizations.

In [12]:
def convert_ring_to_epsg4326(ring_data=None):
    """Transform a ring of ESRI:102008 coordinates into EPSG:4326 coordinates.

    Parameters:
        ring_data: a list of coordinates in ESRI:102008; the first two items of
            each coordinate are passed to the transformer.

    Returns:
        A list with one (lat, lon) tuple in decimal degrees per input coordinate,
        in the same order as the input.
    """
    # A pyproj transformer that converts from ESRI:102008 to EPSG:4326
    to_epsg4326 = Transformer.from_crs("ESRI:102008","EPSG:4326")
    transformed_points = []
    for xy in ring_data:
        # Diagnostic output for entries that are not plain lists; the entry is
        # still handed to the transformer below.
        if not (type(xy) == list):
            print(type(xy))
            print(len(xy))
            print(xy)
        latitude, longitude = to_epsg4326.transform(xy[0], xy[1])
        transformed_points.append((latitude, longitude))
    return transformed_points

def shortest_distance_from_place_to_fire_perimeter(place=None,ring_data=None):
    """Find the perimeter point of a fire that is closest to a place.

    Parameters:
        place: a coordinate point (list or tuple with two items, (lat,lon)) in
            decimal degrees EPSG:4326.
        ring_data: the fire boundary as a list of coordinates; it is converted
            to EPSG:4326 with convert_ring_to_epsg4326() before measuring.

    Returns:
        A two-item list [distance_in_miles, (lat, lon)] describing the closest
        perimeter point, or an empty list when the ring has no points. When
        several points are equally close, the first one encountered is kept.
    """
    # Perimeter expressed as (lat, lon) tuples
    perimeter = convert_ring_to_epsg4326(ring_data)
    # Geodesic calculator on the WGS84 ellipsoid
    wgs84 = Geod(ellps='WGS84')
    best_miles = None
    best_point = None
    for vertex in perimeter:
        # inv() takes lon/lat order; item [2] of its result is the distance,
        # which is then converted to miles
        miles = wgs84.inv(place[1], place[0], vertex[1], vertex[0])[2]*0.00062137
        # Keep the first vertex unconditionally, afterwards only strictly closer ones
        if best_point is None or best_miles > miles:
            best_miles = miles
            best_point = vertex
    if best_point is None:
        return list()
    return [best_miles, best_point]



def average_distance_from_place_to_fire_perimeter(place=None,ring_data=None):
    """Average the distance in miles from a place to the points of a fire perimeter.

    Parameters:
        place: a coordinate point (list or tuple with two items, (lat,lon)) in
            decimal degrees EPSG:4326.
        ring_data: the fire boundary as a list of coordinates; it is converted
            to EPSG:4326 with convert_ring_to_epsg4326() before measuring.

    Returns:
        The mean distance in miles over every perimeter point except the first.

    Raises:
        ZeroDivisionError: when the ring has fewer than two points, because
            nothing is left to average once the first point is dropped.
    """
    perimeter = convert_ring_to_epsg4326(ring_data)
    # Geodesic calculator on the WGS84 ellipsoid
    wgs84 = Geod(ellps='WGS84')
    # inv() takes lon/lat order; item [2] of its result is the distance,
    # which is converted to miles right away
    miles_per_vertex = [
        wgs84.inv(place[1], place[0], vertex[1], vertex[0])[2]*0.00062137
        for vertex in perimeter
    ]
    # The esri polygon shape (the ring) repeats its first coordinate at the end to
    # close the region; skip the first entry so that point is not counted twice.
    miles_without_duplicate = miles_per_vertex[1:]
    return sum(miles_without_duplicate)/len(miles_without_duplicate)


The below three cells are almost identical to each other, with each one processing one of the three "chunks" into a CSV file. The bottom two cells have an exception catcher because a minority of the JSONs in them are structured slightly differently from the rest (an issue that became apparent after a first run of the second cell failed midway).

In [13]:
# Collect the path of every feature JSON that belongs to the first chunk of years.
first_chunk_json_paths = []

for year in feature_list_first_chunk:
    year_folder = feature_list_folder + str(year) + "/in_range/"
    # Years without an "in_range" folder are skipped
    if os.path.exists(year_folder):
        first_chunk_json_paths.extend(year_folder + e for e in os.listdir(year_folder))

print("Number of features in this chunk:")
print(len(first_chunk_json_paths))

# Compute the distance fields for every feature and gather one record per wildfire.
place = CITY_LOCATIONS["clovis"]
my_index = 0
first_chunk_data = []
for wf_feature_path in first_chunk_json_paths:

    if (my_index % 500) == 0:
        print(f"Saved {my_index} features")

    # The context manager closes each file as soon as it has been parsed
    with open(wf_feature_path) as f:
        wf_feature = json.load(f)
    wf_year = wf_feature['attributes']['Fire_Year']
    wf_name = wf_feature['attributes']['Listed_Fire_Names'].split(',')[0]
    wf_size = wf_feature['attributes']['GIS_Acres']
    wf_type = wf_feature['attributes']['Assigned_Fire_Type']
    # Only the first ring of the geometry is used for the distance calculations
    ring_data = wf_feature['geometry']['rings'][0]

    #
    #     Compute the shortest and the average distance to the perimeter
    #
    distance = shortest_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)
    avg_distance = average_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)

    wf_feature_dict  = {}
    # The index is the file name without folder or extension. It is derived from the
    # path itself; the `year` variable left over from the loop above only matches the
    # last year of the chunk, so it cannot be used to strip the folder prefix.
    wf_feature_dict['index'] = os.path.splitext(os.path.basename(wf_feature_path))[0]
    wf_feature_dict['year'] = wf_year
    wf_feature_dict['name'] = wf_name
    wf_feature_dict['size'] = wf_size
    wf_feature_dict['type'] = wf_type
    wf_feature_dict['shortest_distance_dist'] = distance[0]
    # distance[1] is the closest perimeter point as a (lat, lon) tuple
    wf_feature_dict['shortest_distance_x'] = distance[1][0]
    wf_feature_dict['shortest_distance_y'] = distance[1][1]
    wf_feature_dict['average_distance'] = avg_distance

    first_chunk_data.append(wf_feature_dict)

    my_index = my_index + 1

print("Length of data dict")
print(len(first_chunk_data))

first_chunk_df = pd.DataFrame(first_chunk_data)
first_chunk_df.to_csv(output_folder + "first_chunk_df.csv", index = False)

Number of features in this chunk:
46743
Saved 0 features


Saved 500 features


Saved 1000 features


Saved 1500 features


Saved 2000 features


Saved 2500 features


Saved 3000 features


Saved 3500 features


Saved 4000 features


Saved 4500 features


Saved 5000 features


Saved 5500 features


Saved 6000 features


Saved 6500 features


Saved 7000 features


Saved 7500 features


Saved 8000 features


Saved 8500 features


Saved 9000 features


Saved 9500 features


Saved 10000 features


Saved 10500 features


Saved 11000 features


Saved 11500 features


Saved 12000 features


Saved 12500 features


Saved 13000 features


Saved 13500 features


Saved 14000 features


Saved 14500 features


Saved 15000 features


Saved 15500 features


Saved 16000 features


Saved 16500 features


Saved 17000 features


Saved 17500 features


Saved 18000 features


Saved 18500 features


Saved 19000 features


Saved 19500 features


Saved 20000 features


Saved 20500 features


Saved 21000 features


Saved 21500 features


Saved 22000 features


Saved 22500 features


Saved 23000 features


Saved 23500 features


Saved 24000 features


Saved 24500 features


Saved 25000 features


Saved 25500 features


Saved 26000 features


Saved 26500 features


Saved 27000 features


Saved 27500 features


Saved 28000 features


Saved 28500 features


Saved 29000 features


Saved 29500 features


Saved 30000 features


Saved 30500 features


Saved 31000 features


Saved 31500 features


Saved 32000 features


Saved 32500 features


Saved 33000 features


Saved 33500 features


Saved 34000 features


Saved 34500 features


Saved 35000 features


Saved 35500 features


Saved 36000 features


Saved 36500 features


Saved 37000 features


Saved 37500 features


Saved 38000 features


Saved 38500 features


Saved 39000 features


Saved 39500 features


Saved 40000 features


Saved 40500 features


Saved 41000 features


Saved 41500 features


Saved 42000 features


Saved 42500 features


Saved 43000 features


Saved 43500 features


Saved 44000 features


Saved 44500 features


Saved 45000 features


Saved 45500 features


Saved 46000 features


Saved 46500 features


Length of data dict
46743


In [14]:
# Collect the path of every feature JSON that belongs to the second chunk of years.
second_chunk_json_paths = []

for year in feature_list_second_chunk:
    year_folder = feature_list_folder + str(year) + "/in_range/"
    # Years without an "in_range" folder are skipped
    if os.path.exists(year_folder):
        second_chunk_json_paths.extend(year_folder + e for e in os.listdir(year_folder))

print("Number of features in this chunk:")
print(len(second_chunk_json_paths))

# Compute the distance fields for every feature and gather one record per wildfire.
place = CITY_LOCATIONS["clovis"]
my_index = 0
second_chunk_data = []
for wf_feature_path in second_chunk_json_paths:
    try:
        if (my_index % 500) == 0:
            print(f"Saved {my_index} features")

        # The context manager closes each file as soon as it has been parsed
        with open(wf_feature_path) as f:
            wf_feature = json.load(f)
        wf_year = wf_feature['attributes']['Fire_Year']
        wf_name = wf_feature['attributes']['Listed_Fire_Names'].split(',')[0]
        wf_size = wf_feature['attributes']['GIS_Acres']
        wf_type = wf_feature['attributes']['Assigned_Fire_Type']
        # Only the first ring of the geometry is used for the distance calculations
        ring_data = wf_feature['geometry']['rings'][0]

        #
        #     Compute the shortest and the average distance to the perimeter
        #
        distance = shortest_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)
        avg_distance = average_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)

        wf_feature_dict  = {}
        # The index is the file name without folder or extension. It is derived from the
        # path itself; the `year` variable left over from the loop above only matches the
        # last year of the chunk, so it cannot be used to strip the folder prefix.
        wf_feature_dict['index'] = os.path.splitext(os.path.basename(wf_feature_path))[0]
        wf_feature_dict['year'] = wf_year
        wf_feature_dict['name'] = wf_name
        wf_feature_dict['size'] = wf_size
        wf_feature_dict['type'] = wf_type
        wf_feature_dict['shortest_distance_dist'] = distance[0]
        # distance[1] is the closest perimeter point as a (lat, lon) tuple
        wf_feature_dict['shortest_distance_x'] = distance[1][0]
        wf_feature_dict['shortest_distance_y'] = distance[1][1]
        wf_feature_dict['average_distance'] = avg_distance

        second_chunk_data.append(wf_feature_dict)

        my_index = my_index + 1
    except Exception:
        # Some files are structured differently from the rest and cannot be processed here.
        # Print their paths so they can be handled separately. Catching Exception rather
        # than using a bare except keeps Ctrl-C / kernel interrupts working.
        print(wf_feature_path)

print("Length of data dict")
print(len(second_chunk_data))

second_chunk_df = pd.DataFrame(second_chunk_data)
second_chunk_df.to_csv(output_folder + "second_chunk_df.csv", index = False)

Number of features in this chunk:
36565
Saved 0 features


Saved 500 features


Saved 1000 features


Saved 1500 features


Saved 2000 features


Saved 2500 features


Saved 3000 features


Saved 3500 features


Saved 4000 features


Saved 4500 features


Saved 5000 features


Saved 5500 features


Saved 6000 features


Saved 6500 features


Saved 7000 features


Saved 7500 features


Saved 8000 features


Saved 8500 features


Saved 9000 features


Saved 9500 features


Saved 10000 features


Saved 10500 features


Saved 11000 features


Saved 11500 features


Saved 12000 features


feature_list/2007/in_range/109604.json


Saved 12500 features


feature_list/2007/in_range/110223.json


Saved 13000 features


Saved 13500 features


Saved 14000 features


Saved 14500 features


Saved 15000 features


Saved 15500 features


Saved 16000 features


feature_list/2008/in_range/110638.json


Saved 16500 features


Saved 17000 features


feature_list/2008/in_range/111430.json


Saved 17500 features


Saved 18000 features


Saved 18500 features


Saved 19000 features


Saved 19500 features


feature_list/2009/in_range/111775.json


feature_list/2009/in_range/111896.json


Saved 20000 features


feature_list/2009/in_range/112409.json


feature_list/2009/in_range/112414.json


Saved 20500 features


Saved 21000 features


Saved 21500 features


Saved 22000 features


Saved 22500 features


Saved 23000 features


Saved 23500 features


feature_list/2010/in_range/113410.json


Saved 24000 features


feature_list/2010/in_range/113664.json


feature_list/2010/in_range/113737.json


feature_list/2010/in_range/113765.json


feature_list/2010/in_range/113804.json


Saved 24500 features


feature_list/2010/in_range/114308.json


feature_list/2010/in_range/114321.json


Saved 25000 features


Saved 25500 features


Saved 26000 features


Saved 26500 features


Saved 27000 features


Saved 27500 features


Saved 28000 features


Saved 28500 features


Saved 29000 features


Saved 29500 features


Saved 30000 features


Saved 30500 features


Saved 31000 features


Saved 31500 features


Saved 32000 features


Saved 32500 features


feature_list/2012/in_range/115628.json


Saved 33000 features


feature_list/2012/in_range/115973.json


feature_list/2012/in_range/116234.json


Saved 33500 features


Saved 34000 features


Saved 34500 features


Saved 35000 features


Saved 35500 features


Saved 36000 features


Saved 36500 features


Length of data dict
36547


In [15]:
# Collect the path of every feature JSON that belongs to the third chunk of years.
third_chunk_json_paths = []

for year in feature_list_third_chunk:
    year_folder = feature_list_folder + str(year) + "/in_range/"
    # Years without an "in_range" folder are skipped
    if os.path.exists(year_folder):
        third_chunk_json_paths.extend(year_folder + e for e in os.listdir(year_folder))

print("Number of features in this chunk:")
print(len(third_chunk_json_paths))

# Compute the distance fields for every feature and gather one record per wildfire.
place = CITY_LOCATIONS["clovis"]
my_index = 0
third_chunk_data = []
for wf_feature_path in third_chunk_json_paths:
    try:
        if (my_index % 500) == 0:
            print(f"Saved {my_index} features")

        # The context manager closes each file as soon as it has been parsed
        with open(wf_feature_path) as f:
            wf_feature = json.load(f)
        wf_year = wf_feature['attributes']['Fire_Year']
        wf_name = wf_feature['attributes']['Listed_Fire_Names'].split(',')[0]
        wf_size = wf_feature['attributes']['GIS_Acres']
        wf_type = wf_feature['attributes']['Assigned_Fire_Type']
        # Only the first ring of the geometry is used for the distance calculations
        ring_data = wf_feature['geometry']['rings'][0]

        #
        #     Compute the shortest and the average distance to the perimeter
        #
        distance = shortest_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)
        avg_distance = average_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)

        wf_feature_dict  = {}
        # The index is the file name without folder or extension. It is derived from the
        # path itself; the `year` variable left over from the loop above only matches the
        # last year of the chunk, so it cannot be used to strip the folder prefix.
        wf_feature_dict['index'] = os.path.splitext(os.path.basename(wf_feature_path))[0]
        wf_feature_dict['year'] = wf_year
        wf_feature_dict['name'] = wf_name
        wf_feature_dict['size'] = wf_size
        wf_feature_dict['type'] = wf_type
        wf_feature_dict['shortest_distance_dist'] = distance[0]
        # distance[1] is the closest perimeter point as a (lat, lon) tuple
        wf_feature_dict['shortest_distance_x'] = distance[1][0]
        wf_feature_dict['shortest_distance_y'] = distance[1][1]
        wf_feature_dict['average_distance'] = avg_distance

        third_chunk_data.append(wf_feature_dict)

        my_index = my_index + 1
    except Exception:
        # Some files are structured differently from the rest and cannot be processed here.
        # Print their paths so they can be handled separately. Catching Exception rather
        # than using a bare except keeps Ctrl-C / kernel interrupts working.
        print(wf_feature_path)

print("Length of data dict")
print(len(third_chunk_data))

third_chunk_df = pd.DataFrame(third_chunk_data)
third_chunk_df.to_csv(output_folder + "third_chunk_df.csv", index = False)

Number of features in this chunk:
34270
Saved 0 features


Saved 500 features


feature_list/2013/in_range/117085.json


Saved 1000 features


Saved 1500 features


Saved 2000 features


Saved 2500 features


Saved 3000 features


Saved 3500 features


Saved 4000 features


Saved 4500 features


Saved 5000 features


Saved 5500 features


Saved 6000 features


Saved 6500 features


Saved 7000 features


Saved 7500 features


feature_list/2015/in_range/119581.json


feature_list/2015/in_range/119616.json


feature_list/2015/in_range/119750.json


feature_list/2015/in_range/119981.json


Saved 8000 features


feature_list/2015/in_range/120211.json


feature_list/2015/in_range/120430.json


Saved 8500 features


Saved 9000 features


Saved 9500 features


Saved 10000 features


Saved 10500 features


Saved 11000 features


feature_list/2016/in_range/120677.json


Saved 11500 features


feature_list/2016/in_range/120742.json


feature_list/2016/in_range/121009.json


Saved 12000 features


Saved 12500 features


Saved 13000 features


Saved 13500 features


Saved 14000 features


Saved 14500 features


Saved 15000 features


Saved 15500 features


Saved 16000 features


feature_list/2017/in_range/122263.json


feature_list/2017/in_range/122530.json


Saved 16500 features


Saved 17000 features


Saved 17500 features


Saved 18000 features


Saved 18500 features


Saved 19000 features


Saved 19500 features


Saved 20000 features


Saved 20500 features


Saved 21000 features


feature_list/2018/in_range/123760.json


Saved 21500 features


feature_list/2018/in_range/124534.json


Saved 22000 features


feature_list/2018/in_range/125045.json


Saved 22500 features


Saved 23000 features


Saved 23500 features


Saved 24000 features


Saved 24500 features


Saved 25000 features


Saved 25500 features


feature_list/2019/in_range/125744.json


Saved 26000 features


Saved 26500 features


Saved 27000 features


Saved 27500 features


Saved 28000 features


Saved 28500 features


Saved 29000 features


Saved 29500 features


feature_list/2020/in_range/127491.json


Saved 30000 features


Saved 30500 features


Saved 31000 features


Saved 31500 features


Saved 32000 features


Saved 32500 features


Saved 33000 features


Saved 33500 features


Saved 34000 features


Length of data dict
34253


In [None]:
# Feature files that the second and third chunk cells printed as failures,
# grouped by the year folder they live in (order matches the printed output).
_failed_feature_ids_by_year = {
    2007: [109604, 110223],
    2008: [110638, 111430],
    2009: [111775, 111896, 112409, 112414],
    2010: [113410, 113664, 113737, 113765, 113804, 114308, 114321],
    2012: [115628, 115973, 116234],
    2013: [117085],
    2015: [119581, 119616, 119750, 119981, 120211, 120430],
    2016: [120677, 120742, 121009],
    2017: [122263, 122530],
    2018: [123760, 124534, 125045],
    2019: [125744],
    2020: [127491],
}

# Expand the table into the full relative path of every failed file
list_of_failed_files = [
    f"feature_list/{failed_year}/in_range/{failed_id}.json"
    for failed_year, failed_ids in _failed_feature_ids_by_year.items()
    for failed_id in failed_ids
]

In [None]:
#This cell creates a fourth processed CSV with the information from the data files that
#caused errors in the first run. All of those files have their ring data in an object called
#"curveRings" rather than "rings" like in the other files. A curve ring does not consist just
#of coordinate pairs; it mixes coordinate pairs (two number arrays) with objects that commonly
#have a single key, b, and a list of 3 coordinate pairs assigned to that key.
#
#To make these rings usable by the distance helpers, every curve object is replaced by the
#first coordinate pair in its list, so the curved perimeter is approximated by straight
#segments between vertices.
#NOTE(review): per the ArcGIS REST API "Geometry objects" documentation the first entry of a
#curve object is the end point of that curve segment (the remaining entries are control or
#interior points) - confirm before relying on these rows. The resulting distances are an
#approximation, which is why they are written to their own CSV.

def _curve_ring_to_vertices(curve_ring):
    """Return the plain coordinate pairs of a curve ring.

    Plain coordinate entries are kept as they are. Dict entries (curve
    segments) contribute the first coordinate pair of each of their values.
    """
    vertices = []
    for segment in curve_ring:
        if isinstance(segment, dict):
            # e.g. {"b": [[x, y], [x1, y1], [x2, y2]]} -> keep [x, y]
            for curve_definition in segment.values():
                vertices.append(curve_definition[0])
        else:
            vertices.append(segment)
    return vertices

place = CITY_LOCATIONS["clovis"]
my_index = 0
missing_chunk_data = []
for wf_feature_path in list_of_failed_files:
    if (my_index % 500) == 0:
        print(f"Saved {my_index} features")

    # The context manager closes each file as soon as it has been parsed
    with open(wf_feature_path) as f:
        wf_feature = json.load(f)
    wf_year = wf_feature['attributes']['Fire_Year']
    wf_name = wf_feature['attributes']['Listed_Fire_Names'].split(',')[0]
    wf_size = wf_feature['attributes']['GIS_Acres']
    wf_type = wf_feature['attributes']['Assigned_Fire_Type']
    # Only the first curve ring is used, mirroring how the other cells use rings[0]
    ring_data = _curve_ring_to_vertices(wf_feature['geometry']['curveRings'][0])

    #
    #     Compute the shortest and the average distance to the perimeter
    #
    distance = shortest_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)
    avg_distance = average_distance_from_place_to_fire_perimeter(place['latlon'],ring_data)

    wf_feature_dict  = {}
    # The index is the file name without folder or extension, derived from the path itself
    # (a leftover `year` loop variable would not match the folder of most files).
    wf_feature_dict['index'] = os.path.splitext(os.path.basename(wf_feature_path))[0]
    wf_feature_dict['year'] = wf_year
    wf_feature_dict['name'] = wf_name
    wf_feature_dict['size'] = wf_size
    wf_feature_dict['type'] = wf_type
    wf_feature_dict['shortest_distance_dist'] = distance[0]
    # distance[1] is the closest perimeter point as a (lat, lon) tuple
    wf_feature_dict['shortest_distance_x'] = distance[1][0]
    wf_feature_dict['shortest_distance_y'] = distance[1][1]
    wf_feature_dict['average_distance'] = avg_distance

    missing_chunk_data.append(wf_feature_dict)

    my_index = my_index + 1

print("Length of data dict")
print(len(missing_chunk_data))

missing_chunk_df = pd.DataFrame(missing_chunk_data)
missing_chunk_df.to_csv(output_folder + "missing_chunk_df.csv", index = False)
