# Dataset Creation

This notebook contains the code used to create the dataset used in the thesis. Additionally

In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import numpy as np 
import matplotlib.dates as mdates
import os
import datetime
import xarray as xr
from calendar import monthrange
import rioxarray
from functions import *
import geopandas as gpd
from shapely.geometry import shape, Point

# https://github.com/pysal/pysal

# Loading Data
## CA Fires

In [2]:
df = pd.read_csv("mapdataall.csv")
df["acres_decimal"] = [convert_acres_degrees(df.at[i,"incident_acres_burned"], df.at[i, "incident_latitude"]) for i in range(len(df))]
df["incident_date_created"] = pd.to_datetime(df["incident_date_created"].fillna(df["incident_date_extinguished"]))
df["ex_date"] = pd.to_datetime(df["incident_date_created"])
df = df.sort_values('ex_date', ascending=True)
df = df.set_index("ex_date")
df = df.loc['2016-01-01':'2022-12-31']
wildfire_monthly_acres_df = df[["incident_acres_burned"]].resample('M').sum()

# Prescribed Burns in California

In [3]:
# loading prescribed burn data
gdf = gpd.read_file("California_Fire_Perimeters_(all).geojson")
gdf["lon_center"] = gdf["geometry"].centroid.x
gdf["lat_center"] = gdf["geometry"].centroid.y
gdf["acres_decimal"] = [convert_acres_degrees(gdf.at[i,"TREATED_AC"], gdf.at[i, "lat_center"]) for i in range(len(gdf))]

gdf = gdf.fillna(0)

#gdf = gdf.sort_values('START_DATE', ascending=True)
gdf["START_DATE"] = pd.to_datetime(gdf["START_DATE"], errors = 'coerce')
gdf = gdf.sort_values('START_DATE', ascending=True)
gdf = gdf.set_index("START_DATE")
gdf_mon = gdf[["TREATED_AC"]].resample('M').sum()
gdf_mon = gdf_mon.loc['2015-01-01':'2022-12-31']


  gdf["lon_center"] = gdf["geometry"].centroid.x

  gdf["lat_center"] = gdf["geometry"].centroid.y


In [4]:
# feature engineering

# prescribed acres in past 3 months
gdf_mon["acres_3_month"] = [gdf_mon.loc[i - pd.DateOffset(months=4):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 6 months
gdf_mon["acres_6_month"] = [gdf_mon.loc[i - pd.DateOffset(months=7):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 9 months
gdf_mon["acres_9_month"] = [gdf_mon.loc[i - pd.DateOffset(months=10):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 12 months
gdf_mon["acres_12_month"] = [gdf_mon.loc[i - pd.DateOffset(months=13):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]

In [5]:
prescribed_monthly = gdf_mon.loc['2016-01-01':'2022-12-31']

# Climate Data

In [6]:
soil = get_monthly_soil(r"C:\Users\sequo\OneDrive\Desktop\thesis\soil moisture\soilw.mon.mean.nc").loc['2016-01-01':'2022-12-31']

In [7]:
# temp avg
temp_avg = pd.read_csv("avg_temp.csv", header=4)
temp_avg["date_clean"] = [datetime.datetime(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]), monthrange(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]))[1]) for i in range(len(temp_avg))]
temp_avg = temp_avg.set_index("date_clean").drop(["Date"], axis=1)
temp_avg.columns = ["temp_avg_"+col for col in temp_avg.columns]
temp_avg = temp_avg.loc['2016-01-01':'2022-12-31']

In [8]:
# temp min
temp_min = pd.read_csv("temp_min.csv", header=4)
temp_min["date_clean"] = [datetime.datetime(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]), monthrange(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]))[1]) for i in range(len(temp_min))]
temp_min = temp_min.set_index("date_clean").drop(["Date"], axis=1)
temp_min.columns = ["temp_min_"+col for col in temp_min.columns]
temp_min = temp_min.loc['2016-01-01':'2022-12-31']

In [9]:
# temp max
temp_max = pd.read_csv("temp_max.csv", header=4)
temp_max["date_clean"] = [datetime.datetime(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]), monthrange(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]))[1]) for i in range(len(temp_max))]
temp_max = temp_max.set_index("date_clean").drop(["Date"], axis=1)
temp_max.columns = ["temp_max_"+col for col in temp_max.columns]
temp_max = temp_max.loc['2016-01-01':'2022-12-31']

In [10]:
# precip
precip_new = pd.read_csv("precip_new.csv", header=4)
precip_new["date_clean"] = [datetime.datetime(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]), monthrange(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]))[1]) for i in range(len(precip_new))]
precip_new = precip_new.set_index("date_clean").drop(["Date"], axis=1)
precip_new.columns = ["precip_new_"+col for col in precip_new.columns]
precip_new = precip_new.loc['2016-01-01':'2022-12-31']

In [11]:
# phdi
phdi = pd.read_csv("phdi.csv", header=3)
phdi["date_clean"] = [datetime.datetime(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]), monthrange(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]))[1]) for i in range(len(phdi))]
phdi = phdi.set_index("date_clean").drop(["Date"], axis=1)
phdi.columns = ["phdi_"+col for col in phdi.columns]
phdi = phdi.loc['2016-01-01':'2022-12-31']

In [12]:
# pdsi
pdsi = pd.read_csv("pdsi.csv", header=3)
pdsi["date_clean"] = [datetime.datetime(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]), monthrange(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]))[1]) for i in range(len(pdsi))]
pdsi = pdsi.set_index("date_clean").drop(["Date"], axis=1)
pdsi.columns = ["pdsi_"+col for col in pdsi.columns]
pdsi = pdsi.loc['2016-01-01':'2022-12-31']

In [13]:
# pmdi
pmdi = pd.read_csv("pmdi.csv", header=3)
pmdi["date_clean"] = [datetime.datetime(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]), monthrange(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]))[1]) for i in range(len(pmdi))]
pmdi = pmdi.set_index("date_clean").drop(["Date"], axis=1)
pmdi.columns = ["pmdi_"+col for col in pmdi.columns]
pmdi = pmdi.loc['2016-01-01':'2022-12-31']

In [14]:
# pzi
pzi = pd.read_csv("pzi.csv", header=3)
pzi["date_clean"] = [datetime.datetime(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]), monthrange(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]))[1]) for i in range(len(pzi))]
pzi = pzi.set_index("date_clean").drop(["Date"], axis=1)
pzi.columns = ["pzi_"+col for col in pzi.columns]
pzi = pzi.loc['2016-01-01':'2022-12-31']

In [15]:
# cooling days
cooling = pd.read_csv("cooling.csv", header=4)
cooling["date_clean"] = [datetime.datetime(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]), monthrange(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]))[1]) for i in range(len(cooling))]
cooling = cooling.set_index("date_clean").drop(["Date"], axis=1)
cooling.columns = ["cooling_days"+col for col in cooling.columns]
cooling = cooling.loc['2016-01-01':'2022-12-31']

In [16]:
# heating days
heating = pd.read_csv("heating.csv", header=4)
heating["date_clean"] = [datetime.datetime(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]), monthrange(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]))[1]) for i in range(len(heating))]
heating = heating.set_index("date_clean").drop(["Date"], axis=1)
heating.columns = ["heating_days"+col for col in heating.columns]
heating = heating.loc['2016-01-01':'2022-12-31']

In [17]:
pd.concat([wildfire_monthly_acres_df.tz_localize(None), prescribed_monthly.tz_localize(None), 
           soil.tz_localize(None), temp_avg.tz_localize(None), temp_min.tz_localize(None), 
           temp_max.tz_localize(None), precip_new.tz_localize(None), pdsi.tz_localize(None), 
           phdi.tz_localize(None), pmdi.tz_localize(None), pzi.tz_localize(None), 
           cooling.tz_localize(None), heating.tz_localize(None)], axis=1).fillna(0).to_csv("wildfire_data.csv")

In [18]:
pd.read_csv("wildfire_data.csv", index_col=0).columns

Index(['incident_acres_burned', 'TREATED_AC', 'acres_3_month', 'acres_6_month',
       'acres_9_month', 'acres_12_month', 'soil moisture', 'temp_avg_Value',
       'temp_avg_Anomaly', 'temp_min_Value', 'temp_min_Anomaly',
       'temp_max_Value', 'temp_max_Anomaly', 'precip_new_Value',
       'precip_new_Anomaly', 'pdsi_Value', 'pdsi_Anomaly', 'phdi_Value',
       'phdi_Anomaly', 'pmdi_Value', 'pmdi_Anomaly', 'pzi_Value',
       'pzi_Anomaly', 'cooling_daysValue', 'cooling_daysAnomaly',
       'heating_daysValue', 'heating_daysAnomaly'],
      dtype='object')

# County Datasets

## Tulare - Sequoia National Park

In [19]:
county_boundary = gpd.read_file("California_County_Boundaries.geojson")

In [20]:
county_geo = county_boundary.loc[county_boundary["COUNTY_NAME"]=="Tulare"]["geometry"].reset_index(drop=True).at[0]
gdf_indices = []
county = shape(county_geo)

gdf = gdf.reset_index()

for i in range(len(gdf)):
    point = Point(gdf.at[i,"lon_center"], gdf.at[i,"lat_center"])
    if county.contains(point):
        gdf_indices.append(i)
        
county_gdf = gdf.iloc[gdf_indices][:].set_index("START_DATE")
county_gdf = county_gdf.loc['2015-01-01':'2022-12-31']
gdf_mon = county_gdf[["TREATED_AC"]].resample('M').sum()

In [21]:
# prescribed acres in past 3 months
gdf_mon["acres_3_month"] = [gdf_mon.loc[i - pd.DateOffset(months=4):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 6 months
gdf_mon["acres_6_month"] = [gdf_mon.loc[i - pd.DateOffset(months=7):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 9 months
gdf_mon["acres_9_month"] = [gdf_mon.loc[i - pd.DateOffset(months=10):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 12 months
gdf_mon["acres_12_month"] = [gdf_mon.loc[i - pd.DateOffset(months=13):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
prescribed_monthly = gdf_mon.resample('M').sum().loc['2016-01-01':'2022-12-31']

In [22]:
df = df.reset_index()
df_indices = []

for i in range(len(df)):
    point = Point(df.at[i,"incident_longitude"], df.at[i,"incident_latitude"])
    if county.contains(point):
        df_indices.append(i)

county_df = df.iloc[df_indices][:].set_index("ex_date")
monthly_acres_df = county_df[["incident_acres_burned"]].resample('M').sum()

In [23]:
names = ["wild_fire", "prescribed_fire"]
for i, A in enumerate([county_df, gdf.drop(["geometry"],axis=1)]):
    A.to_csv("tulare"+names[i]+".csv")

In [24]:
soil = get_monthly_soil(r"C:\Users\sequo\OneDrive\Desktop\thesis\soil moisture\soilw.mon.mean.nc", county="Tulare").loc['2016-01-01':'2022-12-31']

In [25]:
# temp avg
temp_avg = pd.read_csv("tulare/tulare_avg.csv", header=4)
temp_avg["date_clean"] = [datetime.datetime(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]), monthrange(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]))[1]) for i in range(len(temp_avg))]
temp_avg = temp_avg.set_index("date_clean").drop(["Date"], axis=1)
temp_avg.columns = ["temp_avg_"+col for col in temp_avg.columns]
temp_avg = temp_avg.loc['2016-01-01':'2022-12-31']

In [26]:
# temp min
temp_min = pd.read_csv("tulare/tulare_min.csv", header=4)
temp_min["date_clean"] = [datetime.datetime(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]), monthrange(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]))[1]) for i in range(len(temp_min))]
temp_min = temp_min.set_index("date_clean").drop(["Date"], axis=1)
temp_min.columns = ["temp_min_"+col for col in temp_min.columns]
temp_min = temp_min.loc['2016-01-01':'2022-12-31']

In [27]:
# temp max
temp_max = pd.read_csv("tulare/tulare_max.csv", header=4)
temp_max["date_clean"] = [datetime.datetime(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]), monthrange(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]))[1]) for i in range(len(temp_max))]
temp_max = temp_max.set_index("date_clean").drop(["Date"], axis=1)
temp_max.columns = ["temp_max_"+col for col in temp_max.columns]
temp_max = temp_max.loc['2016-01-01':'2022-12-31']

In [28]:
# precip
precip_new = pd.read_csv("tulare/tulare_precip.csv", header=4)
precip_new["date_clean"] = [datetime.datetime(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]), monthrange(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]))[1]) for i in range(len(precip_new))]
precip_new = precip_new.set_index("date_clean").drop(["Date"], axis=1)
precip_new.columns = ["precip_new_"+col for col in precip_new.columns]
precip_new = precip_new.loc['2016-01-01':'2022-12-31']

In [29]:
# phdi
phdi = pd.read_csv("tulare/tulare_phdi.csv", header=3)
phdi["date_clean"] = [datetime.datetime(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]), monthrange(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]))[1]) for i in range(len(phdi))]
phdi = phdi.set_index("date_clean").drop(["Date"], axis=1)
phdi.columns = ["phdi_"+col for col in phdi.columns]
phdi = phdi.loc['2016-01-01':'2022-12-31']

In [30]:
# pdsi
pdsi = pd.read_csv("tulare/tulare_pdsi.csv", header=3)
pdsi["date_clean"] = [datetime.datetime(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]), monthrange(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]))[1]) for i in range(len(pdsi))]
pdsi = pdsi.set_index("date_clean").drop(["Date"], axis=1)
pdsi.columns = ["pdsi_"+col for col in pdsi.columns]
pdsi = pdsi.loc['2016-01-01':'2022-12-31']

In [31]:
# pmdi
pmdi = pd.read_csv("tulare/tulare_pmdi.csv", header=3)
pmdi["date_clean"] = [datetime.datetime(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]), monthrange(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]))[1]) for i in range(len(pmdi))]
pmdi = pmdi.set_index("date_clean").drop(["Date"], axis=1)
pmdi.columns = ["pmdi_"+col for col in pmdi.columns]
pmdi = pmdi.loc['2016-01-01':'2022-12-31']

In [32]:
# pzi
pzi = pd.read_csv("tulare/tulare_pzi.csv", header=3)
pzi["date_clean"] = [datetime.datetime(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]), monthrange(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]))[1]) for i in range(len(pzi))]
pzi = pzi.set_index("date_clean").drop(["Date"], axis=1)
pzi.columns = ["pzi_"+col for col in pzi.columns]
pzi = pzi.loc['2016-01-01':'2022-12-31']

In [33]:
# cooling days
cooling = pd.read_csv("tulare/tulare_cooling.csv", header=4)
cooling["date_clean"] = [datetime.datetime(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]), monthrange(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]))[1]) for i in range(len(cooling))]
cooling = cooling.set_index("date_clean").drop(["Date"], axis=1)
cooling.columns = ["cooling_days"+col for col in cooling.columns]
cooling = cooling.loc['2016-01-01':'2022-12-31']

In [34]:
# heating days
heating = pd.read_csv("tulare/tulare_heating.csv", header=4)
heating["date_clean"] = [datetime.datetime(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]), monthrange(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]))[1]) for i in range(len(heating))]
heating = heating.set_index("date_clean").drop(["Date"], axis=1)
heating.columns = ["heating_days"+col for col in heating.columns]
heating = heating.loc['2016-01-01':'2022-12-31']

In [35]:
pd.concat([monthly_acres_df.tz_localize(None), prescribed_monthly.tz_localize(None), 
           soil.tz_localize(None), temp_avg.tz_localize(None), temp_min.tz_localize(None), 
           temp_max.tz_localize(None), precip_new.tz_localize(None), pdsi.tz_localize(None), 
           phdi.tz_localize(None), pmdi.tz_localize(None), pzi.tz_localize(None), 
           cooling.tz_localize(None), heating.tz_localize(None)], axis=1).fillna(0).to_csv("tulare_data.csv")

## Humboldt - Redwoods National Park

In [36]:
county_geo = county_boundary.loc[county_boundary["COUNTY_NAME"]=="Humboldt"]["geometry"].reset_index(drop=True).at[0]
gdf_indices = []
county = shape(county_geo)

gdf = gdf.reset_index()

for i in range(len(gdf)):
    point = Point(gdf.at[i,"lon_center"], gdf.at[i,"lat_center"])
    if county.contains(point):
        gdf_indices.append(i)

county_gdf = gdf.iloc[gdf_indices][:].set_index("START_DATE")
county_gdf = county_gdf.loc['2015-01-01':'2022-12-31']
gdf_mon = county_gdf[["TREATED_AC"]].resample('M').sum()

In [37]:
county_gdf

Unnamed: 0_level_0,index,OBJECTID,YEAR_,STATE,AGENCY,UNIT_ID,TREATMENT_ID,TREATMENT_NAME,TREATMENT_TYPE,END_DATE,...,GIS_ACRES,RX_CONSUM,PRE_CON_CLASS,POST_CON_CLASS,SHAPE_Length,SHAPE_Area,geometry,lon_center,lat_center,acres_decimal
START_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-10-03 00:00:00+00:00,5450,7064,2015,CA,PVT,KWC,3613,Sims Gulch,1.0,2015-10-04T00:00:00+00:00,...,63.864250,0.0,0.0,0.0,0.021317,0.000028,"MULTIPOLYGON (((-123.55504 41.31757, -123.5539...",-123.554286,41.315268,0.003434
2015-10-06 00:00:00+00:00,5452,7087,2015,CA,PVT,KWC,3624,Tshanik,1.0,2015-10-06T00:00:00+00:00,...,90.663704,0.0,0.0,0.0,0.037354,0.000039,"MULTIPOLYGON (((-123.54648 41.29801, -123.5468...",-123.555196,41.293563,0.004084
2015-10-06 00:00:00+00:00,5454,7085,2015,CA,PVT,KWC,3612,Beacon Flat,1.0,2015-10-06T00:00:00+00:00,...,24.806728,0.0,0.0,0.0,0.019859,0.000011,"MULTIPOLYGON (((-123.53341 41.31515, -123.5330...",-123.533609,41.313263,0.002137
2015-10-08 00:00:00+00:00,5459,7067,2015,CA,PVT,KWC,3621,Ramsland,1.0,2015-10-08T00:00:00+00:00,...,5.455334,0.0,0.0,0.0,0.006681,0.000002,"MULTIPOLYGON (((-123.52039 41.35154, -123.5205...",-123.520364,41.352184,0.001007
2015-10-09 00:00:00+00:00,5461,7089,2015,CA,PVT,KWC,3626,Gilkison Lower,1.0,2015-10-09T00:00:00+00:00,...,4.564350,0.0,0.0,0.0,0.006200,0.000002,"MULTIPOLYGON (((-123.60303 41.25582, -123.6031...",-123.603227,41.257131,0.000920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-27 00:00:00+00:00,8397,15731,2022,CA,CSP,119,041,Broadcast Burn,1.0,2022-11-02T00:00:00+00:00,...,39.355888,0.0,0.0,0.0,0.053013,0.000017,"MULTIPOLYGON (((-124.01623 40.35930, -124.0161...",-124.013029,40.355962,0.002654
2022-10-27 00:00:00+00:00,8398,15734,2022,CA,CSP,119,037,Broadcast Burn,1.0,2022-11-02T00:00:00+00:00,...,19.854837,0.0,0.0,0.0,0.013758,0.000009,"MULTIPOLYGON (((-124.02784 40.37103, -124.0277...",-124.028180,40.368420,0.001885
2022-10-27 00:00:00+00:00,8399,15737,2022,CA,CSP,119,043,Broadcast Burn,1.0,2022-11-02T00:00:00+00:00,...,44.108467,0.0,0.0,0.0,0.074225,0.000019,"MULTIPOLYGON (((-124.02167 40.35812, -124.0216...",-124.019502,40.355180,0.002810
2022-10-27 00:00:00+00:00,8400,15738,2022,CA,CSP,119,040,Broadcast Burn,1.0,2022-11-02T00:00:00+00:00,...,32.861622,0.0,0.0,0.0,0.040817,0.000014,"MULTIPOLYGON (((-124.02736 40.36098, -124.0273...",-124.026197,40.363296,0.002425


In [38]:
# prescribed acres in past 3 months
gdf_mon["acres_3_month"] = [gdf_mon.loc[i - pd.DateOffset(months=4):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 6 months
gdf_mon["acres_6_month"] = [gdf_mon.loc[i - pd.DateOffset(months=7):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 9 months
gdf_mon["acres_9_month"] = [gdf_mon.loc[i - pd.DateOffset(months=10):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
# prescribed acres in past 12 months
gdf_mon["acres_12_month"] = [gdf_mon.loc[i - pd.DateOffset(months=13):i- pd.DateOffset(months=1)]["TREATED_AC"].sum() for i in gdf_mon.index]
prescribed_monthly = gdf_mon.loc['2016-01-01':'2022-12-31']

In [39]:
df = df.reset_index()
df_indices = []

for i in range(len(df)):
    point = Point(df.at[i,"incident_longitude"], df.at[i,"incident_latitude"])
    if county.contains(point):
        df_indices.append(i)
        
county_df = df.iloc[df_indices][:].set_index("ex_date")
monthly_acres_df = county_df[["incident_acres_burned"]].resample('M').sum()

In [40]:
names = ["wild_fire", "prescribed_fire"]
for i, A in enumerate([county_df, gdf]):
    A.to_csv("humboldt"+names[i]+".csv")

In [41]:
soil = get_monthly_soil(r"C:\Users\sequo\OneDrive\Desktop\thesis\soil moisture\soilw.mon.mean.nc", county="Humboldt").loc['2016-01-01':'2022-12-31']

In [42]:
# temp avg
temp_avg = pd.read_csv("humbolt/h_temp_avg.csv", header=4)
temp_avg["date_clean"] = [datetime.datetime(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]), monthrange(int(str(temp_avg.at[i,"Date"])[0:4]), int(str(temp_avg.at[i,"Date"])[4:]))[1]) for i in range(len(temp_avg))]
temp_avg = temp_avg.set_index("date_clean").drop(["Date"], axis=1)
temp_avg.columns = ["temp_avg_"+col for col in temp_avg.columns]
temp_avg = temp_avg.loc['2016-01-01':'2022-12-31']

In [43]:
# temp min
temp_min = pd.read_csv("humbolt/h_temp_min.csv", header=4)
temp_min["date_clean"] = [datetime.datetime(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]), monthrange(int(str(temp_min.at[i,"Date"])[0:4]), int(str(temp_min.at[i,"Date"])[4:]))[1]) for i in range(len(temp_min))]
temp_min = temp_min.set_index("date_clean").drop(["Date"], axis=1)
temp_min.columns = ["temp_min_"+col for col in temp_min.columns]
temp_min = temp_min.loc['2016-01-01':'2022-12-31']

In [44]:
# temp max
temp_max = pd.read_csv("humbolt/h_temp_max.csv", header=4)
temp_max["date_clean"] = [datetime.datetime(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]), monthrange(int(str(temp_max.at[i,"Date"])[0:4]), int(str(temp_max.at[i,"Date"])[4:]))[1]) for i in range(len(temp_max))]
temp_max = temp_max.set_index("date_clean").drop(["Date"], axis=1)
temp_max.columns = ["temp_max_"+col for col in temp_max.columns]
temp_max = temp_max.loc['2016-01-01':'2022-12-31']

In [45]:
# precip
precip_new = pd.read_csv("humbolt/h_precip.csv", header=4)
precip_new["date_clean"] = [datetime.datetime(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]), monthrange(int(str(precip_new.at[i,"Date"])[0:4]), int(str(precip_new.at[i,"Date"])[4:]))[1]) for i in range(len(precip_new))]
precip_new = precip_new.set_index("date_clean").drop(["Date"], axis=1)
precip_new.columns = ["precip_new_"+col for col in precip_new.columns]
precip_new = precip_new.loc['2016-01-01':'2022-12-31']

In [46]:
# phdi
phdi = pd.read_csv("humbolt/h_phdi.csv", header=3)
phdi["date_clean"] = [datetime.datetime(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]), monthrange(int(str(phdi.at[i,"Date"])[0:4]), int(str(phdi.at[i,"Date"])[4:]))[1]) for i in range(len(phdi))]
phdi = phdi.set_index("date_clean").drop(["Date"], axis=1)
phdi.columns = ["phdi_"+col for col in phdi.columns]
phdi = phdi.loc['2016-01-01':'2022-12-31']

In [47]:
# pdsi
pdsi = pd.read_csv("humbolt/h_pdsi.csv", header=3)
pdsi["date_clean"] = [datetime.datetime(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]), monthrange(int(str(pdsi.at[i,"Date"])[0:4]), int(str(pdsi.at[i,"Date"])[4:]))[1]) for i in range(len(pdsi))]
pdsi = pdsi.set_index("date_clean").drop(["Date"], axis=1)
pdsi.columns = ["pdsi_"+col for col in pdsi.columns]
pdsi = pdsi.loc['2016-01-01':'2022-12-31']

In [48]:
# pmdi
pmdi = pd.read_csv("humbolt/h_pmdi.csv", header=3)
pmdi["date_clean"] = [datetime.datetime(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]), monthrange(int(str(pmdi.at[i,"Date"])[0:4]), int(str(pmdi.at[i,"Date"])[4:]))[1]) for i in range(len(pmdi))]
pmdi = pmdi.set_index("date_clean").drop(["Date"], axis=1)
pmdi.columns = ["pmdi_"+col for col in pmdi.columns]
pmdi = pmdi.loc['2016-01-01':'2022-12-31']

In [49]:
# pzi
pzi = pd.read_csv("humbolt/h_pzi.csv", header=3)
pzi["date_clean"] = [datetime.datetime(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]), monthrange(int(str(pzi.at[i,"Date"])[0:4]), int(str(pzi.at[i,"Date"])[4:]))[1]) for i in range(len(pzi))]
pzi = pzi.set_index("date_clean").drop(["Date"], axis=1)
pzi.columns = ["pzi_"+col for col in pzi.columns]
pzi = pzi.loc['2016-01-01':'2022-12-31']

In [50]:
# cooling days
cooling = pd.read_csv("humbolt/h_cooling.csv", header=4)
cooling["date_clean"] = [datetime.datetime(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]), monthrange(int(str(cooling.at[i,"Date"])[0:4]), int(str(cooling.at[i,"Date"])[4:]))[1]) for i in range(len(cooling))]
cooling = cooling.set_index("date_clean").drop(["Date"], axis=1)
cooling.columns = ["cooling_days"+col for col in cooling.columns]
cooling = cooling.loc['2016-01-01':'2022-12-31']

In [51]:
# heating days
heating = pd.read_csv("humbolt/h_heating.csv", header=4)
heating["date_clean"] = [datetime.datetime(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]), monthrange(int(str(heating.at[i,"Date"])[0:4]), int(str(heating.at[i,"Date"])[4:]))[1]) for i in range(len(heating))]
heating = heating.set_index("date_clean").drop(["Date"], axis=1)
heating.columns = ["heating_days"+col for col in heating.columns]
heating = heating.loc['2016-01-01':'2022-12-31']

In [52]:
pd.concat([monthly_acres_df.tz_localize(None), prescribed_monthly.tz_localize(None), 
           soil.tz_localize(None), temp_avg.tz_localize(None), temp_min.tz_localize(None), 
           temp_max.tz_localize(None), precip_new.tz_localize(None), pdsi.tz_localize(None), 
           phdi.tz_localize(None), pmdi.tz_localize(None), pzi.tz_localize(None), 
           cooling.tz_localize(None), heating.tz_localize(None)], axis=1).fillna(0).to_csv("humbolt_data.csv")

In [53]:
pd.concat([monthly_acres_df.tz_localize(None), prescribed_monthly.tz_localize(None), 
           soil.tz_localize(None), temp_avg.tz_localize(None), temp_min.tz_localize(None), 
           temp_max.tz_localize(None), precip_new.tz_localize(None), pdsi.tz_localize(None), 
           phdi.tz_localize(None), pmdi.tz_localize(None), pzi.tz_localize(None), 
           cooling.tz_localize(None), heating.tz_localize(None)], axis=1).fillna(0)

Unnamed: 0,incident_acres_burned,TREATED_AC,acres_3_month,acres_6_month,acres_9_month,acres_12_month,soil moisture,temp_avg_Value,temp_avg_Anomaly,temp_min_Value,...,phdi_Value,phdi_Anomaly,pmdi_Value,pmdi_Anomaly,pzi_Value,pzi_Anomaly,cooling_daysValue,cooling_daysAnomaly,heating_daysValue,heating_daysAnomaly
2016-01-31,0.0,0.000000,692.660944,692.660944,692.660944,692.660944,698.047607,46.4,4.6,41.1,...,-1.48,-1.39,0.84,0.95,3.08,3.01,0,0,577,-144
2016-02-29,0.0,0.000000,692.660944,692.660944,692.660944,692.660944,678.955200,49.0,4.9,40.2,...,-2.06,-2.12,-0.61,-0.65,-2.20,-2.51,0,0,448,-138
2016-03-31,0.0,0.000000,180.143350,692.660944,692.660944,692.660944,681.090637,49.5,3.3,40.9,...,-0.85,-0.80,1.56,1.70,3.00,3.27,0,0,480,-102
2016-04-30,0.0,0.000000,0.000000,692.660944,692.660944,692.660944,660.140503,55.3,5.6,43.8,...,-1.09,-1.14,0.86,0.90,-0.98,-1.10,0,0,295,-163
2016-05-31,0.0,0.000000,0.000000,692.660944,692.660944,692.660944,581.487854,58.4,4.0,46.9,...,-1.47,-1.48,-0.05,0.09,-1.48,-1.41,6,6,210,-118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-31,26.0,0.000000,3.900000,789.500000,1076.100000,1196.100000,376.434570,69.4,3.6,55.9,...,-1.48,-1.47,0.59,0.68,-0.14,-0.05,138,85,0,-27
2022-09-30,0.0,287.414442,3.900000,433.700000,897.000000,1196.100000,333.844482,65.9,3.0,53.2,...,-1.44,-1.45,0.41,0.50,-0.36,-0.39,64,37,38,-52
2022-10-31,0.0,1293.290658,291.314442,441.614442,1087.814442,1483.514442,292.342621,58.8,3.3,47.0,...,-2.31,-2.18,-1.39,-1.09,-3.05,-2.72,0,0,197,-99
2022-11-30,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,324.599609,44.4,-2.6,35.5,...,-2.02,-1.93,-1.05,-0.87,0.16,0.06,0,0,617,77


# Prescribed Burn treatment per wildfire

In [54]:
from geopy import distance
from tqdm import tqdm

In [55]:
def diff_month(d1, d2):
    return (d1.year - d2.year) * 12 + d1.month - d2.month

In [56]:
df = df.loc[(df["incident_latitude"]>-90) &(df["incident_latitude"]<90)].reset_index(drop=True)

In [57]:
gdf = gdf.sort_values('START_DATE', ascending=True)
gdf = gdf.set_index("START_DATE")
gdf = gdf.loc['2012-01-01':'2022-12-31']
gdf = gdf.reset_index()

In [58]:
times = [3, 6, 12, 24, 36]
keep_cols = ['ex_date', 'incident_name', 'incident_county', 'incident_acres_burned', 'incident_longitude','incident_latitude', 'incident_type']
tdf = df[keep_cols]
for time in tqdm(times):
    res = dict(in_5km = [],
    size_5km = [],
    in_10km = [],
    size_10km = [],
    in_25km = [],
    size_25km = [],
    in_50km = [],
    size_50km = [],
    in_100km = [],
    size_100km = [])
    for i in tqdm(range(len(tdf))):
        lat = tdf.iloc[i]["incident_latitude"]
        lon = tdf.iloc[i]["incident_longitude"]
        df_date = tdf.iloc[i]["ex_date"]
        inds = [k for k in range(len(gdf)) if (diff_month(df_date, gdf.iloc[k]["START_DATE"])<=time) and (diff_month(df_date,gdf.iloc[k]["START_DATE"])>0)]
        gdf_t = gdf.iloc[inds][:].reset_index(drop=True) #time is in times
        bool_5km = False
        size_5km = 0
        bool_10km = False
        size_10km = 0
        bool_25km = False
        size_25km = 0
        bool_50km = False
        size_50km = 0
        bool_100km = False
        size_100km = 0
        for j in range(len(gdf_t)):
            g_lat = gdf_t.iloc[j]["lat_center"]
            g_lon = gdf_t.iloc[j]["lon_center"]
            dist = distance.distance((lat,lon), (g_lat, g_lon)).km
            if dist <= 5:
                bool_5km = True
                size_5km += gdf_t.iloc[j]["TREATED_AC"]
            if dist <= 10:
                bool_10km = True
                size_10km += gdf_t.iloc[j]["TREATED_AC"]
            if dist <= 25:
                bool_25km = True
                size_25km += gdf_t.iloc[j]["TREATED_AC"]
            if dist <= 50:
                bool_50km = True
                size_50km += gdf_t.iloc[j]["TREATED_AC"]
            if dist <= 100:
                bool_100km = True
                size_100km += gdf_t.iloc[j]["TREATED_AC"]
        res["in_5km"].append(bool_5km)
        res["in_10km"].append(bool_10km)
        res["in_25km"].append(bool_25km)
        res["in_50km"].append(bool_50km)
        res["in_100km"].append(bool_100km)
        res["size_5km"].append(size_5km)
        res["size_10km"].append(size_10km)
        res["size_25km"].append(size_25km)
        res["size_50km"].append(size_50km)
        res["size_100km"].append(size_100km)

    res = {str(time)+"_months_"+key: value for key, value in res.items()}

    tdf = pd.concat([tdf, pd.DataFrame(res)], axis=1)





  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 1740/1740 [40:23<00:00,  1.39s/it]
100%|██████████| 1740/1740 [1:26:16<00:00,  2.98s/it]
100%|██████████| 1740/1740 [1:08:21<00:00,  2.36s/it]
100%|██████████| 1740/1740 [47:54<00:00,  1.65s/it]
100%|██████████| 1740/1740 [56:18<00:00,  1.94s/it]
100%|██████████| 5/5 [4:59:14<00:00, 3590.92s/it]


In [59]:
tdf

Unnamed: 0,ex_date,incident_name,incident_county,incident_acres_burned,incident_longitude,incident_latitude,incident_type,3_months_in_5km,3_months_size_5km,3_months_in_10km,...,36_months_in_5km,36_months_size_5km,36_months_in_10km,36_months_size_10km,36_months_in_25km,36_months_size_25km,36_months_in_50km,36_months_size_50km,36_months_in_100km,36_months_size_100km
0,2016-04-19 13:09:00+00:00,Gorman Fire,Los Angeles,0.0,-118.789259,34.688873,,False,0.0,False,...,False,0.0,False,0.00,False,0.00,False,0.000000,True,128.100000
1,2016-04-24 11:10:00+00:00,Taglio Fire,Merced,30.0,-121.080360,37.217100,,False,0.0,False,...,False,0.0,False,0.00,False,0.00,True,1906.880000,True,2898.880000
2,2016-05-10 16:11:00+00:00,Shedd Fire,San Luis Obispo,150.0,-120.399930,35.619610,,False,0.0,False,...,False,0.0,False,0.00,False,0.00,True,574.200000,True,1089.200000
3,2016-05-12 14:13:00+00:00,Bryson Fire,Monterey,25.0,-121.038151,35.837827,,False,0.0,False,...,False,0.0,False,0.00,True,0.00,True,410.000000,True,1935.200000
4,2016-05-15 16:38:00+00:00,Avocado Fire,Fresno,132.0,-119.382480,36.787690,,False,0.0,False,...,False,0.0,False,0.00,True,316.00,True,2190.070129,True,4315.584457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,2022-09-21 07:29:00+00:00,Irie Fire,Tulare,,-118.752908,36.007323,Wildfire,False,0.0,False,...,False,0.0,True,1117.48,True,3635.58,True,4744.580000,True,7771.680000
1736,2022-09-23 15:23:00+00:00,Manzanita Fire,San Bernardino,,-117.152900,34.177600,Wildfire,False,0.0,False,...,True,57.0,True,128.00,True,1307.00,True,2833.900000,True,32710.200000
1737,2022-09-27 08:54:50+00:00,97 Fire,Siskiyou,30.0,-122.341400,41.498700,Wildfire,False,0.0,False,...,False,0.0,False,0.00,True,285.00,True,7196.970000,True,28956.990000
1738,2022-10-08 15:42:50+00:00,Howard Fire,Ventura,,-119.167059,34.560597,Wildfire,False,0.0,False,...,False,0.0,False,0.00,True,141.00,True,304.000000,True,2293.800000


In [None]:
cols_to_fix = ['3_months_in_5km', '3_months_size_5km', '3_months_in_10km',
       '3_months_size_10km', '3_months_in_25km', '3_months_size_25km',
       '3_months_in_50km', '3_months_size_50km', '3_months_in_100km',
       '3_months_size_100km', '6_months_in_5km', '6_months_size_5km',
       '6_months_in_10km', '6_months_size_10km', '6_months_in_25km',
       '6_months_size_25km', '6_months_in_50km', '6_months_size_50km',
       '6_months_in_100km', '6_months_size_100km', '12_months_in_5km',
       '12_months_size_5km', '12_months_in_10km', '12_months_size_10km',
       '12_months_in_25km', '12_months_size_25km', '12_months_in_50km',
       '12_months_size_50km', '12_months_in_100km', '12_months_size_100km',
       '24_months_in_5km', '24_months_size_5km', '24_months_in_10km',
       '24_months_size_10km', '24_months_in_25km', '24_months_size_25km',
       '24_months_in_50km', '24_months_size_50km', '24_months_in_100km',
       '24_months_size_100km', '36_months_in_5km', '36_months_size_5km',
       '36_months_in_10km', '36_months_size_10km', '36_months_in_25km',
       '36_months_size_25km', '36_months_in_50km', '36_months_size_50km',
       '36_months_in_100km', '36_months_size_100km']
fixed_cols = {col:"prescribed_"+col for col in cols_to_fix}
new_cols = []
for col in tdf.columns:
    if col in cols_to_fix:
        col = fixed_cols[col]
    new_cols.append(col)
tdf.columns = new_cols

In [None]:
df.at[1422,"incident_longitude"] = -119.1414610 # fix incorrect long

In [None]:
df["log_incident_acres_burned"] =  np.log(df['incident_acres_burned'].replace(0, np.nan)).replace(np.nan, 0)

In [60]:
tdf.to_csv("prescribed_treatment_data.csv")