## Load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans

In [2]:
url1 = "https://raw.githubusercontent.com/statzenthusiast921/wildfires/refs/heads/main/data/fire_df_wa.csv"
url2 = "https://raw.githubusercontent.com/statzenthusiast921/wildfires/refs/heads/main/data/fire_df_or.csv"
url3 = "https://raw.githubusercontent.com/statzenthusiast921/wildfires/refs/heads/main/data/fire_df_ca.csv"

df_wa = pd.read_csv(url1)
df_or = pd.read_csv(url2)
df_ca = pd.read_csv(url3, dtype={16: str, 18: str})


print(df_wa.shape)
print(df_or.shape)
print(df_ca.shape)

(33513, 19)
(61088, 19)
(189550, 19)


## Combine state datasets

In [3]:
full_df = pd.concat([df_wa, df_or, df_ca], ignore_index=True)
full_df.shape

(284151, 19)

## Prepare data/create features

#### Fire season is cyclical so let's make some seasonal parameters

In [4]:
#-----Convert to appropriate time format
full_df['EndDate'] = pd.to_datetime(full_df['CONT_DATE'], unit='D', origin='julian')
full_df['StartDate'] = pd.to_datetime(full_df['DISCOVERY_DATE'], unit='D', origin='julian')
full_df['FireLengthDays'] = full_df['CONT_DATE'] - full_df['DISCOVERY_DATE']

In [5]:
full_df = full_df[['LATITUDE','LONGITUDE','STATE','FireLengthDays','FIRE_YEAR','StartDate','FIRE_SIZE','STAT_CAUSE_DESCR']]

In [6]:
#full_df['MonthName'] = full_df['StartDate'].dt.strftime('%B')
full_df['MonthName'] = full_df['StartDate'].dt.month
full_df['month_sin'] = np.sin(2 * np.pi * full_df['MonthName'] / 12)
full_df['month_cos'] = np.cos(2 * np.pi * full_df['MonthName'] / 12)

#### We don't need all of these cause values

In [7]:
full_df['STAT_CAUSE_DESCR'].value_counts()

Lightning            68250
Miscellaneous        64835
Equipment Use        44958
Debris Burning       24800
Arson                22942
Campfire             20812
Missing/Undefined    14017
Smoking               9163
Children              9155
Powerline             1814
Fireworks             1781
Railroad              1299
Structure              325
Name: STAT_CAUSE_DESCR, dtype: int64

In [8]:
print(full_df.shape)
full_df = full_df[~full_df['STAT_CAUSE_DESCR'].isin(['Structure','Railroad','Missing/Undefined'])]
print(full_df.shape)

(284151, 11)
(268510, 11)


In [9]:
full_df = pd.get_dummies(full_df, columns=['STAT_CAUSE_DESCR']).drop(columns=['STAT_CAUSE_DESCR_Fireworks'])

#### Can't do much with lat/lon, so let's do some spatial clustering

In [10]:
#----- Count fires per state per year
fires_per_year = full_df.groupby(['STATE', 'FIRE_YEAR']).size().reset_index(name='fire_count')

#----- Calculate average fires per year per state
avg_fires_per_state = fires_per_year.groupby('STATE')['fire_count'].mean()
avg_fires_per_state = avg_fires_per_state.to_dict()
avg_fires_per_state

{'CA': 7337.583333333333, 'OR': 2494.7916666666665, 'WA': 1355.5416666666667}

In [16]:
#----- Starting # of clusters
base_clusters = 20

#----- Compute proportional clusters
total_fires = sum(avg_fires_per_state.values())
clusters_per_state = {
    state: max(1, int(round(base_clusters * count / total_fires))) 
    for state, count in avg_fires_per_state.items()
}

#----- Initialize cluster column
full_df['location_cluster'] = -1

#----- Perform clustering per state with proportional clusters
for state, n_clusters in clusters_per_state.items():
    mask = full_df['STATE'] == state
    coords = full_df.loc[mask, ['LATITUDE', 'LONGITUDE']]
    
    if len(coords) >= n_clusters:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(coords)
        full_df.loc[mask, 'location_cluster'] = clusters
    else:
        #----- Assign single cluster if not enough points
        full_df.loc[mask, 'location_cluster'] = 0


In [13]:
full_df = pd.get_dummies(full_df, columns=['location_cluster'], prefix='loc')

0     43050
1     39800
3     30284
2     29113
9     23423
5     19511
11    18590
4     12764
10    12730
8     12261
7     12210
6      8374
12     6400
Name: location_cluster, dtype: int64