## Merging Wildfire and Environmental Data

We merge wildfire data with relevant environmental data from nearby weather stations. By finding the nearest station for each wildfire event, based on geographical coordinates, the environmental conditions are linked to the corresponding wildfire records.

In [120]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pytrends.request import TrendReq
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from scipy.spatial import cKDTree
pd.set_option('display.max_columns', None)

In [121]:
df_wildfire = pd.read_csv("california_wildfire.csv")
df_wildfire

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined
...,...,...,...,...,...,...,...,...,...,...,...,...
251876,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning
251877,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning
251878,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined
251879,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined


In [122]:
df_env = pd.read_csv("cleaned_environmental_data.csv")
df_env

Unnamed: 0,Station,Name,Latitude,Longitude,Elevation,Date,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,USC00040029,"ADIN RANGER STATION, CA US",41.19334,-120.94458,1280.8,1992-01-01,-5.00,6.67,0.83,0.00,0.0
1,USC00040383,"AUBURN, CA US",38.90720,-121.08380,393.8,1992-01-01,3.89,13.89,8.89,0.00,0.0
2,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,1992-01-01,-1.67,12.78,5.56,0.00,0.0
3,USC00041018,"BOWMAN DAM, CA US",39.45390,-120.65560,1641.3,1992-01-01,-4.44,7.78,1.67,0.00,0.0
4,USC00041614,"CEDARVILLE, CA US",41.53010,-120.17910,1428.9,1992-01-01,-9.44,2.78,-3.33,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...
60442,USC00047195,"QUINCY, CA US",39.93660,-120.94750,1042.4,2020-12-31,-3.33,9.44,3.06,4.83,0.0
60443,USC00040029,"ADIN RANGER STATION, CA US",41.19334,-120.94458,1280.8,2021-01-01,2.22,26.67,14.44,0.00,0.0
60444,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,2021-01-01,2.78,14.44,8.61,0.00,0.0
60445,USC00041614,"CEDARVILLE, CA US",41.53010,-120.17910,1428.9,2021-01-01,-9.44,6.67,-1.39,0.00,0.0


In [123]:
stations_df = df_env[['Station', 'Latitude', 'Longitude']].drop_duplicates()

# Convert coordinates to radians for k-d tree usage
stations_coords = np.radians(stations_df[['Latitude', 'Longitude']].to_numpy())
wildfire_coords = np.radians(df_wildfire[['Latitude', 'Longitude']].to_numpy())

# Build the k-d tree
tree = cKDTree(stations_coords)

# Find the index of the nearest station for each wildfire (k=1 for the first nearest station)
distances, indices = tree.query(wildfire_coords, k=1)

# Assign the nearest station to the df_wildfire DataFrame
df_wildfire['NearestStation'] = stations_df.iloc[indices.flatten()]['Station'].values

# Merge the wildfire data with environmental data on the nearest station and the date
df_wildfire_ca = pd.merge(df_wildfire, df_env, how='left', left_on=['NearestStation', 'DiscoveryDate'], right_on=['Station', 'Date'])

# Display the merged DataFrame
df_wildfire_ca

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude_x,Longitude_x,CauseClassification,GeneralCause,NearestStation,Station,Name,Latitude_y,Longitude_y,Elevation,Date,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural,USC00040383,USC00040383,"AUBURN, CA US",38.90720,-121.08380,393.8,1992-01-01,3.89,13.89,8.89,0.00,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,1992-01-01,-1.67,12.78,5.56,0.00,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,1992-01-01,-1.67,12.78,5.56,0.00,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,1992-01-02,-1.11,12.22,5.56,0.00,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,1992-01-02,-1.11,12.22,5.56,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251876,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning,USC00040383,USC00040383,"AUBURN, CA US",38.90720,-121.08380,393.8,2020-12-31,-1.11,10.00,4.44,19.81,0.0
251877,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,2020-12-31,2.22,15.56,8.89,0.00,0.0
251878,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,2020-12-31,2.22,15.56,8.89,0.00,0.0
251879,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USW00023161,USW00023161,"BARSTOW DAGGETT AIRPORT, CA US",34.85371,-116.78702,584.8,2020-12-31,2.22,15.56,8.89,0.00,0.0


In [124]:
df_wildfire_ca['MaxTemperature'].isna().sum()

14426

In [125]:
len(df_wildfire_ca)

251881

In [126]:
df_wildfire_ca.columns

Index(['DiscoveryDate', 'DiscoveryDayOfYear', 'DiscoveryTime',
       'ContainmentDate', 'ContainmentDayOfYear', 'ContainmentTime',
       'FireSize', 'FireSizeClass', 'Latitude_x', 'Longitude_x',
       'CauseClassification', 'GeneralCause', 'NearestStation', 'Station',
       'Name', 'Latitude_y', 'Longitude_y', 'Elevation', 'Date',
       'MinTemperature', 'MaxTemperature', 'AvgTemperature', 'Precipitation',
       'Snowfall'],
      dtype='object')

In [127]:
df_wildfire_ca = df_wildfire_ca.drop(columns=['Station', 'Name', 'Latitude_y', 'Longitude_y', 'Elevation', 'Date', ])
df_wildfire_ca = df_wildfire_ca.rename(columns={'Latitude_x': 'Latitude', 'Longitude_x': 'Longitude'})

df_wildfire_ca.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,NearestStation,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205,-120.335,Natural,Natural,USC00040383,3.89,13.89,8.89,0.0,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.1667,-116.6342,Human,Debris and open burning,USW00023161,-1.67,12.78,5.56,0.0,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,USW00023161,-1.67,12.78,5.56,0.0,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.03,-116.831944,Human,Missing data/not specified/undetermined,USW00023161,-1.11,12.22,5.56,0.0,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.99,Human,Missing data/not specified/undetermined,USW00023161,-1.11,12.22,5.56,0.0,0.0


In [128]:
# Create a missing values DataFrame 
df_missing_values = df_wildfire_ca[df_wildfire_ca.isnull().any(axis=1)]
df_missing_values.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,NearestStation,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
186,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.5,B,41.533889,-122.943889,Human,Debris and open burning,USC00040029,,,,,
187,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.1,A,41.0999,-123.6845,Human,Missing data/not specified/undetermined,USC00040029,,,,,
206,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.5,B,39.84,-120.696667,Human,Debris and open burning,USC00047195,,,,,
217,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.2,A,40.3332,-120.2344,Human,Debris and open burning,USC00047195,,,,,
221,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.1,A,40.241944,-121.07,Human,Debris and open burning,USC00047195,,,,,


In [129]:
df_missing_values = df_missing_values.dropna(axis=1, how='all')
df_missing_values

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,NearestStation
186,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.50,B,41.533889,-122.943889,Human,Debris and open burning,USC00040029
187,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.10,A,41.099900,-123.684500,Human,Missing data/not specified/undetermined,USC00040029
206,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.50,B,39.840000,-120.696667,Human,Debris and open burning,USC00047195
217,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.20,A,40.333200,-120.234400,Human,Debris and open burning,USC00047195
221,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.10,A,40.241944,-121.070000,Human,Debris and open burning,USC00047195
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251842,2020-12-26,361,20:45,2020-12-26,361.0,21:15,0.10,A,37.971846,-120.367498,Human,Debris and open burning,USC00040383
251845,2020-12-27,362,07:34,2020-12-27,362.0,08:04,0.10,A,36.438782,-121.322502,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USC00040383
251852,2020-12-27,362,15:27,2020-12-27,362.0,15:57,0.10,A,38.111202,-122.250399,Human,Debris and open burning,USC00040383
251855,2020-12-27,362,22:43,2020-12-27,362.0,23:13,0.20,A,36.950082,-120.071438,Human,Missing data/not specified/undetermined,USC00040383


In [130]:
next_nearest_stations = []

for nearest_station in df_missing_values['NearestStation']:
    current_station_coords = stations_df[stations_df['Station'] == nearest_station][['Latitude', 'Longitude']].values
    current_station_coords = np.radians(current_station_coords)
    
    # Find the second nearest station to the current NearestStation
    distances, indices = tree.query(current_station_coords, k=2)
    nearest_station_index = indices[0, 1]
    next_nearest_station = stations_df.iloc[nearest_station_index]['Station']
    
    # Add the next nearest station to the list
    next_nearest_stations.append(next_nearest_station)

# Add the next nearest station to the df_missing_values DataFrame
df_missing_values['NextNearestStation'] = next_nearest_stations

df_missing_values.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,NearestStation,NextNearestStation
186,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.5,B,41.533889,-122.943889,Human,Debris and open burning,USC00040029,USC00041614
187,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.1,A,41.0999,-123.6845,Human,Missing data/not specified/undetermined,USC00040029,USC00041614
206,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.5,B,39.84,-120.696667,Human,Debris and open burning,USC00047195,USC00041018
217,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.2,A,40.3332,-120.2344,Human,Debris and open burning,USC00047195,USC00041018
221,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.1,A,40.241944,-121.07,Human,Debris and open burning,USC00047195,USC00041018


In [131]:
# Merge df_missing_values with df_env
df_merge_missing_values = pd.merge(
    df_missing_values, 
    df_env, 
    how='left', 
    left_on=['NextNearestStation', 'DiscoveryDate'], 
    right_on=['Station', 'Date'],
    suffixes=('_wildfire', '_env')
)

# Display the resulting DataFrame
df_merge_missing_values.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude_wildfire,Longitude_wildfire,CauseClassification,GeneralCause,NearestStation,NextNearestStation,Station,Name,Latitude_env,Longitude_env,Elevation,Date,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.5,B,41.533889,-122.943889,Human,Debris and open burning,USC00040029,USC00041614,USC00041614,"CEDARVILLE, CA US",41.5301,-120.1791,1428.9,1992-03-28,-1.11,16.67,7.78,0.0,0.0
1,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.1,A,41.0999,-123.6845,Human,Missing data/not specified/undetermined,USC00040029,USC00041614,USC00041614,"CEDARVILLE, CA US",41.5301,-120.1791,1428.9,1992-03-28,-1.11,16.67,7.78,0.0,0.0
2,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.5,B,39.84,-120.696667,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-05,-2.22,17.78,7.78,0.0,0.0
3,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.2,A,40.3332,-120.2344,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-07,-1.11,12.78,5.83,0.0,0.0
4,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.1,A,40.241944,-121.07,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-07,-1.11,12.78,5.83,0.0,0.0


In [132]:
df_merge_missing_values["MaxTemperature"].isna().sum()

2653

In [133]:
len(df_merge_missing_values)

14426

In [134]:
df_merge_missing_values

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude_wildfire,Longitude_wildfire,CauseClassification,GeneralCause,NearestStation,NextNearestStation,Station,Name,Latitude_env,Longitude_env,Elevation,Date,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.50,B,41.533889,-122.943889,Human,Debris and open burning,USC00040029,USC00041614,USC00041614,"CEDARVILLE, CA US",41.5301,-120.1791,1428.9,1992-03-28,-1.11,16.67,7.78,0.00,0.0
1,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.10,A,41.099900,-123.684500,Human,Missing data/not specified/undetermined,USC00040029,USC00041614,USC00041614,"CEDARVILLE, CA US",41.5301,-120.1791,1428.9,1992-03-28,-1.11,16.67,7.78,0.00,0.0
2,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.50,B,39.840000,-120.696667,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-05,-2.22,17.78,7.78,0.00,0.0
3,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.20,A,40.333200,-120.234400,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-07,-1.11,12.78,5.83,0.00,0.0
4,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.10,A,40.241944,-121.070000,Human,Debris and open burning,USC00047195,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,1992-04-07,-1.11,12.78,5.83,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14421,2020-12-26,361,20:45,2020-12-26,361.0,21:15,0.10,A,37.971846,-120.367498,Human,Debris and open burning,USC00040383,USC00041018,USC00041018,"BOWMAN DAM, CA US",39.4539,-120.6556,1641.3,2020-12-26,-2.22,12.22,5.00,44.96,152.4
14422,2020-12-27,362,07:34,2020-12-27,362.0,08:04,0.10,A,36.438782,-121.322502,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USC00040383,USC00041018,,,,,,,,,,,
14423,2020-12-27,362,15:27,2020-12-27,362.0,15:57,0.10,A,38.111202,-122.250399,Human,Debris and open burning,USC00040383,USC00041018,,,,,,,,,,,
14424,2020-12-27,362,22:43,2020-12-27,362.0,23:13,0.20,A,36.950082,-120.071438,Human,Missing data/not specified/undetermined,USC00040383,USC00041018,,,,,,,,,,,


In [135]:
df_wildfire_ca

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,NearestStation,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural,USC00040383,3.89,13.89,8.89,0.00,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning,USW00023161,-1.67,12.78,5.56,0.00,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,USW00023161,-1.67,12.78,5.56,0.00,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined,USW00023161,-1.11,12.22,5.56,0.00,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined,USW00023161,-1.11,12.22,5.56,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251876,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning,USC00040383,-1.11,10.00,4.44,19.81,0.0
251877,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning,USW00023161,2.22,15.56,8.89,0.00,0.0
251878,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USW00023161,2.22,15.56,8.89,0.00,0.0
251879,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined,USW00023161,2.22,15.56,8.89,0.00,0.0


In [136]:
df_merge_missing_values.drop(columns=['NearestStation', 'NextNearestStation', 'Station', 'Name', 'Latitude_env', 'Longitude_env', 'Elevation', 'Date'], inplace=True)
df_merge_missing_values.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude_wildfire,Longitude_wildfire,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.5,B,41.533889,-122.943889,Human,Debris and open burning,-1.11,16.67,7.78,0.0,0.0
1,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.1,A,41.0999,-123.6845,Human,Missing data/not specified/undetermined,-1.11,16.67,7.78,0.0,0.0
2,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.5,B,39.84,-120.696667,Human,Debris and open burning,-2.22,17.78,7.78,0.0,0.0
3,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.2,A,40.3332,-120.2344,Human,Debris and open burning,-1.11,12.78,5.83,0.0,0.0
4,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.1,A,40.241944,-121.07,Human,Debris and open burning,-1.11,12.78,5.83,0.0,0.0


In [137]:
df_merge_missing_values.rename(columns={'Latitude_wildfire':'Latitude', 'Longitude_wildfire': 'Longitude'}, inplace = True)

In [138]:
df_merge_missing_values

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-03-28,88,14:37,1992-03-28,88.0,15:17,0.50,B,41.533889,-122.943889,Human,Debris and open burning,-1.11,16.67,7.78,0.00,0.0
1,1992-03-28,88,16:30,1992-03-28,88.0,17:00,0.10,A,41.099900,-123.684500,Human,Missing data/not specified/undetermined,-1.11,16.67,7.78,0.00,0.0
2,1992-04-05,96,13:15,1992-04-05,96.0,16:30,0.50,B,39.840000,-120.696667,Human,Debris and open burning,-2.22,17.78,7.78,0.00,0.0
3,1992-04-07,98,14:20,1992-04-07,98.0,15:10,0.20,A,40.333200,-120.234400,Human,Debris and open burning,-1.11,12.78,5.83,0.00,0.0
4,1992-04-07,98,14:37,1992-04-07,98.0,15:07,0.10,A,40.241944,-121.070000,Human,Debris and open burning,-1.11,12.78,5.83,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14421,2020-12-26,361,20:45,2020-12-26,361.0,21:15,0.10,A,37.971846,-120.367498,Human,Debris and open burning,-2.22,12.22,5.00,44.96,152.4
14422,2020-12-27,362,07:34,2020-12-27,362.0,08:04,0.10,A,36.438782,-121.322502,Missing data/not specified/undetermined,Missing data/not specified/undetermined,,,,,
14423,2020-12-27,362,15:27,2020-12-27,362.0,15:57,0.10,A,38.111202,-122.250399,Human,Debris and open burning,,,,,
14424,2020-12-27,362,22:43,2020-12-27,362.0,23:13,0.20,A,36.950082,-120.071438,Human,Missing data/not specified/undetermined,,,,,


In [139]:
df_wildfire_ca.drop('NearestStation', axis=1, inplace=True)

In [140]:
df_wildfire_ca.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205,-120.335,Natural,Natural,3.89,13.89,8.89,0.0,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.1667,-116.6342,Human,Debris and open burning,-1.67,12.78,5.56,0.0,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,-1.67,12.78,5.56,0.0,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.03,-116.831944,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.0,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.99,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.0,0.0


In [141]:
# Reset the index if it was previously set
df_wildfire_ca.reset_index(inplace=True)
df_merge_missing_values.reset_index(inplace=True)

# Merge the DataFrames on the relevant columns (the keys should uniquely identify rows)
df_combined = pd.merge(df_wildfire_ca, df_merge_missing_values, 
                       on=['DiscoveryDate', 'DiscoveryTime', 'FireSize', 'Latitude', 'Longitude'], 
                       how='left', 
                       suffixes=('', '_merge'))

# Use combine_first to fill in the missing values
for column in df_wildfire_ca.columns:
    if column not in ['DiscoveryDate', 'DiscoveryTime', 'FireSize', 'Latitude', 'Longitude']:
        df_combined[column] = df_combined[column].combine_first(df_combined[column + '_merge'])

df_combined.drop(columns=[col for col in df_combined.columns if '_merge' in col], inplace=True)

In [142]:
df_combined.drop(columns=['index'], inplace=True)
df_combined.reset_index(drop=True, inplace=True)
df_combined.head()

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205,-120.335,Natural,Natural,3.89,13.89,8.89,0.0,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.1667,-116.6342,Human,Debris and open burning,-1.67,12.78,5.56,0.0,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,-1.67,12.78,5.56,0.0,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.03,-116.831944,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.0,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.99,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.0,0.0


In [143]:
df_combined['MinTemperature'].isna().sum()

2655

In [144]:
df_combined.dropna(subset=['MinTemperature'], inplace=True)
df_combined['MinTemperature'].isna().sum()

0

In [145]:
df_combined

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural,3.89,13.89,8.89,0.00,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning,-1.67,12.78,5.56,0.00,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,-1.67,12.78,5.56,0.00,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251964,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning,-1.11,10.00,4.44,19.81,0.0
251965,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning,2.22,15.56,8.89,0.00,0.0
251966,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0
251967,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0


In [146]:
# Check duplicate
duplicates = df_combined[df_combined.duplicated()]
duplicates.shape[0]

1588

In [147]:
# Remove duplicate rows
df_combined.drop_duplicates(inplace=True)
df_combined.shape[0]

247726

In [148]:
df_combined

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural,3.89,13.89,8.89,0.00,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning,-1.67,12.78,5.56,0.00,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,-1.67,12.78,5.56,0.00,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251964,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning,-1.11,10.00,4.44,19.81,0.0
251965,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning,2.22,15.56,8.89,0.00,0.0
251966,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0
251967,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0


In [149]:
# Check for any missing values in the DataFrame
missing_values_count = df_combined.isna().sum().sum()

if missing_values_count > 0:
    print(f"There are {missing_values_count} missing values in the DataFrame.")
else:
    print("No missing values found in the DataFrame.")

No missing values found in the DataFrame.


In [153]:
df_combined.reset_index(drop=True, inplace=True)
df_combined

Unnamed: 0,DiscoveryDate,DiscoveryDayOfYear,DiscoveryTime,ContainmentDate,ContainmentDayOfYear,ContainmentTime,FireSize,FireSizeClass,Latitude,Longitude,CauseClassification,GeneralCause,MinTemperature,MaxTemperature,AvgTemperature,Precipitation,Snowfall
0,1992-01-01,1,01:30,1992-01-01,1.0,02:10,0.1,A,38.205000,-120.335000,Natural,Natural,3.89,13.89,8.89,0.00,0.0
1,1992-01-01,1,13:30,1992-01-01,1.0,17:00,5.0,B,33.166700,-116.634200,Human,Debris and open burning,-1.67,12.78,5.56,0.00,0.0
2,1992-01-01,1,14:37,1992-01-01,1.0,15:07,0.2,A,33.663889,-116.171944,Human,Misuse of fire by a minor,-1.67,12.78,5.56,0.00,0.0
3,1992-01-02,2,14:37,1992-01-02,2.0,15:07,0.2,A,33.030000,-116.831944,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
4,1992-01-02,2,14:37,1992-01-02,2.0,15:17,0.5,B,33.896111,-116.990000,Human,Missing data/not specified/undetermined,-1.11,12.22,5.56,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247721,2020-12-31,366,16:58,2020-12-31,366.0,18:41,15.8,C,38.938853,-121.107672,Human,Debris and open burning,-1.11,10.00,4.44,19.81,0.0
247722,2020-12-31,366,17:43,2020-12-31,366.0,18:23,0.5,B,34.073641,-117.787055,Human,Debris and open burning,2.22,15.56,8.89,0.00,0.0
247723,2020-12-31,366,20:32,2020-12-31,366.0,21:02,0.1,A,33.982775,-117.234942,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0
247724,2020-12-31,366,22:21,2020-12-31,366.0,22:51,0.1,A,33.704975,-117.207482,Missing data/not specified/undetermined,Missing data/not specified/undetermined,2.22,15.56,8.89,0.00,0.0


In [154]:
df_combined.to_csv("cleaned_wilfire_environmental_data.csv", index=False)