In [1]:
import pandas as pd
import numpy as np
import geocoder

In [2]:
path = '../scratch_data/'
smoke = pd.read_csv(path + 'smoke_west_counties_2010_2016.csv')

In [3]:
print(smoke.shape)
smoke.head()

(3683736, 7)


Unnamed: 0,STATE,STATEFP,COUNTYFP,county_name,date,POPULATION,smoke_score
0,AZ,4,7,Gila County,20100604,460,1
1,AZ,4,7,Gila County,20100604,821,1
2,AZ,4,7,Gila County,20100604,1301,1
3,AZ,4,7,Gila County,20100604,1583,1
4,AZ,4,7,Gila County,20100604,969,1


In [4]:
fires = pd.read_csv(path + 'mtbs_FODpoints_DD/MTBS_FODpoints.csv', lineterminator='\n')
wildfires = fires.copy()

In [5]:
print(fires.shape)
fires.head()

(26557, 24)


Unnamed: 0.1,Unnamed: 0,Event_ID,irwinID,Incid_Name,Incid_Type,Map_ID,Map_Prog,Asmnt_Type,BurnBndAc,BurnBndLat,...,Perim_ID,dNBR_offst,dNBR_stdDv,NoData_T,IncGreen_T,Low_T,Mod_T,High_T,Comment,geometry
0,0,UT4176411148820121004,,HELLS HOLLOW,Prescribed Fire,414,MTBS,Extended,4758,41.766,...,703803100000000.0,47,-9999,-970,-150,113,345,616,,POINT (-111.507983362626 41.75847249141982)
1,1,WA4796812059020120908,,KLONE PEAK,Wildfire,441,MTBS,Extended,1634,47.972,...,704502700000000.0,25,-9999,-970,-150,50,250,460,,POINT (-120.5815199717518 47.97210047512218)
2,2,WA4797012073420120908,,BASALT,Wildfire,444,MTBS,Extended,1705,47.967,...,704502700000000.0,-4,-9999,-970,-150,60,250,500,,POINT (-120.7448841870875 47.96732496805843)
3,3,WA4805112058820120908,,PYRAMID,Wildfire,449,MTBS,Extended,1962,48.044,...,,74,-9999,-970,-150,120,220,530,,POINT (-120.5812232399775 48.04473598140331)
4,4,WA4788612026820120908,,FIRST CREEK,Wildfire,454,MTBS,Extended,1434,47.895,...,704502700000000.0,-37,-9999,-970,-150,50,200,460,,POINT (-120.2604888608036 47.89597596325936)


In [6]:
smoke.columns = smoke.columns.str.lower()
wildfires.columns = wildfires.columns.str.lower()

In [7]:
#drops - only run cell once
wildfires.drop('unnamed: 0', axis=1, inplace=True)
wildfires['geometry2'] = wildfires['geometry'].map(lambda x: x.replace('POINT (', ''))

### Try merge on lat/long
* wildfires: extract lat/lon from 'geometry'
    * same as 'brnbndlat' and 'brnbndlon' with more decimals...
    * filter df by lon (since no 'state' id) : min_lon_west = -125.5, max_lon_east = -108.5
* smoke: create lat/lon from county

In [8]:
wildfires[['lon', 'lat']] = wildfires['geometry2'].str.split(' ', expand=True)
wildfires['lat'] = wildfires['lat'].str.replace(')','').astype('float')
wildfires['lon'] = wildfires['lon'].astype('float')

  wildfires['lat'] = wildfires['lat'].str.replace(')','').astype('float')


In [9]:
wildfires['ig_date'] = wildfires['ig_date'].str.replace('-','').astype('int32')

In [10]:
wildfires = wildfires.loc[(wildfires['ig_date']>=20100604) & (wildfires['ig_date']<=20161231)]

### lat/lon too granular for merge easily with smoke, try reverse geocode county to wildfires

In [11]:
len(wildfires)

7198

In [12]:
#bound by lat/lon to get States of interest, no 'state' identifier
wildfires = wildfires.loc[wildfires['lon'] <= -108.5]  #eastern bound, AZ-NM border
wildfires = wildfires.loc[wildfires['lat'] <= 53.5] #northernmost point in smoke data
wildfires = wildfires.loc[wildfires['lon'] >= -130] #west bound since apparently HI is in here

In [13]:
len(wildfires)

1588

In [14]:
wildfires.head(1)

Unnamed: 0,event_id,irwinid,incid_name,incid_type,map_id,map_prog,asmnt_type,burnbndac,burnbndlat,burnbndlon,...,nodata_t,incgreen_t,low_t,mod_t,high_t,comment,geometry,geometry2,lon,lat
0,UT4176411148820121004,,HELLS HOLLOW,Prescribed Fire,414,MTBS,Extended,4758,41.766,-111.507,...,-970,-150,113,345,616,,POINT (-111.507983362626 41.75847249141982),-111.507983362626 41.75847249141982),-111.507983,41.758472


In [15]:
reverse_geo_lats = wildfires['lat'].to_list()
reverse_geo_lon = wildfires['lon'].to_list()
rev_geo_coords = list(zip(reverse_geo_lats, reverse_geo_lon))

In [16]:
min(reverse_geo_lats), max(reverse_geo_lats)

(31.30242149353263, 48.974338338076)

In [17]:
min(reverse_geo_lon), max(reverse_geo_lon)

(-124.0926552889532, -108.5106089326423)

In [18]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = 'helen.meigs@gmail.com')
location = geolocator.reverse(rev_geo_coords[0])

In [19]:
location[0]

'Cache County, Utah, United States'

In [20]:
# %%time
# wf_counties = []
# wf_states = []
# wf_full_loc = []

# for i in range(len(rev_geo_coords)):
#     location = geolocator.reverse(rev_geo_coords[i])
#     county = location[0].split(',')[0]
#     state = location[0].split(',')[1]
#     wf_counties.append(county)
#     wf_states.append(state)
#     wf_full_loc.append(location[0])

##### Tried working with geocoded 'location' output in list  form (each entry a single str that could be split on ',') but couldn't figure it out.  Proceed to the less elegant answer:
* New df with geo_columns for each section of location
    * _ONE_ of these geo_columns will contain 'County' for most observations
    * geo_column containing 'County' will vary
        * curve ball, some things like County Road, and 'Not in a County' exist. great.
* Create a column for _each?_ geo_col indicating presence/absence of 'County'?
* Populate some other final column with all the counties?  Maybe by cascading fillna()?
* REPEAT WITH STATE

In [21]:
# wf_locdf = pd.DataFrame(wf_full_loc, columns=['reverse_geocode'])
# wf_locdf.to_csv('../scratch_data/mtbs_FODpoints_DD/mtbs_reverse_geocode.csv', index=False)

In [22]:
wf_locdf = pd.read_csv('../scratch_data/mtbs_FODpoints_DD/mtbs_reverse_geocode.csv')

In [23]:
wf_locdf.head()

Unnamed: 0,reverse_geocode,county_name,state
0,"Cache County, Utah, United States",Cache County,Utah
1,"Chelan County, Washington, United States",Chelan County,Washington
2,"Minnow Creek Trail, Chelan County, Washington,...",Chelan County,Washington
3,"Pugh Ridge Trail #1438, Chelan County, Washing...",Chelan County,Washington
4,"Chelan County, Washington, United States",Chelan County,Washington


In [24]:
wf_locdf[['geo1', 'geo2', 'geo3', 'geo4', 'geo5', 'geo6', 'geo7']] = wf_locdf['reverse_geocode'].str.split(',', expand=True)

In [25]:
wf_locdf.fillna('', inplace=True)

In [26]:
geo_cols = wf_locdf.drop(columns='reverse_geocode').columns.to_list()

for i in geo_cols:
    wf_locdf[f'{i}_county'] = wf_locdf[i].map(lambda x: x if 'County' in x else np.nan)

In [27]:
wf_locdf['county_name'] = np.nan

In [28]:
county_cols = ['geo1_county', 'geo2_county', 'geo3_county', 'geo4_county', 'geo5_county']
for i in county_cols:
    wf_locdf['county_name'].fillna(wf_locdf[i], inplace=True)

In [29]:
# only 15 locations without county ID, not bad
wf_locdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588 entries, 0 to 1587
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   reverse_geocode     1588 non-null   object 
 1   county_name         1571 non-null   object 
 2   state               1588 non-null   object 
 3   geo1                1588 non-null   object 
 4   geo2                1588 non-null   object 
 5   geo3                1588 non-null   object 
 6   geo4                1588 non-null   object 
 7   geo5                1588 non-null   object 
 8   geo6                1588 non-null   object 
 9   geo7                1588 non-null   object 
 10  county_name_county  1585 non-null   object 
 11  state_county        0 non-null      float64
 12  geo1_county         1113 non-null   object 
 13  geo2_county         367 non-null    object 
 14  geo3_county         82 non-null     object 
 15  geo4_county         11 non-null     object 
 16  geo5_c

##### Repeat for State, so can filter by State
More annoying because States don't say the word State in them...luckily I only need seven.

In [30]:
wf_locdf.head()

Unnamed: 0,reverse_geocode,county_name,state,geo1,geo2,geo3,geo4,geo5,geo6,geo7,county_name_county,state_county,geo1_county,geo2_county,geo3_county,geo4_county,geo5_county,geo6_county,geo7_county
0,"Cache County, Utah, United States",Cache County,Utah,Cache County,Utah,United States,,,,,Cache County,,Cache County,,,,,,
1,"Chelan County, Washington, United States",Chelan County,Washington,Chelan County,Washington,United States,,,,,Chelan County,,Chelan County,,,,,,
2,"Minnow Creek Trail, Chelan County, Washington,...",Chelan County,Washington,Minnow Creek Trail,Chelan County,Washington,United States,,,,Chelan County,,,Chelan County,,,,,
3,"Pugh Ridge Trail #1438, Chelan County, Washing...",Chelan County,Washington,Pugh Ridge Trail #1438,Chelan County,Washington,United States,,,,Chelan County,,,Chelan County,,,,,
4,"Chelan County, Washington, United States",Chelan County,Washington,Chelan County,Washington,United States,,,,,Chelan County,,Chelan County,,,,,,


In [31]:
# westofrockies = ['Montana', 'Wyoming', 'New Mexico', 'Colorado', 'California', 'Arizona', 'Nevada', 'Washington', 'Oregon', 'Utah', 'Idaho']
# for i in geo_cols:
#     wf_locdf[f'{i}_state'] = wf_locdf[i].str.contains('|'.join(westofrockies)).astype('int')

In [32]:
# for i in geo_cols:
#     for j in range(len(wf_locdf)):
#                    if wf_locdf[f'{i}_state'][j] == 1:
#                        wf_locdf[f'{i}_state'][j]=wf_locdf[i][j]
#                    else:
#                        wf_locdf[f'{i}_state'][j]=np.nan
            
        

In [33]:
# wf_locdf['state'] = np.nan
# state_cols = ['geo1_state', 'geo2_state', 'geo3_state', 'geo4_state', 'geo5_state', 'geo6_state']
# for i in state_cols:
#     wf_locdf['state'].fillna(wf_locdf[i], inplace=True)

In [34]:
# wf_locdf.drop(columns=geo_cols + county_cols + state_cols, inplace=True)
# wf_locdf.drop(columns = ['geo6_county', 'geo7_county', 'geo7_state'], inplace=True)

In [35]:
# pd.set_option('display.max_colwidth', 1000)
# wf_locdf.loc[wf_locdf['county_name'].isnull()]  #now what

In [36]:
# #manually impute county nulls by google mapping it and seeing what's closest
# wf_locdf['county_name'][94] = 'Duchesne County'
# wf_locdf['county_name'][126] = 'Santa Cruz County'
# wf_locdf['state'][126] = 'Arizona'
# # index 135 cannot impute county (Utah, U.S.)
# wf_locdf['county_name'][188] = 'San Diego County'
# wf_locdf['state'][188] = 'California'
# wf_locdf['county_name'][288] = 'Los Angeles County'
# wf_locdf['county_name'][323] = 'Duchesne County'
# wf_locdf['county_name'][486] = 'Cochise County'
# wf_locdf['state'][486] = 'Arizona'
# wf_locdf['county_name'][589] = 'La Paz County'
# # index 649 cannot impute county (Utah, U.S.)
# wf_locdf['county_name'][698] = 'Los Angeles County'
# wf_locdf['county_name'][836] = 'Los Angeles County'
# wf_locdf['county_name'][1045] = 'Weber County'
# wf_locdf['county_name'][1055] = 'Los Angeles County'
# wf_locdf['county_name'][1177] = 'Los Angeles County'
# wf_locdf['county_name'][1500] = 'Los Angeles County'
# wf_locdf['county_name'][1504] = 'Los Angeles County'
# # index 1528 cannot impute county (Utah, U.S.)

In [37]:
# wf_locdf.loc[wf_locdf['county_name'].isnull()] #checks out

Got a bunch of not-States, dangit (ie Washington Camp, which is in Arizona).  Few, so fix manually.

In [38]:
# #got a bunch of not-states, dangit
# wf_locdf.state.value_counts()

In [39]:
# wf_locdf.loc[wf_locdf['state'].str.contains('County')] #phew easy fix

In [40]:
# wf_locdf['state'] = wf_locdf['state'].str.replace('County','')

In [41]:
# wf_locdf['state'] = wf_locdf['state'].str.replace('Riding and Hiking ','')
# wf_locdf['state'] = wf_locdf['state'].str.replace('Trail','')
# wf_locdf['state'] = wf_locdf['state'].str.replace('Washingtonia Drive','California') #San diego county
# wf_locdf['state'] = wf_locdf['state'].str.replace('Washington Camp','Arizona') #in AZ

In [42]:
# #strip leading and trailing whitespace for state and county_name
# wf_locdf['state'] = wf_locdf['state'].str.strip()
# wf_locdf['county_name'] = wf_locdf['county_name'].str.strip()

In [43]:
# wf_locdf.to_csv('../scratch_data/mtbs_FODpoints_DD/mtbs_reverse_geocode.csv', index=False)

## Merge reverse geocodes back to wildfires
Now wildfire df has state and county name
* Few NANs
* few obs from States not of interest (Wyoming, Colorado, New Mexico, Real Mexico).  However, these made the cut based on the lat/lon boundaries established earlier so these fire ignition points are very close to our established bounds (close to state border).
    * keep in air quality analysis, because they will impact pollution sensors even though fires not originating from 'State of interest'?

In [44]:
wf_locdf = pd.read_csv('../scratch_data/mtbs_FODpoints_DD/mtbs_reverse_geocode.csv')

In [45]:
wildfires.reset_index(drop=True, inplace=True)

In [46]:
wildfires_with_counties = pd.concat([wildfires, wf_locdf], axis=1)

In [47]:
wildfires_with_counties.head()

Unnamed: 0,event_id,irwinid,incid_name,incid_type,map_id,map_prog,asmnt_type,burnbndac,burnbndlat,burnbndlon,...,mod_t,high_t,comment,geometry,geometry2,lon,lat,reverse_geocode,county_name,state
0,UT4176411148820121004,,HELLS HOLLOW,Prescribed Fire,414,MTBS,Extended,4758,41.766,-111.507,...,345,616,,POINT (-111.507983362626 41.75847249141982),-111.507983362626 41.75847249141982),-111.507983,41.758472,"Cache County, Utah, United States",Cache County,Utah
1,WA4796812059020120908,,KLONE PEAK,Wildfire,441,MTBS,Extended,1634,47.972,-120.582,...,250,460,,POINT (-120.5815199717518 47.97210047512218),-120.5815199717518 47.97210047512218),-120.58152,47.9721,"Chelan County, Washington, United States",Chelan County,Washington
2,WA4797012073420120908,,BASALT,Wildfire,444,MTBS,Extended,1705,47.967,-120.746,...,250,500,,POINT (-120.7448841870875 47.96732496805843),-120.7448841870875 47.96732496805843),-120.744884,47.967325,"Minnow Creek Trail, Chelan County, Washington,...",Chelan County,Washington
3,WA4805112058820120908,,PYRAMID,Wildfire,449,MTBS,Extended,1962,48.044,-120.581,...,220,530,,POINT (-120.5812232399775 48.04473598140331),-120.5812232399775 48.04473598140331),-120.581223,48.044736,"Pugh Ridge Trail #1438, Chelan County, Washing...",Chelan County,Washington
4,WA4788612026820120908,,FIRST CREEK,Wildfire,454,MTBS,Extended,1434,47.895,-120.259,...,200,460,,POINT (-120.2604888608036 47.89597596325936),-120.2604888608036 47.89597596325936),-120.260489,47.895976,"Chelan County, Washington, United States",Chelan County,Washington


In [48]:
wildfires_with_counties.columns

Index(['event_id', 'irwinid', 'incid_name', 'incid_type', 'map_id', 'map_prog',
       'asmnt_type', 'burnbndac', 'burnbndlat', 'burnbndlon', 'ig_date',
       'pre_id', 'post_id', 'perim_id', 'dnbr_offst', 'dnbr_stddv', 'nodata_t',
       'incgreen_t', 'low_t', 'mod_t', 'high_t', 'comment', 'geometry',
       'geometry2', 'lon', 'lat', 'reverse_geocode', 'county_name', 'state'],
      dtype='object')

In [49]:
wildfires_with_counties.drop(['irwinid', 'dnbr_offst', 'dnbr_stddv', 'pre_id', 'post_id',
                'map_prog', 'map_id', 'comment', 'geometry2'], axis=1, inplace=True)

In [50]:
wildfires_with_counties.rename(columns = {'ig_date':'date'}, inplace=True)

In [51]:
wildfires_with_counties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588 entries, 0 to 1587
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   event_id         1588 non-null   object 
 1   incid_name       1588 non-null   object 
 2   incid_type       1588 non-null   object 
 3   asmnt_type       1588 non-null   object 
 4   burnbndac        1588 non-null   int64  
 5   burnbndlat       1588 non-null   float64
 6   burnbndlon       1588 non-null   float64
 7   date             1588 non-null   int32  
 8   perim_id         238 non-null    float64
 9   nodata_t         1588 non-null   int64  
 10  incgreen_t       1588 non-null   int64  
 11  low_t            1588 non-null   int64  
 12  mod_t            1588 non-null   int64  
 13  high_t           1588 non-null   int64  
 14  geometry         1588 non-null   object 
 15  lon              1588 non-null   float64
 16  lat              1588 non-null   float64
 17  reverse_geocod

In [52]:
#manually impute missing counties based on incident name & lat/lon score
wildfires_with_counties['county_name'][135] = 'Uintah County'
wildfires_with_counties['county_name'][649] = 'Uintah County'
wildfires_with_counties['county_name'][1528] = 'Uintah County'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wildfires_with_counties['county_name'][135] = 'Uintah County'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wildfires_with_counties['county_name'][649] = 'Uintah County'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wildfires_with_counties['county_name'][1528] = 'Uintah County'


## Merge wildfires df with smoke df
## EDIT: first add lat/lon to smoke before merging
Analyzing at county level or higher, but want lat/lon for every observation for mapping or potential clustering (clusters may be better than counties to group nearby observations)

### Adding lat/lon to smoke - geocodes from county_name
Code to generate county_key is at end of notebook (had sidelined because thought no longer useful).  Starting with completed county_key here:

Geocode based on county name AND STATE NAME.  For example, there is a Kings County CA and Kings County PA, so your first go at this did not pan out and you never even knew it until it was mapped out in Tableau and you had smoke measurements coming from the East coast.

## Smoke df: Need to make column with 'county_name, State' for geocoding
Otherwise you get counties from East Coast with shared names

In [53]:
smoke.state.value_counts() #yes filtered to what we want

CA    1829232
WA     669898
OR     469052
ID     224217
UT     177761
AZ     177436
NV     136140
Name: state, dtype: int64

In [54]:
smoke['county_state'] = smoke['county_name'] + ', ' + smoke['state']

In [55]:
smoke.head()

Unnamed: 0,state,statefp,countyfp,county_name,date,population,smoke_score,county_state
0,AZ,4,7,Gila County,20100604,460,1,"Gila County, AZ"
1,AZ,4,7,Gila County,20100604,821,1,"Gila County, AZ"
2,AZ,4,7,Gila County,20100604,1301,1,"Gila County, AZ"
3,AZ,4,7,Gila County,20100604,1583,1,"Gila County, AZ"
4,AZ,4,7,Gila County,20100604,969,1,"Gila County, AZ"


In [56]:
unique_locs = set(smoke.county_state)

In [57]:
county_key = pd.DataFrame(unique_locs, columns=['county_state'])

In [58]:
county_key.loc[county_key['county_state'].str.contains('King')]

Unnamed: 0,county_state
26,"King County, WA"
146,"Kings County, CA"


In [59]:
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent = 'helen.meigs@gmail.com')

In [60]:
# %%time
# county_key['lat']= county_key['county_state'].map(lambda x: geocoder.arcgis(x).latlng[0])
# county_key['lon'] = county_key['county_state'].map(lambda x: geocoder.arcgis(x).latlng[1])

# # wall time 6min

In [61]:
# county_key['county_name'] = county_key['county_state'].map(lambda x: x[:-4])

In [62]:
# county_key.to_csv('../scratch_data/smoke_2000_16_county_key.csv', index=False)

In [63]:
county_key = pd.read_csv('../scratch_data/smoke_2000_16_county_key.csv')

In [64]:
county_key.head()

Unnamed: 0,county_state,lat,lon,county_name
0,"Walla Walla County, WA",46.231254,-118.472687,Walla Walla County
1,"San Mateo County, CA",37.422133,-122.32863,San Mateo County
2,"Placer County, CA",39.059137,-120.75363,Placer County
3,"Modoc County, CA",41.576517,-120.736847,Modoc County
4,"Bear Lake County, ID",42.297151,-111.33,Bear Lake County


In [65]:
smoke = pd.merge(smoke, county_key, on=['county_state', 'county_name'])

In [68]:
smoke.rename(columns={'lat':'lat_smo', 'lon':'lon_smo'}, inplace=True)

In [70]:
smoke.drop(columns = ['statefp', 'countyfp', 'population','county_state'], axis=1, inplace=True)
smoke.drop_duplicates(inplace=True)

In [71]:
smoke.reset_index(drop=True, inplace=True)

In [72]:
smoke.head()

Unnamed: 0,state,county_name,date,smoke_score,lat_smo,lon_smo
0,AZ,Gila County,20100604,1,33.799665,-110.811659
1,AZ,Gila County,20100606,1,33.799665,-110.811659
2,AZ,Gila County,20100624,1,33.799665,-110.811659
3,AZ,Gila County,20100627,1,33.799665,-110.811659
4,AZ,Gila County,20100713,1,33.799665,-110.811659


## Back to merging wildfires with smoke df:

In [73]:
wildfires_with_counties.head()

Unnamed: 0,event_id,incid_name,incid_type,asmnt_type,burnbndac,burnbndlat,burnbndlon,date,perim_id,nodata_t,incgreen_t,low_t,mod_t,high_t,geometry,lon,lat,reverse_geocode,county_name,state
0,UT4176411148820121004,HELLS HOLLOW,Prescribed Fire,Extended,4758,41.766,-111.507,20121004,703803100000000.0,-970,-150,113,345,616,POINT (-111.507983362626 41.75847249141982),-111.507983,41.758472,"Cache County, Utah, United States",Cache County,Utah
1,WA4796812059020120908,KLONE PEAK,Wildfire,Extended,1634,47.972,-120.582,20120908,704502700000000.0,-970,-150,50,250,460,POINT (-120.5815199717518 47.97210047512218),-120.58152,47.9721,"Chelan County, Washington, United States",Chelan County,Washington
2,WA4797012073420120908,BASALT,Wildfire,Extended,1705,47.967,-120.746,20120908,704502700000000.0,-970,-150,60,250,500,POINT (-120.7448841870875 47.96732496805843),-120.744884,47.967325,"Minnow Creek Trail, Chelan County, Washington,...",Chelan County,Washington
3,WA4805112058820120908,PYRAMID,Wildfire,Extended,1962,48.044,-120.581,20120908,,-970,-150,120,220,530,POINT (-120.5812232399775 48.04473598140331),-120.581223,48.044736,"Pugh Ridge Trail #1438, Chelan County, Washing...",Chelan County,Washington
4,WA4788612026820120908,FIRST CREEK,Wildfire,Extended,1434,47.895,-120.259,20120908,704502700000000.0,-970,-150,50,200,460,POINT (-120.2604888608036 47.89597596325936),-120.260489,47.895976,"Chelan County, Washington, United States",Chelan County,Washington


In [74]:
def rename_latlon(df, name):
    df.rename(columns = {'lat':f'lat_{name}', 'lon':f'lon_{name}'}, inplace=True)
    
    
rename_latlon(smoke, 'smo')
rename_latlon(wildfires_with_counties, 'wf')

In [75]:
state_dict = {'Idaho':'ID', 'California': 'CA', 'Arizona':'AZ', 'Oregon':'OR', 
              'Washington':'WA', 'Nevada':'NV', 'Montana':'MT', 'Utah':'UT',
              'Wyoming':'WY', 'New Mexico':'NM', 'Colorado':'CO'}

wildfires_with_counties['state_abbrev'] = wildfires_with_counties['state'].map(state_dict)

In [76]:
wildfires_with_counties.rename(columns = {'state':'state_full', 'state_abbrev':'state',
                                         'reverse_geocode':'reverse_geocode_fire_ig'}, inplace=True)

In [77]:
wildfires_with_counties.tail(1)

Unnamed: 0,event_id,incid_name,incid_type,asmnt_type,burnbndac,burnbndlat,burnbndlon,date,perim_id,nodata_t,...,low_t,mod_t,high_t,geometry,lon_wf,lat_wf,reverse_geocode_fire_ig,county_name,state_full,state
1587,OR4309711823520110825,DSL COMPLEX,Wildfire,Initial,5976,43.068,-118.234,20110825,,-970,...,40,9999,9999,POINT (-118.2343864935406 43.06794965014317),-118.234386,43.06795,"Harney County, Oregon, United States",Harney County,Oregon,OR


In [78]:
smoke.tail(1)

Unnamed: 0,state,county_name,date,smoke_score,lat_smo,lon_smo
66592,NV,White Pine County,20160919,1,39.442161,-114.901646


In [79]:
inner_fires_smoke = pd.merge(smoke, wildfires_with_counties, on = ['county_name','state', 'date'], how='inner')
keepfires_smoke = pd.merge(smoke, wildfires_with_counties, on = ['county_name', 'state', 'date'], how='right')
outer_fires_smoke = pd.merge(smoke, wildfires_with_counties, on=['county_name', 'state', 'date'], how = 'outer')

See len of each join: inner join drops 100 fires (of 1588 - no good).  Maintaining all fires gives 2250 datapoints, but we do want pollution data for days in between fires, so the outer join of 83,000 may be what we have to work with. Especially because this is only fire IGNITION DATES, and we absolutely want to look at days immediately post-fire, and how they compare to baseline (long duration away from fire).  Exporting outer join for EDA, the limited versions are option for modeling.

In [80]:
print(len(smoke))
print(len(wildfires_with_counties))
print(len(smoke) + len(wildfires_with_counties))
print(len(outer_fires_smoke), len(keepfires_smoke), len(inner_fires_smoke))

66593
1588
68181
67561 2006 1197


In [81]:
inner_fires_smoke

Unnamed: 0,state,county_name,date,smoke_score,lat_smo,lon_smo,event_id,incid_name,incid_type,asmnt_type,...,nodata_t,incgreen_t,low_t,mod_t,high_t,geometry,lon_wf,lat_wf,reverse_geocode_fire_ig,state_full
0,AZ,Graham County,20110626,1,32.932678,-109.887499,AZ3307311037720110626,STANLEY,Wildfire,Extended,...,-970,-150,-10,102,250,POINT (-110.3297109946449 33.05281063846503),-110.329711,33.052811,"Graham County, Arizona, United States",Arizona
1,AZ,Graham County,20120316,1,32.932678,-109.887499,AZ3328910997820120316,HOOKER,Wildfire,Initial (SS),...,9999,9999,5,-9999,-9999,POINT (-109.9572221649203 33.2835725224978),-109.957222,33.283573,"Graham County, Arizona, United States",Arizona
2,AZ,Graham County,20130513,1,32.932678,-109.887499,AZ3343510971420130513,PINE SALT,Prescribed Fire,Initial,...,-970,-150,50,330,9999,POINT (-109.7197042422813 33.33716745372745),-109.719704,33.337167,"Black River Road, Graham County, Arizona, Unit...",Arizona
3,AZ,Graham County,20130513,3,32.932678,-109.887499,AZ3343510971420130513,PINE SALT,Prescribed Fire,Initial,...,-970,-150,50,330,9999,POINT (-109.7197042422813 33.33716745372745),-109.719704,33.337167,"Black River Road, Graham County, Arizona, Unit...",Arizona
4,AZ,Graham County,20130616,1,32.932678,-109.887499,AZ3339910989520130616,CREEK,Wildfire,Initial,...,-970,-150,50,330,9999,POINT (-109.841033210156 33.39446296079526),-109.841033,33.394463,"Graham County, Arizona, United States",Arizona
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,NV,White Pine County,20130701,1,39.442161,-114.901646,NV4003011445220130701,NORTH CREEK,Wildfire,Extended,...,-970,-150,70,218,410,POINT (-114.4792105431762 40.01817586298156),-114.479211,40.018176,"White Pine County, Nevada, United States",Nevada
1193,NV,White Pine County,20140716,1,39.442161,-114.901646,NV3924811413320140716,HAMPTON,Wildfire,Extended,...,-970,-150,50,319,610,POINT (-114.1265650310761 39.27945844585704),-114.126565,39.279458,"White Pine County, Nevada, United States",Nevada
1194,NV,White Pine County,20140724,2,39.442161,-114.901646,NV4000011456620140724,SAMPSON,Wildfire,Extended,...,-970,-150,50,324,620,POINT (-114.5689564363166 40.00122744315588),-114.568956,40.001227,"White Pine County, Nevada, United States",Nevada
1195,NV,White Pine County,20160808,1,39.442161,-114.901646,NV3904811430620160808,STRAWBERRY,Wildfire,Extended,...,-970,-150,15,290,520,POINT (-114.276930874846 39.05990748074837),-114.276931,39.059907,"National Forest Development Road 456, White Pi...",Nevada


In [82]:
outer_fires_smoke.drop_duplicates(inplace=True)
outer_fires_smoke.reset_index(drop=True, inplace=True)

In [None]:
#outer_fires_smoke.to_csv('../scratch_data/MTBS_merge_smoke_inProgress.csv', index=False)