In [1]:
import pandas as pd
import numpy as np
import itertools

#### Read in CSVs

In [2]:
# Raw PFW public export for 1988-1995 (year range taken from the file name).
pfw_df_1988_1995_orig = pd.read_csv('../data/PFW_1988_1995_public.csv')

In [3]:
# Raw PFW public export for 1996-2000 (year range taken from the file name).
pfw_df_1996_2000_orig = pd.read_csv('../data/PFW_1996_2000_public.csv')

In [4]:
# Raw PFW public export for 2001-2005 (year range taken from the file name).
pfw_df_2001_2005_orig = pd.read_csv('../data/PFW_2001_2005_public.csv')

In [5]:
# Raw PFW public export for 2006-2010 (year range taken from the file name).
pfw_df_2006_2010_orig = pd.read_csv('../data/PFW_2006_2010_public.csv')

In [6]:
# Raw PFW public export for 2011-2015 (year range taken from the file name).
pfw_df_2011_2015_orig = pd.read_csv('../data/PFW_2011_2015_public.csv')

In [7]:
# Raw PFW public export for 2016-2020 (year range taken from the file name).
pfw_df_2016_2020_orig = pd.read_csv('../data/PFW_2016_2020_public.csv')

In [8]:
# Raw PFW public export for 2021 (year taken from the file name).
pfw_df_2021_orig = pd.read_csv('../data/PFW_2021_public.csv')

In [9]:
# Make an independent working copy of every raw frame so the "_orig" frames
# stay exactly as they were read from disk.
(pfw_df_1988_1995,
 pfw_df_1996_2000,
 pfw_df_2001_2005,
 pfw_df_2006_2010,
 pfw_df_2011_2015,
 pfw_df_2016_2020,
 pfw_df_2021) = (raw_frame.copy(deep=True)
                 for raw_frame in (pfw_df_1988_1995_orig,
                                   pfw_df_1996_2000_orig,
                                   pfw_df_2001_2005_orig,
                                   pfw_df_2006_2010_orig,
                                   pfw_df_2011_2015_orig,
                                   pfw_df_2016_2020_orig,
                                   pfw_df_2021_orig))

#### Fix and Filter Column Headers

In [10]:
# Lower-case every column header, in place, on each working frame so the same
# header names can be used across all files.
for frame in (pfw_df_1988_1995,
              pfw_df_1996_2000,
              pfw_df_2001_2005,
              pfw_df_2006_2010,
              pfw_df_2011_2015,
              pfw_df_2016_2020,
              pfw_df_2021):
    frame.columns = frame.columns.str.lower()

In [11]:
column_set = [pfw_df_1988_1995.columns,
              pfw_df_1996_2000.columns,
              pfw_df_2001_2005.columns,
              pfw_df_2006_2010.columns,
              pfw_df_2011_2015.columns,
              pfw_df_2016_2020.columns,
              pfw_df_2021.columns]

# Compare column headers of each consecutive pair of frames, position by position.
# The indices printed in the messages are positions in `column_set`.
for right, (left_cols, right_cols) in enumerate(zip(column_set, column_set[1:]), start=1):
    left = right - 1
    same_length = len(left_cols) == len(right_cols)
    if not same_length:
        print('Mismatching column lengths at indices', left, 'and ' + str(right) + '.')
    # zip() stops at the shorter header list, so unequal lengths are safe here.
    for column_index, (left_name, right_name) in enumerate(zip(left_cols, right_cols)):
        if left_name == right_name:
            continue
        if same_length:
            # Same number of columns: report every differing position, and say
            # which pair of frames and which header names are involved.
            print('Mismatching column headers between indices ' + str(left) + ' and ' + str(right)
                  + ' at column index ' + str(column_index) + ': "' + str(left_name) + '" vs "'
                  + str(right_name) + '"')
        else:
            # Different number of columns: everything after the first difference
            # is shifted, so only the first differing position is reported.
            print('Mismatching column headers at column index '
                  + str(column_index) + ': "' + str(left_name) + '" vs "' +
                  str(right_name) + '"')
            break

Mismatching column lengths at indices 5 and 6.
Mismatching column headers at column index 13: "plus_code" vs "valid"


In [12]:
print(column_set[5])
print(column_set[6])

Index(['loc_id', 'latitude', 'longitude', 'subnational1_code',
       'entry_technique', 'sub_id', 'obs_id', 'month', 'day', 'year',
       'proj_period_id', 'species_code', 'how_many', 'plus_code', 'valid',
       'reviewed', 'day1_am', 'day1_pm', 'day2_am', 'day2_pm',
       'effort_hrs_atleast', 'snow_dep_atleast', 'data_entry_method'],
      dtype='object')
Index(['loc_id', 'latitude', 'longitude', 'subnational1_code',
       'entry_technique', 'sub_id', 'obs_id', 'month', 'day', 'year',
       'proj_period_id', 'species_code', 'how_many', 'valid', 'reviewed',
       'day1_am', 'day1_pm', 'day2_am', 'day2_pm', 'effort_hrs_atleast',
       'snow_dep_atleast', 'data_entry_method'],
      dtype='object')


In [13]:
# Columns to keep, in the order they should appear. Headers not listed here are
# dropped; that covers 'plus_code' (obsolete, and absent from the 2021 file) as
# well as 'entry_technique' and 'reviewed'.
valid_headers = [
    'proj_period_id', 'sub_id', 'obs_id',
    'month', 'day', 'year',
    'species_code', 'how_many', 'valid',
    'day1_am', 'day1_pm', 'day2_am', 'day2_pm',
    'effort_hrs_atleast', 'snow_dep_atleast', 'data_entry_method',
    'subnational1_code', 'loc_id', 'latitude', 'longitude',
]

In [14]:
# Reduce every frame to the same ordered set of columns (valid_headers).
(pfw_df_1988_1995,
 pfw_df_1996_2000,
 pfw_df_2001_2005,
 pfw_df_2006_2010,
 pfw_df_2011_2015,
 pfw_df_2016_2020,
 pfw_df_2021) = (frame[valid_headers]
                 for frame in (pfw_df_1988_1995,
                               pfw_df_1996_2000,
                               pfw_df_2001_2005,
                               pfw_df_2006_2010,
                               pfw_df_2011_2015,
                               pfw_df_2016_2020,
                               pfw_df_2021))

#### Combine the Separate DataFrames

In [15]:
# Stack the per-period frames, oldest first, into one observations frame.
frames_by_period = [
    pfw_df_1988_1995,
    pfw_df_1996_2000,
    pfw_df_2001_2005,
    pfw_df_2006_2010,
    pfw_df_2011_2015,
    pfw_df_2016_2020,
    pfw_df_2021,
]
pfw_df_1988_2021 = pd.concat(frames_by_period)

In [16]:
pfw_df_1988_2021.shape

(36971663, 20)

#### Remove Invalid Observations

In [17]:
# Keep only the rows flagged valid (valid == 1).
pfw_df_1988_2021 = pfw_df_1988_2021[pfw_df_1988_2021['valid'].eq(1)]

In [18]:
pfw_df_1988_2021.shape

(36782264, 20)

In [19]:
# Every remaining row has valid == 1, so the flag column carries no information.
pfw_df_1988_2021 = pfw_df_1988_2021.drop(columns='valid')

#### Retain Only US Observations (exclude Canada and all other non-US locations)

In [20]:
# Give the subnational code column a shorter name.
pfw_df_1988_2021 = pfw_df_1988_2021.rename({'subnational1_code': 'state'}, axis='columns')

In [21]:
sorted(pfw_df_1988_2021.state.unique().tolist())[:20]

['CA-AB',
 'CA-BC',
 'CA-MB',
 'CA-NB',
 'CA-NL',
 'CA-NS',
 'CA-NT',
 'CA-NU',
 'CA-ON',
 'CA-PE',
 'CA-QC',
 'CA-SK',
 'CA-YT',
 'MX-JAL',
 'MX-NAY',
 'NZ-OTA',
 'PM-',
 'US-AK',
 'US-AL',
 'US-AR']

In [22]:
# Keep only rows whose code starts with 'US-'. na=False treats a missing code as
# "not US"; without it a NaN in the mask would make .loc raise.
pfw_df_1988_2021 = pfw_df_1988_2021.loc[pfw_df_1988_2021.state.str.startswith('US-', na=False)]

In [23]:
pfw_df_1988_2021.shape

(32112589, 19)

In [24]:
sorted(pfw_df_1988_2021.state.unique().tolist())[:10]

['US-AK',
 'US-AL',
 'US-AR',
 'US-AZ',
 'US-CA',
 'US-CO',
 'US-CT',
 'US-DC',
 'US-DE',
 'US-FL']

In [25]:
pfw_df_1988_2021.state.nunique()

51

In [26]:
# Remove the 'US-' country prefix from each state code.
pfw_df_1988_2021['state'] = pfw_df_1988_2021['state'].replace(to_replace='US-',
                                                              value='',
                                                              regex=True)

In [27]:
# Use the observation identifier as the row index, then preview the first rows.
pfw_df_1988_2021 = pfw_df_1988_2021.set_index(keys='obs_id')
pfw_df_1988_2021.head(n=5)

Unnamed: 0_level_0,proj_period_id,sub_id,month,day,year,species_code,how_many,day1_am,day1_pm,day2_am,day2_pm,effort_hrs_atleast,snow_dep_atleast,data_entry_method,state,loc_id,latitude,longitude
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
OBS4781763,PFW_1995,S338425,12,17,1994,blujay,1,0.0,1.0,1.0,1.0,1.001,0.0,paper,AR,L20416,35.02262,-93.47239
OBS4781764,PFW_1995,S338425,12,17,1994,brnthr,1,0.0,1.0,1.0,1.0,1.001,0.0,paper,AR,L20416,35.02262,-93.47239
OBS5488036,PFW_1995,S338425,12,17,1994,carchi,2,0.0,1.0,1.0,1.0,1.001,0.0,paper,AR,L20416,35.02262,-93.47239
OBS7279835,PFW_1995,S338425,12,17,1994,haiwoo,1,0.0,1.0,1.0,1.0,1.001,0.0,paper,AR,L20416,35.02262,-93.47239
OBS7279836,PFW_1995,S338425,12,17,1994,houspa,3,0.0,1.0,1.0,1.0,1.001,0.0,paper,AR,L20416,35.02262,-93.47239


### Create a dimension table for Data Entry Methods

#### Consolidate Data Entry Methods Into: 'Paper = 0', 'Web = 1', and 'Mobile = 2'

In [28]:
# Any entry-method string mentioning paper / web / mobile is replaced by the
# integer code 0 / 1 / 2 respectively.
entry_method_codes = {'.*[Pp]aper.*': 0, '.*[Ww]eb.*': 1, '.*[Mm]obile.*': 2}
pfw_df_1988_2021['data_entry_method'] = pfw_df_1988_2021['data_entry_method'].replace(
    to_replace=list(entry_method_codes.keys()),
    value=list(entry_method_codes.values()),
    regex=True,
)

#### Create the dimension table

In [29]:
# Row position is the code used in data_entry_method: 0 = Paper, 1 = Web, 2 = Mobile.
entry_methods_dim = pd.DataFrame({'entry_method': ['Paper', 'Web', 'Mobile']})

In [30]:
entry_methods_dim

Unnamed: 0,entry_method
0,Paper
1,Web
2,Mobile


### Create a dimension table for Effort in Hours

#### Replace Effort Values with Those Corresponding to the Associated Table

In [31]:
# Map the raw "x.001" effort markers onto their whole-hour lower bounds.
effort_lower_bounds = {0.001: 0, 1.001: 1, 4.001: 4, 8.001: 8}
pfw_df_1988_2021['effort_hrs_atleast'] = pfw_df_1988_2021['effort_hrs_atleast'].replace(
    to_replace=list(effort_lower_bounds.keys()),
    value=list(effort_lower_bounds.values()),
)

In [32]:
# One row per distinct effort value found in the observations, relabelled as a
# human-readable hour range.
effort_dim = pd.DataFrame({'effort_hrs': pfw_df_1988_2021['effort_hrs_atleast'].unique()})
effort_dim['effort_hrs'] = effort_dim['effort_hrs'].replace(
    to_replace=[0, 1, 4, 8],
    value=['0-1', '1-4', '4-8', '8+'],
)

In [33]:
# Discard the row produced by missing effort values.
effort_dim = effort_dim.dropna(axis='index')

In [34]:
# Order the rows by label and key each one by the numeric code stored in
# effort_hrs_atleast (0, 1, 4, 8). The code is looked up from the label itself
# rather than assigned by row position, so a bucket that is absent from the data
# cannot shift the codes of the others. An unknown label raises KeyError.
effort_codes = {'0-1': 0, '1-4': 1, '4-8': 4, '8+': 8}
effort_dim = effort_dim.sort_values(by='effort_hrs').reset_index(drop=True)
effort_dim.index = [effort_codes[label] for label in effort_dim['effort_hrs']]
effort_dim

Unnamed: 0,effort_hrs
0,0-1
1,1-4
4,4-8
8,8+


### Read in the PFW Data Dictionary and Create a Common Species Name Table

In [35]:
# Load the 'Species Codes' sheet of the FeederWatch data dictionary.
# header=1 takes the column names from the second row of the sheet.
pfw_data_dict_orig = pd.read_excel('../data/FeederWatch_Data_Dictionary.xlsx', 
                                   sheet_name='Species Codes', 
                                   header=1)

In [36]:
# Work on an independent (deep) copy so the sheet as loaded stays untouched.
pfw_data_dict = pfw_data_dict_orig.copy()

In [37]:
# Lower-case the headers, keep only the columns of interest, and rename
# 'order1' to 'order'.
pfw_data_dict.columns = pfw_data_dict.columns.str.lower()
kept_dict_columns = ['species_code', 'sci_name', 'primary_com_name',
                     'order1', 'family', 'extinct', 'extinct_year']
pfw_data_dict = (
    pfw_data_dict[kept_dict_columns]
    .rename({'order1': 'order'}, axis='columns')
)

In [38]:
pfw_data_dict

Unnamed: 0,species_code,sci_name,primary_com_name,order,family,extinct,extinct_year
0,scbtan2,Heterospingus xanthopygius,Scarlet-browed Tanager,Passeriformes,Thraupidae (Tanagers and Allies),0,
1,fabtan1,Pipraeidea melanonota,Fawn-breasted Tanager,Passeriformes,Thraupidae (Tanagers and Allies),0,
2,eurcoo,Fulica atra,Eurasian Coot,Gruiformes,"Rallidae (Rails, Gallinules, and Coots)",0,
3,cubgra,Tiaris canorus,Cuban Grassquit,Passeriformes,Thraupidae (Tanagers and Allies),0,
4,eurcur,Numenius arquata,Eurasian Curlew,Charadriiformes,Scolopacidae (Sandpipers and Allies),0,
...,...,...,...,...,...,...,...
15961,anteup4,Euphonia musica flavifrons,Antillean Euphonia (Lesser),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",0,
15962,reisee4,Crithagra reichardi reichardi,Reichard's Seedeater (Reichard's),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",0,
15963,mouser2,Chrysocorythus estherae [estherae Group],Mountain Serin (Mountain),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",0,
15964,y01060,Serinus pusillus/serinus/syriacus,European/Fire-fronted/Syrian Serin,Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",0,


### Migrate extinction info to a new table to reduce redundancy

In [39]:
pfw_data_dict.loc[pfw_data_dict.extinct != 0]

Unnamed: 0,species_code,sci_name,primary_com_name,order,family,extinct,extinct_year
53,guspet,Oceanodroma macrodactyla,Guadalupe Storm-Petrel,Procellariiformes,Hydrobatidae (Storm-Petrels),1,1912.0
61,hawoo,Moho nobilis,Hawaii Oo,Passeriformes,Mohoidae (Hawaiian Honeyeaters),1,1898.0
78,kakawa,Paroreomyza flammea,Kakawahie,Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",1,1963.0
83,kioea,Chaetoptila angustipluma,Kioea,Passeriformes,Mohoidae (Hawaiian Honeyeaters),1,1900.0
84,kongro,Chloridops kona,Kona Grosbeak,Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)",1,1894.0
...,...,...,...,...,...,...,...
15255,norgrd1,Alopecoenas norfolkensis,Norfolk Ground-Dove,Columbiformes,Columbidae (Pigeons and Doves),1,1800.0
15292,braeme2,Chlorostilbon bracei bracei,Brace's Emerald (Brace's),Caprimulgiformes,Trochilidae (Hummingbirds),1,1877.0
15437,soipio1,Turnagra capensis,South Island Piopio,Passeriformes,Oriolidae (Old World Orioles),1,1897.0
15676,braeme3,Chlorostilbon bracei elegans,Brace's Emerald (Caribbean),Caprimulgiformes,Trochilidae (Hummingbirds),1,1860.0


In [40]:
# Species whose extinct flag is not 0, reduced to code + extinction year.
is_extinct = pfw_data_dict['extinct'].ne(0)
extinct_df = (
    pfw_data_dict
    .loc[is_extinct, ['species_code', 'extinct_year']]
    .reset_index(drop=True)
)

In [41]:
extinct_df

Unnamed: 0,species_code,extinct_year
0,guspet,1912.0
1,hawoo,1898.0
2,kakawa,1963.0
3,kioea,1900.0
4,kongro,1894.0
...,...,...
124,norgrd1,1800.0
125,braeme2,1877.0
126,soipio1,1897.0
127,braeme3,1860.0


In [42]:
# Missing years become the sentinel -1 so the column can be stored as int32;
# the rows are then ordered by species code.
extinct_df = (
    extinct_df
    .assign(extinct_year=lambda frame: frame['extinct_year'].fillna(-1))
    .astype({'extinct_year': 'int32'})
    .sort_values(by='species_code')
)
extinct_df

Unnamed: 0,species_code,extinct_year
112,agurew1,1995
47,akepa2,1900
102,alagre1,1950
82,amaui,1825
92,apapan2,1923
...,...,...
111,trimoo2,1900
72,ulahaw,1937
121,verfly8,1987
98,wairai1,1944


In [43]:
# species_code -> extinction year
extinct_dict = dict(zip(extinct_df['species_code'], extinct_df['extinct_year']))

In [44]:
# Peek at the first five entries without printing the whole mapping.
first_five_codes = itertools.islice(extinct_dict, 5)
dict_head = {code: extinct_dict[code] for code in first_five_codes}
print(dict_head)

{'agurew1': 1995, 'akepa2': 1900, 'alagre1': 1950, 'amaui': 1825, 'apapan2': 1923}


In [45]:
# Dimension table indexed by species code, with one column: the extinction year.
extinct_species_dim = pd.DataFrame(data=list(extinct_dict.values()),
                                   index=list(extinct_dict.keys()),
                                   columns=['extinct_year'])
extinct_species_dim

Unnamed: 0,extinct_year
agurew1,1995
akepa2,1900
alagre1,1950
amaui,1825
apapan2,1923
...,...
trimoo2,1900
ulahaw,1937
verfly8,1987
wairai1,1944


In [46]:
# The extinction details now live in their own table; drop the two trailing
# columns so only species_code through family remain.
pfw_data_dict = pfw_data_dict.drop(columns=['extinct', 'extinct_year'])

In [47]:
pfw_data_dict

Unnamed: 0,species_code,sci_name,primary_com_name,order,family
0,scbtan2,Heterospingus xanthopygius,Scarlet-browed Tanager,Passeriformes,Thraupidae (Tanagers and Allies)
1,fabtan1,Pipraeidea melanonota,Fawn-breasted Tanager,Passeriformes,Thraupidae (Tanagers and Allies)
2,eurcoo,Fulica atra,Eurasian Coot,Gruiformes,"Rallidae (Rails, Gallinules, and Coots)"
3,cubgra,Tiaris canorus,Cuban Grassquit,Passeriformes,Thraupidae (Tanagers and Allies)
4,eurcur,Numenius arquata,Eurasian Curlew,Charadriiformes,Scolopacidae (Sandpipers and Allies)
...,...,...,...,...,...
15961,anteup4,Euphonia musica flavifrons,Antillean Euphonia (Lesser),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)"
15962,reisee4,Crithagra reichardi reichardi,Reichard's Seedeater (Reichard's),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)"
15963,mouser2,Chrysocorythus estherae [estherae Group],Mountain Serin (Mountain),Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)"
15964,y01060,Serinus pusillus/serinus/syriacus,European/Fire-fronted/Syrian Serin,Passeriformes,"Fringillidae (Finches, Euphonias, and Allies)"


### Reduce redundancy by creating a Family Subgroups table

In [48]:
# Replace missing values with the literal string 'NaN' so every cell is a string.
pfw_data_dict = pfw_data_dict.fillna(value='NaN')

In [49]:
# Distinct family strings in alphabetical order.
family_list = sorted(pfw_data_dict['family'].drop_duplicates())

In [50]:
# Split each "Name (subgroups)" string into name -> subgroups. The text before
# the first '(' is the name; the following segment, minus its final character
# (the closing parenthesis), is the subgroup description. The 'NaN' placeholder
# is skipped.
family_dict = {
    pieces[0].strip(): pieces[1][:-1]
    for pieces in (entry.split('(') for entry in family_list if entry != 'NaN')
}

In [51]:
# Dimension table indexed by family name, with the subgroup description as its
# only column.
family_subgroups_dim = pd.Series(family_dict, name='family_subgroups').to_frame()

In [52]:
family_subgroups_dim

Unnamed: 0,family_subgroups
Acanthisittidae,New Zealand Wrens
Acanthizidae,Thornbills and Allies
Accipitridae,"Hawks, Eagles, and Kites"
Acrocephalidae,Reed Warblers and Allies
Aegithalidae,Long-tailed Tits
...,...
Vangidae,"Vangas, Helmetshrikes, and Allies"
Viduidae,Indigobirds
Vireonidae,"Vireos, Shrike-Babblers, and Erpornis"
Zeledoniidae,Wrenthrush


In [53]:
# Strip the trailing " (subgroups)" part so only the family name remains.
# The pattern is a raw string: in a normal string literal '\(' and '\)' are
# invalid escape sequences, which newer Python versions warn about.
pfw_data_dict.family = pfw_data_dict.family.replace(to_replace=r' \(.*\)', value='', regex=True)

In [54]:
pfw_data_dict

Unnamed: 0,species_code,sci_name,primary_com_name,order,family
0,scbtan2,Heterospingus xanthopygius,Scarlet-browed Tanager,Passeriformes,Thraupidae
1,fabtan1,Pipraeidea melanonota,Fawn-breasted Tanager,Passeriformes,Thraupidae
2,eurcoo,Fulica atra,Eurasian Coot,Gruiformes,Rallidae
3,cubgra,Tiaris canorus,Cuban Grassquit,Passeriformes,Thraupidae
4,eurcur,Numenius arquata,Eurasian Curlew,Charadriiformes,Scolopacidae
...,...,...,...,...,...
15961,anteup4,Euphonia musica flavifrons,Antillean Euphonia (Lesser),Passeriformes,Fringillidae
15962,reisee4,Crithagra reichardi reichardi,Reichard's Seedeater (Reichard's),Passeriformes,Fringillidae
15963,mouser2,Chrysocorythus estherae [estherae Group],Mountain Serin (Mountain),Passeriformes,Fringillidae
15964,y01060,Serinus pusillus/serinus/syriacus,European/Fire-fronted/Syrian Serin,Passeriformes,Fringillidae


### Replace the numerical Data Dictionary index with species_code

In [55]:
# Key the species dimension table by species_code instead of a row number.
data_dict_dim = pfw_data_dict.set_index(keys='species_code')
data_dict_dim

Unnamed: 0_level_0,sci_name,primary_com_name,order,family
species_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
scbtan2,Heterospingus xanthopygius,Scarlet-browed Tanager,Passeriformes,Thraupidae
fabtan1,Pipraeidea melanonota,Fawn-breasted Tanager,Passeriformes,Thraupidae
eurcoo,Fulica atra,Eurasian Coot,Gruiformes,Rallidae
cubgra,Tiaris canorus,Cuban Grassquit,Passeriformes,Thraupidae
eurcur,Numenius arquata,Eurasian Curlew,Charadriiformes,Scolopacidae
...,...,...,...,...
anteup4,Euphonia musica flavifrons,Antillean Euphonia (Lesser),Passeriformes,Fringillidae
reisee4,Crithagra reichardi reichardi,Reichard's Seedeater (Reichard's),Passeriformes,Fringillidae
mouser2,Chrysocorythus estherae [estherae Group],Mountain Serin (Mountain),Passeriformes,Fringillidae
y01060,Serinus pusillus/serinus/syriacus,European/Fire-fronted/Syrian Serin,Passeriformes,Fringillidae


### Export Tables as CSVs

In [56]:
# Write the species dimension table; index=True keeps the species_code index as the first CSV column.
data_dict_dim.to_csv('../data/data_dict_dim.csv', index=True)

In [57]:
# Write the family subgroups table (previously data_dict_dim was written to this
# file by mistake). index=True keeps the family name index as the first CSV column.
family_subgroups_dim.to_csv('../data/family_subgroups_dim.csv', index=True)

In [58]:
# Write the extinct species table (previously data_dict_dim was written to this
# file by mistake). index=True keeps the species code index as the first CSV column.
extinct_species_dim.to_csv('../data/extinct_species_dim.csv', index=True)

In [59]:
# Write the effort table (previously data_dict_dim was written to this file by
# mistake). index=True keeps the numeric effort code index as the first CSV column.
effort_dim.to_csv('../data/effort_dim.csv', index=True)

In [60]:
# Write the entry methods table (previously data_dict_dim was written to this
# file by mistake). index=True keeps the numeric method code index as the first CSV column.
entry_methods_dim.to_csv('../data/entry_methods_dim.csv', index=True)

In [61]:
# Write the filtered observations; index=True keeps the obs_id index as the first CSV column.
pfw_df_1988_2021.to_csv('../data/observations_fact.csv', index=True)