# Analyses on one county
Demonstration of the analysis steps applied to a single county's frequency data.

In [1]:
import os
import pandas as pd

In [2]:
# Directory containing the per-county result CSV files.
results_dir = "county_results"
# os.listdir() returns entries in arbitrary order, so sort before slicing to
# make the selected files (and therefore counties[0]) reproducible across
# runs and machines. Match on the '.csv' extension rather than on any name
# that merely happens to end in the letters 'csv'.
counties = sorted(
    name for name in os.listdir(results_dir) if name.endswith('.csv')
)[:20]

In [3]:
def show_unique(series):
    '''
    Collect the distinct elements of a series,
    kept in order of first appearance
    '''

    # dict keys are unique and remember insertion order, so building a
    # dict from the elements drops repeats without reordering anything
    return list(dict.fromkeys(series))

### Remove unwanted weeks

In [4]:
# Week labels ('<Month>-<n>') to keep: all four of November and December,
# plus the first two of January.
november_desired = ['November-' + str(week) for week in (1, 2, 3, 4)]
december_desired = ['December-' + str(week) for week in (1, 2, 3, 4)]
january_desired = ['January-' + str(week) for week in (1, 2)]
desired_weeks = [*november_desired, *december_desired, *january_desired]

In [5]:
# Work with the first county file from the listing.
county_csv = counties[0]
# The CSV's 'Unnamed: 0' column is used as the row index.
all_weeks_data = pd.read_csv(
    os.path.join(results_dir, county_csv),
    index_col='Unnamed: 0',
)
# Keep only the rows whose 'monthQt' label is one of the desired weeks.
desired_weeks_data = all_weeks_data.loc[all_weeks_data['monthQt'].isin(desired_weeks)]

In [6]:
# Show the distinct week labels before and after the week filter.
for heading, frame in (
    ('Weeks in dataset (before):', all_weeks_data),
    ('Weeks in dataset (after):', desired_weeks_data),
):
    print(heading, show_unique(frame['monthQt']))

Weeks in dataset (before): ['January-1', 'January-2', 'January-3', 'January-4', 'February-1', 'February-2', 'February-3', 'February-4', 'March-1', 'March-2', 'March-3', 'March-4', 'April-1', 'April-2', 'April-3', 'April-4', 'May-1', 'May-2', 'May-3', 'May-4', 'June-1', 'June-2', 'June-3', 'June-4', 'July-1', 'July-2', 'July-3', 'July-4', 'August-1', 'August-2', 'August-3', 'August-4', 'September-1', 'September-2', 'September-3', 'September-4', 'October-1', 'October-2', 'October-3', 'October-4', 'November-1', 'November-2', 'November-3', 'November-4', 'December-1', 'December-2', 'December-3', 'December-4']
Weeks in dataset (after): ['January-1', 'January-2', 'November-1', 'November-2', 'November-3', 'November-4', 'December-1', 'December-2', 'December-3', 'December-4']


### Remove non-species designations from data

In [7]:
# Drop every row whose common name is not a full species. One regular
# expression (a raw string, so the backslash reaches the regex engine intact
# instead of forming an invalid string escape) flags a name containing any of:
#   \.      a literal dot, i.e. a "spuh" such as 'goose sp.'
#   /       a forward slash, i.e. an uncertain ID such as 'Cackling/Canada Goose'
#   hybrid  the marker for a hybrid
# Negating the combined match is equivalent to requiring that none of the
# three individual patterns match.
clean_data = desired_weeks_data.loc[
    ~desired_weeks_data['comName'].str.contains(r'\.|/|hybrid')
]

In [8]:
# Show the distinct common names before and after removing non-species rows.
for heading, frame in (
    ('Species in dataset (before):', desired_weeks_data),
    ('Species in dataset (after):', clean_data),
):
    print(heading, show_unique(frame['comName']))

Species in dataset (before): ['Emperor Goose', 'Snow Goose', 'Greater White-fronted Goose', 'Brant', 'Cackling Goose', 'Canada Goose', 'Cackling/Canada Goose', 'goose sp.', 'Trumpeter Swan', 'Tundra Swan', 'Trumpeter/Tundra Swan', 'Northern Shoveler', 'Gadwall', 'Eurasian Wigeon', 'American Wigeon', 'Mallard', 'Northern Pintail', 'Green-winged Teal', 'dabbling duck sp.', 'Greater Scaup', 'Lesser Scaup', 'Greater/Lesser Scaup', "Steller's Eider", 'Spectacled Eider', 'King Eider', 'Common Eider', 'eider sp.', 'Harlequin Duck', 'Surf Scoter', 'White-winged Scoter', 'Black Scoter', 'scoter sp.', 'Long-tailed Duck', 'Bufflehead', 'Common Goldeneye', "Barrow's Goldeneye", 'Common Merganser', 'Red-breasted Merganser', 'Common/Red-breasted Merganser', 'merganser sp.', 'duck sp.', 'Willow Ptarmigan', 'Rock Ptarmigan', 'ptarmigan sp.', 'Horned Grebe', 'Red-necked Grebe', 'grebe sp.', 'Sandhill Crane', 'Black Oystercatcher', 'Black-bellied Plover', 'Pacific Golden-Plover', 'American/Pacific Golde

### Remove species already on one's own life list

In [9]:
life_data = pd.read_csv('csvs/tessa-lifelist.csv')

# Break the "Species" column into two columns at the first ' - ' (a hyphen
# with one space on either side). n=1 limits the split to a single cut, so a
# name that itself contains ' - ' cannot produce a third column and break the
# two-column assignment.
life_data[['Common Name', 'Scientific Name']] = life_data['Species'].str.split(
    ' - ', n=1, expand=True
)


In [10]:
# Common names of the species already on the life list.
my_lifelist = life_data['Common Name'].tolist()
# Keep only the rows whose species is not on that list.
species_removed_data = clean_data.loc[~clean_data['comName'].isin(my_lifelist)]

In [11]:
# Count the distinct species before and after removing life-list species.
for heading, frame in (
    ('Number of species before:', clean_data),
    ('Number of species after:', species_removed_data),
):
    print(heading, len(show_unique(frame['comName'])))

Number of species before: 165
Number of species after: 68


### Average frequencies over multiple weeks

In [12]:
# Distinct species, in order of first appearance.
species = show_unique(species_removed_data['comName'])

# Mean frequency per species across all remaining weeks. A single groupby
# replaces growing the frame one row at a time with a full boolean filter per
# species. sort=False keeps the groups in order of first appearance (the same
# order as `species`), and reset_index() turns 'comName' back into a column
# with a 0..n-1 integer index.
averaged_freqs = (
    species_removed_data
    .groupby('comName', sort=False)['frequency']
    .mean()
    .reset_index()
)

In [13]:
# Compare the row counts before and after averaging over weeks.
for heading, frame in (
    ('Rows before (number of species * number of weeks):', species_removed_data),
    ('Rows after (number of species):', averaged_freqs),
):
    print(heading, frame.shape[0])

Rows before (number of species * number of weeks): 680
Rows after (number of species): 68


### Eliminate species with frequency at or below a threshold

In [14]:
# Average frequency a species must strictly exceed to be kept.
threshold = 0.1
averaged_freqs.loc[averaged_freqs['frequency'].gt(threshold)]

Unnamed: 0,comName,frequency
0,Emperor Goose,0.222419
3,Steller's Eider,0.141393
15,Rock Sandpiper,0.132314
50,Northern Fulmar,0.101936
58,Black-billed Magpie,0.194961
64,Gray-crowned Rosy-Finch,0.136209
