In [1]:
import os
import pandas as pd

In [2]:
# Directory that holds the per-county result CSVs.
results_dir = "county_results"

# Take the first 20 CSV files. os.listdir returns entries in arbitrary order,
# so the names are sorted first to make this selection (and counties[0],
# used further down) reproducible from run to run. The filter requires the
# full ".csv" extension because the county name is later derived by
# stripping exactly those four characters.
counties = sorted(
    file for file in os.listdir(results_dir) if file.endswith('.csv')
)[:20]

# Analyses on one county

### Remove unwanted weeks

In [3]:
# Week labels have the form "<Month>-<n>"; they are matched against the
# 'monthQt' column when the county data is filtered.
november_desired = [f'November-{week}' for week in range(1, 5)]
december_desired = [f'December-{week}' for week in range(1, 5)]
january_desired = [f'January-{week}' for week in range(1, 3)]

# Single flat list: November labels, then December, then January.
desired_weeks = [*november_desired, *december_desired, *january_desired]

In [4]:
# Analyse the first county file in the list.
county_csv = counties[0]
county_name = county_csv[:-4]  # drop the trailing 4 characters (".csv")

# Load every week for this county; the CSV's 'Unnamed: 0' column is the index.
county_path = os.path.join(results_dir, county_csv)
all_weeks_data = pd.read_csv(county_path, index_col='Unnamed: 0')

# Keep only the rows whose 'monthQt' label is one of the desired weeks.
in_desired_weeks = all_weeks_data['monthQt'].isin(desired_weeks)
desired_weeks_data = all_weeks_data[in_desired_weeks]

### Remove non-species designations from data

In [5]:
# Drop rows whose common name is not a plain species designation.
# All three markers are literal substrings, so they are matched with
# regex=False; this avoids the invalid '\.' escape sequence that a
# non-raw regex string would need for the dot.
com_names = desired_weeks_data['comName']
clean_data = desired_weeks_data.loc[
    # Remove anything that contains a dot (i.e. a "spuh")
    ~com_names.str.contains('.', regex=False)
    # Remove anything that contains a forward slash (i.e. an uncertain ID)
    & ~com_names.str.contains('/', regex=False)
    # Remove anything that is a hybrid
    & ~com_names.str.contains('hybrid', regex=False)
]

### Remove species already on one's own life list

In [6]:
life_data = pd.read_csv('csvs/tessa-lifelist.csv')

# Break the "Species" column into two columns, splitting on the first " - "
# (a hyphen with one space on each side). n=1 caps the result at two
# columns, so a second " - " later in a value cannot break the two-column
# assignment below.
life_data[['Common Name', 'Scientific Name']] = life_data['Species'].str.split(
    ' - ', n=1, expand=True
)


In [7]:
# Common names that are already on the life list.
my_lifelist = life_data['Common Name'].tolist()

# Keep only the rows whose species is not on the life list.
already_on_list = clean_data['comName'].isin(my_lifelist)
species_removed_data = clean_data.loc[~already_on_list]

### Average frequencies over multiple weeks

In [8]:
def show_unique(series):
    '''
    Return a list of the distinct elements of a series,
    in order of first appearance.
    '''

    # dict keys are unique and keep insertion order, so building a dict
    # from the elements drops later duplicates while preserving order.
    first_seen = dict.fromkeys(list(series))

    return list(first_seen)

In [9]:
# Species in order of first appearance (kept as a notebook-level name).
species = show_unique(species_removed_data['comName'])

# Mean frequency per species over the remaining weeks, computed in one
# grouped pass instead of filtering the frame once per species and
# appending rows one at a time.
# sort=False keeps species in order of first appearance, and reset_index()
# yields a 0..n-1 index with 'comName' and 'frequency' columns.
averaged_freqs = (
    species_removed_data
    .groupby('comName', sort=False)['frequency']
    .mean()
    .reset_index()
)

### Eliminate species whose average frequency is at or below a threshold

In [10]:
# Species are shown only if their averaged frequency is strictly greater
# than this cutoff.
threshold = 0.1
above_threshold = averaged_freqs['frequency'] > threshold
averaged_freqs.loc[above_threshold]

Unnamed: 0,comName,frequency
0,Emperor Goose,0.222419
3,Steller's Eider,0.141393
15,Rock Sandpiper,0.132314
50,Northern Fulmar,0.101936
58,Black-billed Magpie,0.194961
64,Gray-crowned Rosy-Finch,0.136209
