# Update species seen file

Creates an aggregate life list by tallying, for each species, how many of the supplied eBird life lists include it.

Before running this file:
* Use `initialize_list.ipynb` to generate a base list of all scientific and English (United States) common names
* Ensure your eBird "Species name display" preference is set to one of the following:
    * "Both"
    * "Scientific name"
    * "Common name" with the English (United States) translation option selected
* Obtain your eBird life list from [this link](https://ebird.org/MyEBird?cmd=list&rtype=custom&r=world&time=life&fmt=csv) while logged into your eBird account
* Place all `.csv` files to be processed in the `csv_dir_new` directory. Each file that is processed successfully is moved to the `csv_dir_processed` directory; files that fail are left in place

In [1]:
import os
import pandas as pd

In [2]:
# Master list: the csv holding the running tally, and the name of
# the column in it that gets incremented
current_list = "seen_list.csv"
name_of_seen_column = "number of people seeing species"

# Directories: life lists waiting to be processed, and where they
# are moved once processed
csv_dir_new = "lists_new/"
csv_dir_processed = "lists_processed/"

To run the demonstration, leave the cell below as it is. To process your own lists, comment out the lines in the cell below.

In [3]:
# Demonstration only: read the test lists from, and "move" them to,
# the same directory
csv_dir_new = csv_dir_processed = "lists_test/"

Load current list of species and get total number of species possible

In [4]:
# Index the master list by scientific name so rows can be looked up
# directly by that name later on
all_species = pd.read_csv(current_list, index_col='scientific name')

# Row count before any processing; compared against at the end
total_spp_possible = len(all_species)

In [5]:
# Every English name in the master list (values of its 'English name' column)
all_species_common = all_species['English name'].values
# Every scientific name in the master list (its index)
all_species_scientific = all_species.index

def check_spp(sub_list, list_type):
    '''
    Check that every name in sub_list is a known species name.

    Inputs:
        sub_list: iterable of species names to validate
        list_type (string): 'scientific' to validate against the global
            all_species_scientific, 'common' to validate against the
            global all_species_common

    Returns:
        True if every element of sub_list is among the known names
        False if not, or if list_type is neither 'scientific' nor 'common'
    '''
    # Pick the collection of known names that matches the list type
    if list_type == 'scientific':
        known_names = all_species_scientific
    elif list_type == 'common':
        known_names = all_species_common
    else:
        return False

    return set(sub_list) <= set(known_names)

def add_spp(sub_list, list_type):
    '''
    Increment the seen count of every species in sub_list, modifying
    the global all_species DataFrame in place.

    Inputs:
        sub_list: iterable of species names to increment; a name that
            appears more than once is incremented once per occurrence
        list_type (string): either 'scientific' (names are looked up in
            the index of all_species) or 'common' (names are matched
            against its 'English name' column)

        Also assumes the existence of global variables:
            all_species (DataFrame)
            name_of_seen_column (column to increment)

    Returns:
        0 on success, 1 if list_type is not recognized (in which case
        all_species is left untouched)
    '''
    if list_type == 'scientific':
        for scientific_name in sub_list:
            all_species.loc[scientific_name, name_of_seen_column] += 1

    elif list_type == 'common':
        for common_name in sub_list:
            all_species.loc[all_species['English name'] == common_name, name_of_seen_column] += 1

    else:
        # An unrecognized list_type used to fall through to the 'common'
        # branch; report it through the error flag instead
        return 1

    return 0

In [6]:
csvs_to_process = [file for file in os.listdir(csv_dir_new) if file.endswith('.csv')]


# For all new csvs: work out which name format the file uses, validate
# every species against the master list, increment the seen counts and,
# on success, move the file to the 'processed' folder
for csv_name in csvs_to_process:

    csv_path = os.path.join(csv_dir_new, csv_name)
    life_list = pd.read_csv(csv_path)

    # Reset per-file state so nothing carries over from the previous csv
    errors = False
    species_seen_by_birder = None
    list_type = None

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    # The format of the whole file is inferred from its first entry.
    if 'Species' in life_list.columns and len(life_list) > 0:
        species = life_list['Species']
        # Positional lookup: works whatever the row labels are
        first_species = species.iloc[0]
    else:
        first_species = None

    # No 'Species' column, no rows, or a blank (non-string) first entry
    if not isinstance(first_species, str):
        errors = True
        print('Unknown species type')

    # If 'Species' column of .csv is in "English name - scientific name" format:
    elif ' - ' in first_species:
        print(f'{csv_name}: splitting species column & using scientific name')

        # Keep only the scientific name. str() turns a blank (NaN) row
        # into 'nan', which then fails validation instead of raising here.
        species_seen_by_birder = [str(entry).split(' - ')[-1] for entry in species]
        list_type = 'scientific'

    # If 'Species' column of .csv is in English name only format
    elif first_species in all_species_common:
        print(f'{csv_name}: using English name')

        species_seen_by_birder = life_list['Species'].values
        list_type = 'common'

    # If 'Species' column of .csv is in scientific name only format
    elif first_species in all_species_scientific:
        print(f'{csv_name}: using scientific name')

        species_seen_by_birder = life_list['Species'].values
        list_type = 'scientific'

    else:
        errors = True
        print('Unknown species type')

    # Only validate and count when the format was recognized; testing
    # `errors` first keeps check_spp from being called without a species list
    if not errors and check_spp(species_seen_by_birder, list_type):

        # add_spp returns a falsy value on success
        errors = add_spp(species_seen_by_birder, list_type)

    # Unknown format, or not all species were valid
    else:
        errors = True

    if not errors:

        # Move file to 'processed' folder
        os.rename(csv_path, os.path.join(csv_dir_processed, csv_name))

    else:
        print(f'Failure for {csv_name}')
  

test_ebird_world_life_list_scinames.csv: using scientific name
test_ebird_world_life_list_commonnames.csv: using English name
test_ebird_world_life_list_common-and-scinames.csv: splitting species column & using scientific name


Ensure we haven't added any extra species!

In [7]:
# The master list must have exactly as many rows as when it was loaded
assert len(all_species) == total_spp_possible

Save to `.csv`

In [8]:
# Write the updated master list back to the file it was loaded from
all_species.to_csv(path_or_buf=current_list)