# Update species seen file

Creates an aggregate "species seen" list by tallying, for each species, how many individual eBird life lists include it.

Before running this file:
* Use `initialize_list.ipynb` to generate a base list of all scientific and English (United States) common names
* Ensure your eBird "Species name display" preference is set to one of the following:
    * "Both"
    * "Scientific name"
    * "Common name" with the English (United States) translation option selected
* Obtain your eBird life list from [this link](https://ebird.org/MyEBird?cmd=list&rtype=custom&r=world&time=life&fmt=csv) while logged into your eBird account
* Place all `.csv`s to be processed in the `csv_dir_new` directory
* Once each has been processed, it will be moved to the `csv_dir_processed` directory

In [1]:
import os
import pandas as pd

A list is initialized from scratch using the following function.
It requires that the eBird Clements Checklist `.csv` (`Clements-Checklist-v2018-August-2018.csv`) be downloaded from [here](http://www.birds.cornell.edu/clementschecklist/download/) and placed in the working directory.


In [2]:
def initialize_list(seen_list_name, seen_column_name,
                    checklist_path="Clements-Checklist-v2018-August-2018.csv"):
    '''
    Create a fresh "species seen" .csv from the eBird Clements Checklist.

    Reads the checklist, keeps only the rows whose `category` is
    'species', and writes them out indexed by scientific name with
    their English name and a counter column set to 0.

    Inputs:
        seen_list_name: path of the .csv file to write
        seen_column_name: name of the counter column to add
        checklist_path: path of the Clements Checklist .csv to read.
            Defaults to the August 2018 checklist in the
            current working directory.
    '''
    # Read in eBird Clements Checklist .csv; only these columns are needed
    fields = ['category', 'scientific name', 'English name']
    all_taxa = pd.read_csv(
        checklist_path,
        usecols=fields,
        index_col='scientific name',
        encoding='latin1')

    # Select species only; the category column is not needed after filtering
    is_species = all_taxa['category'] == 'species'
    all_species = all_taxa[is_species].drop('category', axis=1)

    # Add a column for number of people who have seen the species
    all_species.loc[:, seen_column_name] = 0

    # Save the file
    all_species.to_csv(seen_list_name)

Decide which mode to use. This variable changes the directory that life lists are obtained from.

* `new` mode: create a new "species seen" file and populate it with new life lists (`lists_new/`)
* `add` mode: use a pre-existing "species seen" file and add new life lists to it (`lists_new/`)
* `redo` mode: create a new "species seen" file and populate it with all previously processed life lists (`lists_processed/`)
* `test` mode: create a demonstration `.csv` using fake life lists (`lists_test/`)

In [3]:
# Select the run mode by leaving exactly one of these assignments uncommented
#mode = 'new' 
#mode = 'add'
#mode = 'redo'
mode = 'test'

Where life lists are stored:
* `csv_dir_new`: the directory containing the life lists to process. Can be one of three values:
    * `'lists_test/'`: test lists 
    * `'lists_new/'`: update the current `.csv` with individual lists in the `'lists_new/'` dir. 
    * `'lists_processed/'`: use after running `initialize_list()` to re-process all processed files.
* `csv_dir_processed`: this is the directory to which lists will be moved after processing

In [4]:
seen_list_name = 'seen_list.csv'
seen_column_name = 'number of people seeing species'

# Settings for each supported mode, as a tuple of:
#   (seen-list file, reinitialize the seen list?, source dir, destination dir)
mode_settings = {
    # Initialize a new "seen list" and fill it from the new life lists
    'new': (seen_list_name, True, 'lists_new/', 'lists_processed/'),
    # Do not reinitialize the list; add the new life lists to it
    'add': (seen_list_name, False, 'lists_new/', 'lists_processed/'),
    # Reinitialize the "seen list" and re-read the processed life lists
    'redo': (seen_list_name, True, 'lists_processed/', 'lists_processed/'),
    # Initialize a separate test "seen list" from the test life lists
    'test': ('test_list.csv', True, 'lists_test/', 'lists_test/'),
}

# Fail fast on an unknown mode: leaving the file name and directories
# as None would only postpone the failure to the code that reads them.
if mode not in mode_settings:
    raise ValueError(
        f'not a valid mode: {mode!r} (expected one of {sorted(mode_settings)})')

seen_list_name, reinitialize, csv_dir_new, csv_dir_processed = mode_settings[mode]

if reinitialize:
    initialize_list(seen_list_name=seen_list_name, seen_column_name=seen_column_name)

Load current list of species and get total number of species possible

In [5]:
# Load the "species seen" table, keyed by scientific name so that
# rows can later be looked up directly by scientific name
all_species = pd.read_csv(seen_list_name, index_col='scientific name')

# Every species name in the table, in common (English) and scientific form
all_species_common = all_species['English name'].values
all_species_scientific = all_species.index

# Number of species (rows) in the table as loaded
total_spp_possible = len(all_species.index)

In [6]:
def check_spp(sub_list, list_type):
    '''
    Make sure sub_list contains only species
    that are in the master dataframe, all_species.
    Checks against a different list of valid names
    depending on the value of `list_type`.
    Prints any invalid species found.

    Inputs:
        sub_list: list to check
        list_type (str): 'scientific' or 'common'.
            Determines whether to check sub_list against
            the scientific or common name lists (described
            in "Globals" below)

    Globals:
        all_species_scientific: list of valid scientific names
        all_species_common: list of valid common names

    Returns:
        A list of all valid species on `sub_list`, each listed
        once, in the order they first appear on `sub_list`.
        An empty list if `list_type` is not recognized.
    '''

    # Pick the set of valid names to check against
    if list_type == 'scientific':
        valid_names = set(all_species_scientific)
    elif list_type == 'common':
        valid_names = set(all_species_common)
    else:
        print(f"Received list_type = {list_type}; valid types are 'scientific' and 'common'")
        return []

    # De-duplicate while keeping first-seen order, so that both the
    # printed report and the returned list are the same on every run
    unique_names = list(dict.fromkeys(sub_list))

    # Print any invalid items
    non_subset_items = [name for name in unique_names if name not in valid_names]
    if non_subset_items:
        print('Invalid species found:')
        for item in non_subset_items:
            print(item)

    # Return all valid species
    return [name for name in unique_names if name in valid_names]


def add_spp(sub_list, list_type):
    '''
    Add one to the seen column of the master dataframe,
    inplace, for every species named in sub_list.

    Inputs:
        sub_list: list of species to increment
        list_type (string): 'scientific' looks each name up in the
            dataframe index; any other value matches each name
            against the 'English name' column.

    Globals:
        all_species (DataFrame): the dataframe to update
        seen_column_name: the column to increment in
            the all_species dataframe

    Returns:
        0, always.
    '''
    lookup_by_index = list_type == 'scientific'

    for name in sub_list:
        if lookup_by_index:
            # Scientific names are the index labels
            rows = name
        else:
            # Common names: boolean mask over the 'English name' column
            rows = all_species['English name'] == name
        all_species.loc[rows, seen_column_name] += 1

    print(f'Adding {len(sub_list)} species')

    return 0

If `mode == 'liberal'` and a life list contains invalid species, the script below will still add the valid species
to the master dataframe and will move the problematic life list to the "processed" folder. If `mode != 'liberal'`, such a life list will not be processed or moved.

In [7]:
# Controls how life lists containing invalid species are handled by the
# processing loop below ('liberal' = still add the valid species).
# NOTE(review): this rebinds the same `mode` variable that selected the
# run mode ('new'/'add'/'redo'/'test') earlier, so re-running the setup
# cell after this one would see 'liberal' there - consider a separate name.
mode = 'liberal'

The code below does the following for each `.csv`:
* Gets a raw species list from the `Species` column of the `.csv`
    * If there is no Species column or the list is empty, an error will be thrown
* Figures out the format of the raw species list:
    * Scientific names
    * Common names, assumed to be English (United States)
    * "Common name - scientific name" (scientific name will be used)
* Extracts either common or scientific names from the raw species list
* Uses `check_spp()` to make sure that the extracted list contains only names that are in the dataframe of all sightings (`all_species`)
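* Uses `add_spp()` to increment the count of every valid species, then moves the `.csv` to the `csv_dir_processed` directory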

In [8]:
# Life lists to process
csvs_to_process = [file for file in os.listdir(csv_dir_new) if file.endswith('.csv')]

for csv_name in csvs_to_process:

    csv_path = os.path.join(csv_dir_new, csv_name)
    life_list = pd.read_csv(csv_path)
    errors = False

    # Get raw species listing from .csv. Blank entries are dropped and
    # surrounding whitespace is stripped so that stray spaces do not
    # make an otherwise valid name look invalid.
    # `first_species` stays None if there is no Species column or the
    # column holds no names.
    first_species = None
    if 'Species' in life_list.columns:
        species = life_list['Species'].dropna().astype(str).str.strip()
        if not species.empty:
            first_species = species.iloc[0]

    # Determine the format of the species list in the .csv (i.e.,
    # common names, scientific names, or "common name - scientific name")
    # and extract a species list.
    if first_species is None:
        # Checked first: the string tests below cannot be applied to None
        print('Error in species column')
        errors = True

    elif ' - ' in first_species:
        # 'Species' column of .csv is in "English name - scientific name" format
        print(f'{csv_name}: splitting species column & using scientific name')

        # Get species on life list
        species_seen_by_birder = [entry.split(' - ')[-1] for entry in species]
        list_type = 'scientific'

    elif first_species in all_species_common:
        # 'Species' column of .csv is in English name only format
        print(f'{csv_name}: using English name')

        species_seen_by_birder = species.tolist()
        list_type = 'common'

    elif first_species in all_species_scientific:
        # 'Species' column of .csv is in scientific name only format
        print(f'{csv_name}: using scientific name')

        species_seen_by_birder = species.tolist()
        list_type = 'scientific'

    else:
        # Format not recognized
        print('Error in species column')
        errors = True

    # Check that all species are valid
    if not errors:
        checked_list = check_spp(species_seen_by_birder, list_type)

        # If some species were removed during checking.
        # `check_spp` returns each valid name only once, so compare
        # against the number of distinct names; otherwise a repeated
        # (but valid) entry would be reported as an invalid species.
        if len(checked_list) != len(set(species_seen_by_birder)):
            print('Invalid species detected')
            if mode != 'liberal':
                errors = True
            else:
                print('Adding valid species only')

    # If all species were valid, or if there were some invalid
    # species removed from `checked_list` but `mode == 'liberal'`,
    # increment all species on `checked_list` and move
    # the .csv to the "processed" folder, `csv_dir_processed`
    if not errors:
        # Add species to dataframe
        add_spp(checked_list, list_type)

        # Move file to 'processed' folder
        os.rename(csv_path, os.path.join(csv_dir_processed, csv_name))

    # If there were unacceptable errors, do not increment
    # the species in the life list, and do not move the life list
    # to the "processed" folder
    else:
        print(f'Failure for {csv_name}')

    print()

test_ebird_world_life_list_scinames.csv: using scientific name
Adding 398 species

test_ebird_world_life_list_commonnames.csv: using English name
Adding 398 species

test_ebird_world_life_list_common-and-scinames.csv: splitting species column & using scientific name
Adding 398 species



Ensure we haven't added any extra species!

In [9]:
# The table must still have exactly as many rows as when it was loaded
assert len(all_species.index) == total_spp_possible

Save to `.csv`

In [10]:
# Write the updated counts back to the same file they were loaded from
all_species.to_csv(seen_list_name)