In [521]:
import pandas as pd
from pathlib import Path
import h3
import folium
from geojson import Feature, Point, FeatureCollection
import json
import matplotlib
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import ast
pd.options.mode.copy_on_write = True 
from functools import reduce
from folium import GeoJsonTooltip

In [569]:
import dask as dd

## Create JSON of subspecies

In [522]:
# Load eBird taxonomy
taxonomy = pd.read_csv("/Users/tessa/Code/scratchpad/subspecies_plotter/eBird_taxonomy_v2024.csv")
taxonomy.head()

Unnamed: 0,TAXON_ORDER,CATEGORY,SPECIES_CODE,TAXON_CONCEPT_ID,PRIMARY_COM_NAME,SCI_NAME,ORDER,FAMILY,SPECIES_GROUP,REPORT_AS
0,2,species,ostric2,,Common Ostrich,Struthio camelus,Struthioniformes,Struthionidae (Ostriches),Ostriches,
1,7,species,ostric3,,Somali Ostrich,Struthio molybdophanes,Struthioniformes,Struthionidae (Ostriches),Ostriches,
2,8,slash,y00934,,Common/Somali Ostrich,Struthio camelus/molybdophanes,Struthioniformes,Struthionidae (Ostriches),Ostriches,
3,10,species,soucas1,,Southern Cassowary,Casuarius casuarius,Casuariiformes,Casuariidae (Cassowaries and Emu),Cassowaries and Emu,
4,11,species,dwacas1,,Dwarf Cassowary,Casuarius bennetti,Casuariiformes,Casuariidae (Cassowaries and Emu),Cassowaries and Emu,


In [523]:
# What categories are there in the taxonomy?
taxonomy.CATEGORY.unique()


array(['species', 'slash', 'issf', 'hybrid', 'spuh', 'domestic', 'form',
       'intergrade'], dtype=object)

In [524]:
# How many species are there?
species = taxonomy[taxonomy.CATEGORY == 'species']
len(species)


11145

In [525]:
# How many infraspecific entries are there?
infrasp_categories = ['issf', 'form', 'intergrade']
infraspp = taxonomy[taxonomy.CATEGORY.isin(infrasp_categories)]
len(infraspp)

3843

Outdated: Clements taxonomy

In [526]:
# Load Clements taxonomy
# taxonomy = pd.read_csv("/Users/tessa/Code/scratchpad/subspecies_plotter/Clements-v2024-October-2024-rev.csv")
# taxonomy.head()

# What categories are there in the taxonomy?
# taxonomy.category.unique()

# How many species are there?
# species = taxonomy[taxonomy.category == 'species']
# len(species)

# How many infraspecific entries are there?
#infrasp_categories = ['subspecies', 'group (monotypic)', 'group (polytypic)', 'form']
#infraspp = taxonomy[taxonomy.category.isin(infrasp_categories)]
#len(infraspp)

In [527]:
species

Unnamed: 0,TAXON_ORDER,CATEGORY,SPECIES_CODE,TAXON_CONCEPT_ID,PRIMARY_COM_NAME,SCI_NAME,ORDER,FAMILY,SPECIES_GROUP,REPORT_AS
0,2,species,ostric2,,Common Ostrich,Struthio camelus,Struthioniformes,Struthionidae (Ostriches),Ostriches,
1,7,species,ostric3,,Somali Ostrich,Struthio molybdophanes,Struthioniformes,Struthionidae (Ostriches),Ostriches,
3,10,species,soucas1,,Southern Cassowary,Casuarius casuarius,Casuariiformes,Casuariidae (Cassowaries and Emu),Cassowaries and Emu,
4,11,species,dwacas1,,Dwarf Cassowary,Casuarius bennetti,Casuariiformes,Casuariidae (Cassowaries and Emu),Cassowaries and Emu,
5,12,species,norcas1,,Northern Cassowary,Casuarius unappendiculatus,Casuariiformes,Casuariidae (Cassowaries and Emu),Cassowaries and Emu,
...,...,...,...,...,...,...,...,...,...,...
17406,35548,species,thbsal1,,Thick-billed Saltator,Saltator maxillosus,Passeriformes,Thraupidae (Tanagers and Allies),Tanagers and Allies,
17407,35549,species,gobsal1,,Golden-billed Saltator,Saltator aurantiirostris,Passeriformes,Thraupidae (Tanagers and Allies),Tanagers and Allies,
17408,35556,species,massal1,,Masked Saltator,Saltator cinctus,Passeriformes,Thraupidae (Tanagers and Allies),Tanagers and Allies,
17409,35557,species,slcgro1,,Slate-colored Grosbeak,Saltator grossus,Passeriformes,Thraupidae (Tanagers and Allies),Tanagers and Allies,


In [528]:
# Create a dictionary mapping species to their infraspecies, by category
# Most species have a single category of infraspecies (e.g. either form or subspecies, not both)
# However some species have infraspecies in multiple categories, e.g. Brant (Branta bernicla) has both subspecies and forms
spp_dict = species[['SPECIES_CODE', 'PRIMARY_COM_NAME', 'SCI_NAME']].set_index("SCI_NAME").T.to_dict()
# Add infraspecies to spp_json
for sp in tqdm(spp_dict.keys()):
    # Get infraspecies for this species
    infraspp_for_sp = infraspp[infraspp['SCI_NAME'].apply(lambda x: x[:len(sp)] == sp)]
    infraspp_dict = dict()

    # Add infraspecies to spp_json by category
    for cat in infrasp_categories:
        infrasp_in_category = infraspp_for_sp[infraspp_for_sp.CATEGORY == cat]
        infrasp_cat_dict = infrasp_in_category[
            ['SPECIES_CODE', 'PRIMARY_COM_NAME', 'SCI_NAME']].set_index("SCI_NAME").T.to_dict()
        if len(infrasp_cat_dict.keys()) > 0:
            infraspp_dict[cat] = infrasp_cat_dict
    spp_dict[sp]['infraspecies'] = infraspp_dict

  0%|          | 0/11145 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [278]:
with open("/Users/tessa/Code/scratchpad/subspecies_plotter/infraspecies_ebird.json", 'w') as f:
    f.write(json.dumps(spp_dict, indent=4))

## Prep eBird data

H3 resolutions
```
Res	Average Hexagon Area (km2)	Pentagon Area* (km2)	Ratio (P/H)
0	4,357,449.416078381	2,562,182.162955496	0.5880
1	609,788.441794133	328,434.586246469	0.5386
2	86,801.780398997	44,930.898497879	0.5176
3	12,393.434655088	6,315.472267516	0.5096
4	1,770.347654491	896.582383141	0.5064
5	252.903858182	127.785583023	0.5053
6	36.129062164	18.238749548	0.5048
7	5.161293360	2.604669397	0.5047
```

H3 number of cells
```
Res	Total number of cells	Number of hexagons	Number of pentagons
0	122	110	12
1	842	830	12
2	5,882	5,870	12
3	41,162	41,150	12
4	288,122	288,110	12
5	2,016,842	2,016,830	12
6	14,117,882	14,117,870	12
```

## Determine number of sightings of each subspecies per grid cell

### Calculate in batches

In [566]:

resolutions = [2,3,4,5]
def clean_ebd(
    full_df,
    remove_unconfirmed=True, 
    remove_reviewed=False,
    resolutions = resolutions,
):

    # Remove duplicate checklists
    full_df = full_df[full_df['GROUP IDENTIFIER'].isnull() | ~full_df[full_df['GROUP IDENTIFIER'].notnull()].duplicated(subset=["GROUP IDENTIFIER", "SCIENTIFIC NAME"],keep='first')]

    # Removed unconfirmed observations or reviewed observations, if desired
    if remove_unconfirmed:
        full_df = full_df[full_df["APPROVED"] == 1]
    if remove_reviewed:
        full_df = full_df[full_df["REVIEWED"] == 0]

    # Just subset to the needed columns
    needed_columns = [
        'TAXONOMIC ORDER','CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 
        'SCIENTIFIC NAME','SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME',
        'SAMPLING EVENT IDENTIFIER',
        'LATITUDE', 'LONGITUDE', 'REVIEWED', 'OBSERVATION DATE']
    full_df = full_df[needed_columns]
    full_df.head()

    # Convert latitude and longitude to an H3 hexagon ID
    for resolution in resolutions:
        full_df[f'hex_id_{resolution}'] = full_df.apply(lambda row:  h3.latlng_to_cell(row.LATITUDE, row.LONGITUDE, resolution), axis=1)
    
    return full_df

def get_grid_cell_species_data(cell_df, sp, subspp, resolution):
    """Get # of checklists containing a species and each subspecies

    Args:
    - cell_df: pd.DataFrame, dataframe of data for a single grid cell (1 row per observation)
    - sp: str, scientific name of species
    - subspp: list of str, scientific names of subspecies for this species

    Returns:
    - cell_data: dict, with keys 'cell_id', species name, and subspecies names
    """
    # Total number of checklists containing the species
    num_checklists = cell_df["SAMPLING EVENT IDENTIFIER"].nunique()

    # Create a dict of # checklists containing sp for all cells
    cell_data = {'cell_id': cell_df[f"hex_id_{resolution}"].iloc[0]}
    cell_data[sp] = num_checklists

    # Add number of checklists containing each subspecies
    for subsp in subspp:
        num_subsp = cell_df[cell_df["SUBSPECIES SCIENTIFIC NAME"] == subsp].shape[0]
        cell_data[subsp] = num_subsp

    return cell_data


def get_species_df(sp, sp_df, subspp, resolution):
    """Make dataframe of species & subspecies data for every cell for a given species

    Args:
    - sp: str, scientific name of species
    - df: pd.DataFrame, dataframe of data for this species
    - subspp: list of str, scientific names of subspecies for this species
    - resolution: int, H3 resolution level
    """

    # Create a dict of # checklists containing sp for all cells
    cell_dicts = []
    for cell in sp_df[f"hex_id_{resolution}"].unique():
        cell_df = sp_df[sp_df[f"hex_id_{resolution}"] == cell]
        cell_data = get_grid_cell_species_data(cell_df, sp, subspp, resolution)
        cell_dicts.append(cell_data)

    sp_cell_df = pd.DataFrame(cell_dicts, index=range(len(cell_dicts)))
    sp_cell_df.set_index("cell_id", inplace=True)

    return sp_cell_df



spp_dict = json.load(open("/Users/tessa/Code/scratchpad/subspecies_plotter/infraspecies_ebird.json"))

# caja_df = get_species_df(sp, full_df, subspp_dict)
# filepath = Path('/Users/tessa/Code/scratchpad/subspecies_plotter/batches').joinpath(filename)
# caja_df.to_csv(filepath)

#dataset_filepath = "/Users/tessa/Code/scratchpad/subspecies_plotter/ebd-sample.txt"
#sp_code = 'rethaw'
use_cols = [
        'TAXONOMIC ORDER','CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 
        'SCIENTIFIC NAME','SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME',
        'SAMPLING EVENT IDENTIFIER',
        'LATITUDE', 'LONGITUDE', 'REVIEWED', 'APPROVED', 'GROUP IDENTIFIER', 'OBSERVATION DATE']

sp_codes = [x.name.split('_')[1] for x in list(Path("/Users/tessa/Code/scratchpad/subspecies_plotter/data/").glob("*.zip"))]
for sp_code in sp_codes:
    print("\n\n\nProcessing", sp_code)
    dataset_filepath = f"/Users/tessa/Code/scratchpad/subspecies_plotter/data/ebd_{sp_code}_relOct-2024/ebd_{sp_code}_relOct-2024.txt"

    resolution = resolutions[0]

    ssp_batch_directory = Path('/Users/tessa/Code/scratchpad/subspecies_plotter/batches/')
    ssp_batch_directory.mkdir(exist_ok=True)

    # Read in CSV in batches
    chunk_rows = 100000
    tracker_filepath = f"/Users/tessa/Code/scratchpad/subspecies_plotter/{sp_code}_tracker_rowsperchunk-{chunk_rows}.csv"

    if Path(tracker_filepath).exists():
        tracker = pd.read_csv(tracker_filepath)
        tracker["spp_to_do"] = tracker["spp_to_do"].apply(ast.literal_eval) 
        tracker["spp_done"] = tracker["spp_done"].apply(ast.literal_eval) 
        start_idx = tracker.index[-1]
        spp_to_do = set(tracker.loc[start_idx].spp_to_do) - set(tracker.loc[start_idx].spp_done)
        if spp_to_do == set():
            skiprows = tracker.loc[start_idx].end_row
            start_idx = start_idx + 1
            spp_to_do = None
        else:
            skiprows = tracker.loc[start_idx].start_row

    else:
        tracker = pd.DataFrame(columns=["start_row", "end_row", "spp_to_do", "spp_done"])
        start_idx = 0
        spp_to_do = None
        skiprows=0

    # TODO: DEAL WITH BUG (BELOW)
    # SWITCH TO DASK TO PARALLELIZE
    for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):
        if chunk.shape[0] == 0:
            print("No more data to process, total rows in dataset: ", (start_idx + idx)*chunk_rows)
            break
        if chunk.shape[0] < chunk_rows:
            # Some kind of weird bug/issue with the last chunk 
            # which finds a single row left to process claiming to be in the next 100,000 rows after the last one
            # This only happens after the first time I rerun this cell
            end_row = (start_idx+idx)*chunk_rows + chunk.shape[0]
            print(f"Last chunk, total rows in dataset:", end_row)
        else:
            end_row = (start_idx + idx)*chunk_rows+chunk_rows
        cleaned = clean_ebd(chunk)
        if spp_to_do == None: # Add new row
            spp_to_do = list(set(cleaned["SCIENTIFIC NAME"].unique()))
            tracker.loc[start_idx+idx] = [(start_idx + idx)*chunk_rows, end_row, spp_to_do, []]

        
        for sp in spp_to_do:
            cleaned_sp = cleaned[cleaned["SCIENTIFIC NAME"] == sp]
            if cleaned_sp.shape[0] == 0:
                #print(f"No data for {sp}")
                continue

            # Get list of subspecies
            subspp = []
            for k, val in spp_dict[sp]['infraspecies'].items():
                subspp.extend(val.keys())
            
            # Get data on presence of each subspp for each resolution
            for resolution in resolutions:
                species_df = get_species_df(sp, cleaned_sp, subspp, resolution)
                filename = ssp_batch_directory.joinpath(f'{sp}_row{(start_idx+idx)*chunk_rows}-{end_row}_resolution{resolution}.csv')
                species_df.to_csv(filename)

            tracker.loc[start_idx+idx].spp_done += [sp]
            tracker.to_csv(tracker_filepath, index=False)

        spp_to_do = None
    




Processing strher
No more data to process



Processing easmea
No more data to process



Processing yerwar
No more data to process



Processing eurjay1
No more data to process



Processing brant
No more data to process



Processing whcspa
No more data to process



Processing cacgoo1
No more data to process



Processing horlar
No more data to process



Processing coatit2
No more data to process



Processing foxspa
No more data to process



Processing daejun
Last chunk, total rows in dataset: 14296832



Processing orcwar
Last chunk, total rows in dataset: 2834032



Processing yebcha


  for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):


Last chunk, total rows in dataset: 944389



Processing cangoo


  for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):


Last chunk, total rows in dataset: 20222796



Processing whiwag


  for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):


Last chunk, total rows in dataset: 2793921



Processing norfli


  for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):


Last chunk, total rows in dataset: 14604156



Processing rethaw


  for idx, chunk in enumerate(pd.read_csv(dataset_filepath, chunksize=chunk_rows, skiprows=range(1,skiprows), usecols=use_cols, sep="\t")):


KeyboardInterrupt: 

## Sum up the batches

In [568]:
sp_cell_df_directory = Path('/Users/tessa/Code/scratchpad/subspecies_plotter/sp_cell_dfs/')
sp_cell_df_directory.mkdir(exist_ok=True)

def parse_batch_files(ssp_batch_directory):
    batch_files = list(ssp_batch_directory.glob("*.csv"))
    file_info = [n.name.split("_") for n in batch_files]
    files = pd.DataFrame(file_info, columns=['SCIENTIFIC NAME', 'ROW RANGE', 'RESOLUTION'])
    files['FILENAME'] = batch_files
    for (species, resolution), species_df in files.groupby(["SCIENTIFIC NAME", 'RESOLUTION']):
        species = species.replace(" ", "-")
        resolution = resolution[:-4]
        all_dataframes = [pd.read_csv(f, index_col=0) for f in species_df.FILENAME] 
        sp_cell_df = reduce(lambda a, b: a.add(b, fill_value=0), all_dataframes)
        filename = sp_cell_df_directory.joinpath(f'{species}_{resolution}.csv')
        sp_cell_df.to_csv(filename)



parse_batch_files(ssp_batch_directory)

# Create maps

In [564]:

import networkx as nx
import numpy as np
import colorsys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def name_to_base_hue(name):
    """Generate a base hue from a name."""
    base_hue = hash(name) % 360
    return base_hue

def average_hues(hues):
    """Average a list of hues on the circular scale."""
    x = np.mean([np.cos(np.radians(h)) for h in hues])
    y = np.mean([np.sin(np.radians(h)) for h in hues])
    avg_hue = np.degrees(np.arctan2(y, x)) % 360
    return avg_hue

# def assign_colors(subspecies, overlap_matrix):
#     """Assign colors to subspecies based on overlap relationships."""
#     # Step 1: Generate base hues
#     base_hues = {subsp: name_to_base_hue(subsp) for subsp in subspecies}
    
#     # Step 2: Adjust hues based on overlap using graph coloring
#     G = nx.Graph()
#     for i, sp1 in enumerate(subspecies):
#         for j, sp2 in enumerate(subspecies):
#             if overlap_matrix[i][j] > 0.1:  # Threshold for "overlap"
#                 G.add_edge(sp1, sp2)
    
#     coloring = nx.coloring.greedy_color(G, strategy="largest_first")
#     color_mapping = {}
    
#     for subsp, color_idx in coloring.items():
#         hue = (base_hues[subsp] + color_idx * 60) % 360  # Spread hues by 60° to maximize contrast
#         saturation, lightness = 0.8, 0.5  # Vivid, medium colors
#         r, g, b = colorsys.hls_to_rgb(hue / 360, lightness, saturation)
#         color_mapping[subsp] = rgb_to_hex((int(r * 255), int(g * 255), int(b * 255)))
    
#     return color_mapping
from colorsys import hsv_to_rgb
import numpy as np

def adjust_hue_similarity(colors, min_hue_diff=0.2):
    """
    Adjust colors to ensure hues are sufficiently different.
    
    Parameters:
    - colors: A list of (hue, saturation, value) tuples.
    - min_hue_diff: Minimum allowed difference between any two hues.
    
    Returns:
    - adjusted_colors: A list of adjusted (hue, saturation, value) tuples.
    """
    adjusted_colors = []
    for i, color in enumerate(colors):
        hue, sat, val = color
        too_close = any(abs(hue - other_hue) < min_hue_diff for other_hue, _, _ in adjusted_colors)
        
        if too_close:
            # Adjust saturation and brightness to differentiate
            sat = max(0.5, sat * 0.5)  # Slightly desaturate
            val = min(1.0, val * 1.2)  # Brighten slightly
        
        adjusted_colors.append((hue, sat, val))
    return adjusted_colors

def generate_distinct_colors_with_hue_adjustment(adjacency_matrix, subspecies_names, min_hue_diff=0.1):
    """
    Generate distinct colors for each subspecies with adjustments for hue similarity.
    
    Parameters:
    - adjacency_matrix: The adjacency matrix for subspecies.
    - subspecies_names: A list of subspecies names corresponding to the matrix indices.
    - min_hue_diff: Minimum hue difference between any two subspecies.
    
    Returns:
    - subspecies_colors: A dictionary mapping subspecies names to RGB colors.
    """
    n = len(adjacency_matrix)
    initial_hues = np.linspace(0, 1, n, endpoint=False)  # Evenly spaced hues
    
    # Sort subspecies by adjacency density to prioritize distinction
    adjacency_density = adjacency_matrix.sum(axis=1)
    sorted_indices = np.argsort(-adjacency_density)
    sorted_hues = initial_hues[sorted_indices]
    
    # Assign initial HSV values
    hsv_colors = [(hue, 0.7, 0.9) for hue in sorted_hues]  # Start with vivid colors
    
    # Adjust for hue similarity
    adjusted_hsv_colors = adjust_hue_similarity(hsv_colors, min_hue_diff=min_hue_diff)
    
    # Convert HSV to RGB and map to subspecies names
    rgb_colors = {
        subspecies_names[idx]: rgb_to_hex(tuple(int(c * 255) for c in hsv_to_rgb(*adjusted_hsv_colors[i])))
        for i, idx in enumerate(sorted_indices)
    }
    
    return rgb_colors




def create_distribution_adjacency_matrix(data, subspecies_cols, cell_col='cell_id'):
    """
    Create an adjacency matrix based on subspecies distribution similarities.

    Parameters:
    - data: DataFrame with cells as rows and subspecies counts as columns.
    - subspecies_cols: List of column names corresponding to subspecies counts.
    - cell_col: Column name for cell identifiers (optional, for reference).

    Returns:
    - adjacency_matrix: A NumPy array where element [i, j] is the similarity between subspecies distributions.
    - subspecies_list: The order of subspecies corresponding to matrix rows/columns.
    """
    # Subset the subspecies columns
    subspecies_data = data[subspecies_cols]

    # Normalize each cell's counts to proportions
    subspecies_distribution = subspecies_data.div(subspecies_data.sum(axis=1), axis=0).fillna(0)

    # Compute cosine similarity between each pair of subspecies
    adjacency_matrix = cosine_similarity(subspecies_distribution.T)

    # Return the matrix and list of subspecies
    return adjacency_matrix, subspecies_cols

def hex_to_rgb(hex_color):
    """Convert hex color (#RRGGBB) to an (R, G, B) tuple."""
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_hex(rgb):
    """Convert an (R, G, B) tuple to a hex color (#RRGGBB)."""
    return "#{:02x}{:02x}{:02x}".format(*rgb)

def combine_rgb_colors(rgb_colors, fracs):
    """Combine a list of RGB colors proportionally."""
    if sum(fracs) == 0:
        return "#999999"
    else:
        combined_rgb = tuple(
            int(sum(frac * color[channel] for color, frac in zip(rgb_colors, fracs)))
            for channel in range(3)
        )
    return rgb_to_hex(combined_rgb)

def style_function(feature, subspp_colors):
    """Style a cell based on the proportion of subspecies."""
    properties = feature['properties']
    subspecies_values = {subsp: properties.get(subsp, 0) for subsp in subspp_colors.keys()}
    
    # Normalize the values to sum up to 1 for proportional allocation
    total = sum(subspecies_values.values())
    if total > 0:
        fracs = [value / total for value in subspecies_values.values()]
    else:
        fracs = [0 for _ in subspecies_values]
    
    # Get RGB colors for each subspecies
    hex_colors = [subspp_colors[subsp] for subsp in subspecies_values]
    rgb_colors = [hex_to_rgb(color) for color in hex_colors]
    
    # Combine colors based on the proportional fractions
    cell_color = combine_rgb_colors(rgb_colors, fracs)
    
    return {
        'fillColor': cell_color,  # Cell color
        'color': cell_color,  # Border color
        'weight': 1,  # Border weight
        'fillOpacity': 0.6,  # Cell fill transparency
    }


def choropleth_map(sp_cell_df, common_name, adjacency_matrix):
    """Creates a choropleth map given species data."""
    
    f = folium.Figure()
    map = folium.Map(location=[47, -122], zoom_start=5, tiles="cartodbpositron")
    f.add_child(map)

    sp = sp_cell_df.columns[0]
    subspp = sp_cell_df.columns[1:]
    

    # adjacency_matrix, subspp = create_distribution_adjacency_matrix(sp_cell_df, subspp)



    subspp_colors = generate_distinct_colors_with_hue_adjustment(adjacency_matrix, subspp)

    # subspp_colors = assign_colors(subspp, adjacency_matrix)

    # # Generate random colors for each subspecies
    # subspp_colors = {subsp: generate_color_from_name(subsp) for subsp in subspp}
    
    list_features = []
    for _, row in sp_cell_df.iterrows():
        #percentages = (row[subspp] / row[sp]) # For the previous implementation that colored the map by % of total sightings instead of % of ssp sightings
        percentages = (row[subspp] / sum(row[subspp]))*100
        percentages_dict = percentages.to_dict()
        
        # Precompute tooltip text showing only non-zero percentages
        percentages_dict_ordered = pd.DataFrame(percentages_dict, index=['pct']).T.query('pct > 0')['pct'].sort_values(ascending=False).to_dict()

        tooltip_text = []
        for subsp, percent in percentages_dict_ordered.items():
            tooltip_text.append(f"{subsp}: {percent:.0f}%")
        
        # Add tooltip as a string to the properties
        percentages_dict["tooltip"] = "<br>".join(tooltip_text) if tooltip_text else "No data"

        geometry_for_row = h3.cells_to_geo(cells=[row.name])
        feature = Feature(
            geometry=geometry_for_row,
            id=row.name,
            properties=percentages_dict)
        list_features.append(feature)

    feat_collection = FeatureCollection(list_features)
    geojson_result = json.dumps(feat_collection)
    
    # Add GeoJSON layer to the map
    folium.GeoJson(
        geojson_result,
        style_function=lambda feature: style_function(feature, subspp_colors),
        name=f'{sp} Subspecies Map'
    ).add_to(map)
    
    # Add tooltips
    folium.GeoJson(
        geojson_result,
        style_function=lambda feature: {
            'weight': 0,  # No border weight
            'color': 'transparent',  # No border color
            'fillOpacity': 0.6  # Fill transparency
        },
        tooltip=GeoJsonTooltip(
            #fields=list(subspp),
            #aliases=[subsp[len(sp)+1:] for subsp in subspp], # Removes ssp name
            fields=["tooltip"],
            aliases=["Reported\nSubspecies"],
            localize=True,
            sticky=True,
            labels=True,
            labels_format="{:.2f}%",
            #highlight_function=lambda x: x.update({'text': [f'{k}: {v:.2f}%' for k, v in x['properties'].items() if v > 0]})
        )
    ).add_to(map)


    # Add legend
    legend_html = f"""
    <div style="position: fixed; top: 10px; right: 10px; width: 150px; height: auto; z-index: 9999; background-color: white; box-shadow: 0 0 5px rgba(0, 0, 0, 0.2); border: 1px solid lightgray; border-radius: 5px; padding: 10px; font-size: 10px;">
        <strong>{common_name} infraspecies</strong><br>
    """
    for subsp, color in subspp_colors.items():
        legend_html += f"""
        <div style="margin-top: 10px;">
            <span style="display: inline-block; width: 20px; height: 10px; margin-right: 10px; background-color: {color};"></span>
            {subsp}
        </div>
        """
    legend_html += "</div>"
    legend_element = folium.Element(legend_html)
    map.get_root().html.add_child(legend_element)

    # Calculate bounds and adjust the map's view
    bounds = get_bounds(geojson_result)
    map.fit_bounds(bounds)

    return map

# Example usage:
#for sp_code in sp_codes:
remake_maps = False
for sp_code in sp_codes:
    common_name = taxonomy[taxonomy['SPECIES_CODE'] == sp_code].PRIMARY_COM_NAME.values[0]
    for resolution in [2,3,4,5]:
        species = taxonomy[taxonomy['PRIMARY_COM_NAME'] == common_name].SCI_NAME.values[0]
        dataname = f"sp_cell_dfs/{species.replace(' ', '-')}_resolution{resolution}.csv"
        if not Path(dataname).exists():
            continue
        map_filename = f"docs/maps/{species.replace(' ', '-')}_{resolution}.html"
        if Path(map_filename).exists() and not remake_maps:
            continue
        sp_cell_df = pd.read_csv(dataname, index_col=0)
        sp_cell_df.columns = sp_cell_df.columns.str.replace(species + ' ', "")
        subspecies = sp_cell_df.columns[1:]
        if resolution == 2:
            adjacency_matrix, subspecies = create_distribution_adjacency_matrix(sp_cell_df, subspecies)

        m = choropleth_map(sp_cell_df, common_name, adjacency_matrix)
        m.save(f"docs/maps/{species.replace(' ', '-')}_{resolution}.html")


## Create a CSV of map URLs for the website

In [529]:
df = pd.DataFrame(columns=["common_name", "scientific_name", "resolution", "map_url"])
maps_dir = Path("/Users/tessa/Code/scratchpad/subspecies_plotter/docs/maps")
for idx, file in enumerate(maps_dir.glob("*.html")):
    resolution = file.stem.split("_")[-1]
    species = file.stem.replace(f"_{resolution}", "")
    common_name = taxonomy[taxonomy['SCI_NAME'] == species.replace('-', ' ')].PRIMARY_COM_NAME.values[0]
    map_url = Path(Path(file).parent.stem).joinpath(Path(file).name)
    print(map_url)
    df.loc[idx] = [common_name, species, resolution, map_url]
df.to_csv("docs/data/map_data.csv", index=False)

maps/Butorides-striata_4.html
maps/Butorides-striata_5.html
maps/Butorides-striata_2.html
maps/Butorides-striata_3.html
maps/Branta-bernicla_3.html
maps/Setophaga-coronata_3.html
maps/Buteo-jamaicensis_4.html
maps/Zonotrichia-leucophrys_3.html
maps/Sturnella-magna_5.html
maps/Loxia-curvirostra_3.html
maps/Garrulus-glandarius_4.html
maps/Garrulus-glandarius_5.html
maps/Loxia-curvirostra_2.html
maps/Sturnella-magna_4.html
maps/Zonotrichia-leucophrys_2.html
maps/Buteo-jamaicensis_5.html
maps/Setophaga-coronata_2.html
maps/Branta-bernicla_2.html
maps/Sturnella-magna_3.html
maps/Loxia-curvirostra_5.html
maps/Garrulus-glandarius_2.html
maps/Branta-bernicla_5.html
maps/Setophaga-coronata_5.html
maps/Buteo-jamaicensis_2.html
maps/Branta-hutchinsii_2.html
maps/Zonotrichia-leucophrys_5.html
maps/Zonotrichia-leucophrys_4.html
maps/Buteo-jamaicensis_3.html
maps/Setophaga-coronata_4.html
maps/Branta-bernicla_4.html
maps/Garrulus-glandarius_3.html
maps/Loxia-curvirostra_4.html
maps/Sturnella-magna_2

# Old color creation

```
# def generate_random_color():
#     """Generates a random color in hex format."""
#     return "#{:02x}{:02x}{:02x}".format(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))




def generate_color_from_name(name, luminance=0.5, saturation=0.8):
    """
    Generate a vivid, medium color based on a subspecies name using its hash value.
    
    Args:
    - name: The subspecies name (string) used to generate a unique color.
    - luminance: The luminance of the color (0 = dark, 1 = light, default is medium).
    - saturation: The saturation of the color (0 = grey, 1 = full saturation, default is high saturation).
    
    Returns:
    - A string representing the color in HEX format.
    """
    # Create a hash from the subspecies name
    hash_value = int(hashlib.sha256(name.encode('utf-8')).hexdigest(), 16)
    
    # Normalize the hash to be between 0 and 1 for hue (360 degrees for hue)
    hue = ((hash_value + 100) % 360) / 360.0

    # Normalize the hash for luminance and saturation
    #luminance = (hash_value % 100) / 200.0 + 0.45
    #saturation = (hash_value % 100) / 200.0 + 0.5
    
    # Generate the color in HLS (Hue, Lightness, Saturation) space
    r, g, b = colorsys.hls_to_rgb(hue, luminance, saturation)
    
    # Convert the RGB color to hexadecimal
    return rgb_to_hex((int(r*255), int(g*255), int(b*255)))

def hex_to_rgb(hex_color):
    """Convert hex color (#RRGGBB) to an (R, G, B) tuple."""
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_hex(rgb):
    """Convert an (R, G, B) tuple to a hex color (#RRGGBB)."""
    return "#{:02x}{:02x}{:02x}".format(*rgb)

def combine_rgb_colors(rgb_colors, fracs):
    """Combine a list of RGB colors proportionally."""
    if sum(fracs) == 0:
        return "#999999"
    else:
        combined_rgb = tuple(
            int(sum(frac * color[channel] for color, frac in zip(rgb_colors, fracs)))
            for channel in range(3)
        )
    return rgb_to_hex(combined_rgb)

def style_function(feature, subspp_colors):
    """Style a cell based on the proportion of subspecies."""
    properties = feature['properties']
    subspecies_values = {subsp: properties.get(subsp, 0) for subsp in subspp_colors.keys()}
    
    # Normalize the values to sum up to 1 for proportional allocation
    total = sum(subspecies_values.values())
    if total > 0:
        fracs = [value / total for value in subspecies_values.values()]
    else:
        fracs = [0 for _ in subspecies_values]
    
    # Get RGB colors for each subspecies
    hex_colors = [subspp_colors[subsp] for subsp in subspecies_values]
    rgb_colors = [hex_to_rgb(color) for color in hex_colors]
    
    # Combine colors based on the proportional fractions
    cell_color = combine_rgb_colors(rgb_colors, fracs)
    
    return {
        'fillColor': cell_color,  # Cell color
        'color': cell_color,  # Border color
        'weight': 1,  # Border weight
        'fillOpacity': 0.6,  # Cell fill transparency
    }


def choropleth_map(sp_cell_df, common_name):
    """Creates a choropleth map given species data."""
    
    f = folium.Figure()
    map = folium.Map(location=[47, -122], zoom_start=5, tiles="cartodbpositron")
    f.add_child(map)

    sp = sp_cell_df.columns[0]
    subspp = sp_cell_df.columns[1:]
    
    # Generate random colors for each subspecies
    subspp_colors = {subsp: generate_color_from_name(subsp) for subsp in subspp}
    
    list_features = []
    for _, row in sp_cell_df.iterrows():
        #percentages = (row[subspp] / row[sp]) # For the previous implementation that colored the map by % of total sightings instead of % of ssp sightings
        percentages = (row[subspp] / sum(row[subspp]))*100
        percentages_dict = percentages.to_dict()
        
        # Precompute tooltip text showing only non-zero percentages
        percentages_dict_ordered = pd.DataFrame(percentages_dict, index=['pct']).T.query('pct > 0')['pct'].sort_values(ascending=False).to_dict()

        tooltip_text = []
        for subsp, percent in percentages_dict_ordered.items():
            tooltip_text.append(f"{subsp}: {percent:.0f}%")
        
        # Add tooltip as a string to the properties
        percentages_dict["tooltip"] = "<br>".join(tooltip_text) if tooltip_text else "No data"

        geometry_for_row = h3.cells_to_geo(cells=[row.name])
        feature = Feature(
            geometry=geometry_for_row,
            id=row.name,
            properties=percentages_dict)
        list_features.append(feature)

    feat_collection = FeatureCollection(list_features)
    geojson_result = json.dumps(feat_collection)
    
    # Add GeoJSON layer to the map
    folium.GeoJson(
        geojson_result,
        style_function=lambda feature: style_function(feature, subspp_colors),
        name=f'{sp} Subspecies Map'
    ).add_to(map)
    
    # Add tooltips
    folium.GeoJson(
        geojson_result,
        style_function=lambda feature: {
            'weight': 0,  # No border weight
            'color': 'transparent',  # No border color
            'fillOpacity': 0.6  # Fill transparency
        },
        tooltip=GeoJsonTooltip(
            #fields=list(subspp),
            #aliases=[subsp[len(sp)+1:] for subsp in subspp], # Removes ssp name
            fields=["tooltip"],
            aliases=["Reported\nSubspecies"],
            localize=True,
            sticky=True,
            labels=True,
            labels_format="{:.2f}%",
            #highlight_function=lambda x: x.update({'text': [f'{k}: {v:.2f}%' for k, v in x['properties'].items() if v > 0]})
        )
    ).add_to(map)


    # Add legend
    legend_html = f"""
    <div style="position: fixed; top: 10px; right: 10px; width: 150px; height: auto; z-index: 9999; background-color: white; box-shadow: 0 0 5px rgba(0, 0, 0, 0.2); border: 1px solid lightgray; border-radius: 5px; padding: 10px; font-size: 10px;">
        <strong>{common_name} subspecies</strong><br>
    """
    for subsp, color in subspp_colors.items():
        legend_html += f"""
        <div style="margin-top: 10px;">
            <span style="display: inline-block; width: 20px; height: 10px; margin-right: 10px; background-color: {color};"></span>
            {subsp}
        </div>
        """
    legend_html += "</div>"
    legend_element = folium.Element(legend_html)
    map.get_root().html.add_child(legend_element)

    # Calculate bounds and adjust the map's view
    bounds = get_bounds(geojson_result)
    map.fit_bounds(bounds)

    return map

# Example usage:
for sp_code in sp_codes:
    common_name = taxonomy[taxonomy['SPECIES_CODE'] == sp_code].PRIMARY_COM_NAME.values[0]
    for resolution in [2,3,4,5]:
        species = taxonomy[taxonomy['PRIMARY_COM_NAME'] == common_name].SCI_NAME.values[0]
        dataname = f"sp_cell_dfs/{species.replace(' ', '-')}_resolution{resolution}.csv"
        if not Path(dataname).exists():
            continue
        sp_cell_df = pd.read_csv(dataname, index_col=0)
        sp_cell_df.columns = sp_cell_df.columns.str.replace(species + ' ', "")
        m = choropleth_map(sp_cell_df, common_name)
        m.save(f"docs/maps/{species.replace(' ', '-')}_{resolution}.html")

```