In [1]:
import pandas as pd

# Prep data

In [190]:
#df = pd.read_csv("ebd_US-AL-101_202204_202204_relApr-2022_SAMPLE/ebd_US-AL-101_202204_202204_relApr-2022.txt", sep="\t", nrows=10)
df = pd.read_csv("ebd-sample.txt", sep="\t")

## Remove duplicate checklists

In [191]:
df = df[df['GROUP IDENTIFIER'].isnull() | ~df[df['GROUP IDENTIFIER'].notnull()].duplicated(subset=["GROUP IDENTIFIER", "SCIENTIFIC NAME"],keep='first')]


## Remove unconfirmed observations

In [192]:
df = df[df["APPROVED"] == 1]

## Just subset to the columns we need

In [None]:
df.columns

In [None]:
needed_columns = ['TAXONOMIC ORDER',
       'CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 'SCIENTIFIC NAME',
       'SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME', 'EXOTIC CODE',
       'LOCALITY', 'LOCALITY ID', 'LOCALITY TYPE', 'SAMPLING EVENT IDENTIFIER',
       'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE']
df = df[needed_columns]
df.head()

## Are there subspsecies in this dataset?

In [None]:
df["SUBSPECIES COMMON NAME"].unique()

# Give everyone a grid cell

In [196]:
import h3

H3 resolutions
```
Res	Average Hexagon Area (km2)	Pentagon Area* (km2)	Ratio (P/H)
0	4,357,449.416078381	2,562,182.162955496	0.5880
1	609,788.441794133	328,434.586246469	0.5386
2	86,801.780398997	44,930.898497879	0.5176
3	12,393.434655088	6,315.472267516	0.5096
4	1,770.347654491	896.582383141	0.5064
5	252.903858182	127.785583023	0.5053
6	36.129062164	18.238749548	0.5048
7	5.161293360	2.604669397	0.5047
```

H3 number of cells
```
Res	Total number of cells	Number of hexagons	Number of pentagons
0	122	110	12
1	842	830	12
2	5,882	5,870	12
3	41,162	41,150	12
4	288,122	288,110	12
5	2,016,842	2,016,830	12
6	14,117,882	14,117,870	12
```

In [197]:
# Create a ~ grid of the earth at the 4th resolution (288,122 cells, avging 1700 km^2 in area)
resolution = 4

In [198]:
# Convert latitude and longitude to an H3 hexagon ID
df['hex_id'] = df.apply(lambda row:  h3.latlng_to_cell(row.LATITUDE, row.LONGITUDE, resolution), axis=1)

# Calculate the color for each ssp for each grid cell

## Get the subspecies for each species

In [None]:
df["SUBSPECIES COMMON NAME"][0]

In [None]:
pd.unique()

In [None]:
spp = df["SCIENTIFIC NAME"].unique()
subspp_dict = {s: df[df["SCIENTIFIC NAME"] == s]["SUBSPECIES SCIENTIFIC NAME"].dropna().unique() for s in spp}
subspp_dict

## For the species
### For the grid cell
#### Calculate total number of checklists of that species
#### Calculate percentage of those checklists containing each group

In [245]:
def get_grid_cell_species_data(cell_df, sp, subspp):
    num_checklists = cell_df["SAMPLING EVENT IDENTIFIER"].nunique()
    cell_data = {'cell_id': cell_df["hex_id"].iloc[0]}
    cell_data[sp] = num_checklists
    for subsp in subspp:
        num_subsp = cell_df[cell_df["SUBSPECIES SCIENTIFIC NAME"] == subsp].shape[0]
        cell_data[subsp] = num_subsp/num_checklists
    return cell_data

In [None]:
sp = spp[0]
subsp = subspp_dict[sp]
cell_dicts = []
for cell in df["hex_id"].unique():
    cell_df = df[df["hex_id"] == cell]
    cell_data = get_grid_cell_species_data(cell_df, sp, subsp)
    cell_dicts.append(cell_data)
sp_cell_df = pd.DataFrame(cell_dicts, index=range(len(cell_dicts)))
sp_cell_df.set_index("cell_id", inplace=True)
sp_cell_df

In [None]:
sp_cell_df.loc["8428d59ffffffff"]

In [171]:
import folium
from geojson import Feature, Point, FeatureCollection
import json
import matplotlib
import matplotlib.pyplot as plt



In [398]:
colormap_names = ["Reds", "Blues", "Oranges", "Purples", "Greens"]

def generate_legend_html(subspp_colors, common_name, sci_name):
    """
    Generate HTML for the legend dynamically based on subspecies and colormaps.

    Created by ChatGPT.

    Args:
    - subspp_colors: Dictionary mapping subspecies to (colormap, norm) tuples.

    Returns:
    - HTML string for the legend.
    """
    legend_html = f"""
    <div style="
        position: fixed;
        bottom: 50px;
        left: 50px;
        width: 150px;
        height: auto;
        z-index: 9999;
        background-color: white;
        box-shadow: 0 0 5px rgba(0, 0, 0, 0.2);
        border: 1px solid lightgray;
        border-radius: 5px;
        padding: 10px;
        font-size: 10px;
        line-height: 10px;
    ">
        <strong>{common_name} subspecies</strong><br>
    """

    for subsp, (cmap, norm) in subspp_colors.items():
        # Generate gradient for the colormap
        gradient = "background: linear-gradient(to right, "
        gradient += ", ".join(
            matplotlib.colors.to_hex(cmap(norm(v)))
            for v in [0, 0.25, 0.5, 0.75, 1]
        )
        gradient += ");"
        
        legend_html += f"""
        <div style="margin-top: 10px;">
            <span style="display: inline-block; width: 20px; height: 10px; margin-right: 10px; {gradient}"></span>
            {subsp[len(sci_name)+1:]}
        </div>
        """
    
    legend_html += "</div>"
    return legend_html

def get_bounds(geojson_result):
    """
    Calculate the bounding box of all features in the GeoJSON.

    Args:
    - geojson_result: GeoJSON string with features.

    Returns:
    - Bounds as [[southwest_lat, southwest_lon], [northeast_lat, northeast_lon]].
    """
    import json
    geojson_data = json.loads(geojson_result)
    all_coords = []

    for feature in geojson_data['features']:
        # Extract all coordinates from the polygon or multipolygon
        coords = feature['geometry']['coordinates']
        if feature['geometry']['type'] == "Polygon":
            all_coords.extend(coords[0])  # Add outer ring of the polygon
        elif feature['geometry']['type'] == "MultiPolygon":
            for poly in coords:
                all_coords.extend(poly[0])  # Add outer ring of each polygon

    # Extract longitudes (x) and latitudes (y) correctly
    lons, lats = zip(*all_coords)
    return [[min(lats), min(lons)], [max(lats), max(lons)]]

def get_color(cmap, value, norm):
    """Get a color from a colormap based on a value
    Args:
    - cmap: Colormap object
    - value: Value to get color for
    - norm: a matplotlib Normalize object to apply to the colormap

    Created by ChatGPT
    """
    rgba = cmap(norm(value))  # Get RGBA from colormap
    return matplotlib.colors.to_hex(rgba)  # Convert to HEX for Folium

def hex_to_rgb(hex_color):
    """
    Convert hex color (#RRGGBB) to an (R, G, B) tuple.

    Created by ChatGPT
    """
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_hex(rgb):
    """
    Convert an (R, G, B) tuple to a hex color (#RRGGBB).

    Created by ChatGPT
    """
    return "#{:02x}{:02x}{:02x}".format(*rgb)

def proportionally_combine_hex_colors(hex_colors, fracs):
    """
    Combine a list of hex colors proportionally.

    Created by ChatGPT
    """
    # Convert hex colors to RGB
    rgb_colors = [hex_to_rgb(color) for color in hex_colors]

    # Combine colors proportionally
    combined_rgb = tuple(
        sum(frac * color[channel] for color, frac in zip(rgb_colors, fracs))
        for channel in range(3)
    )
    
    final_hex = rgb_to_hex(combined_rgb)
    if final_hex == "#000000":
        return "#FFFFFF"
    return final_hex

# Style function
def style_function(feature, subspp_to_cmap):
    """Style a cell based on the proportion of subspecies.

    Created with assistance of ChatGPT and GitHub Copilot.

    Args:
    - feature: GeoJSON feature
    - subspp_to_cmap: Dictionary mapping subspecies to (colormap, norm) tuples

    Returns:
    - Style dictionary for the cell
    """

    properties = feature['properties']
    subspecies_values = {subsp: properties.get(subsp, 0) for subsp in subspp_to_cmap.keys()}
    #subspecies_colors = {subsp: get_color(cmap, subspecies_values[subsp], norm) for subsp, (cmap, norm) in subspp_to_cmap.items()}

    
    # Normalize the values to sum up to 1 for proportional allocation
    total = sum(subspecies_values.values())
    if total > 0:
        fracs = [ int(value / total) for value in subspecies_values.values()]
    else:
        fracs = [0 for value in subspecies_values.values()]

    # Get hexadecimal colors
    hex_colors = [get_color(cmap, value, norm) for value, (cmap, norm) in zip(subspecies_values.values(), subspp_to_cmap.values())]

    cell_color = proportionally_combine_hex_colors(hex_colors, fracs)

    return {
        'fillColor': cell_color,
        'color': '#000000',  # Border color
        'weight': 0.1,
        'fillOpacity': 0.7,
    }


def choropleth_map(sp_cell_df, common_name, colormap_names, border_color = 'black', fill_opacity = 0.7, initial_map = None):
    """
    Creates choropleth maps given the aggregated data. initial_map can be an existing map to draw on top of.

    Adapted from: https://jens-wirelesscar.medium.com/lhexagone-in-hexagons-uber-h3-map-1566bc412172
    """    

    f = folium.Figure(width=600, height=450)
    if initial_map is None:
        initial_map = folium.Map(location= [47, -122], zoom_start=5, tiles="cartodbpositron")
    f.add_child(initial_map)

    sp = sp_cell_df.columns[0]
    subspp = sp_cell_df.columns[1:]
    subspp_colors = dict(zip(subspp, colormap_names))

    subsp_containing_rows = sp_cell_df[sp_cell_df[subspp].sum(axis=1) > 0]
    list_features = []

    # Produce a GeoJSON feature containing properties for the frequency of each subspecies for each cell
    #for i, row in subsp_containing_rows.iterrows():
    for i, row in sp_cell_df.iterrows():
        percentages = (row[subspp]/row[sp])
        percentages_dict = percentages.to_dict()
        geometry_for_row = h3.cells_to_geo(cells=[row.name])
        feature = Feature(geometry = geometry_for_row , id=row.name, properties = percentages_dict)
        list_features.append(feature)

    feat_collection = FeatureCollection(list_features)
    geojson_result = json.dumps(feat_collection)

    # Define colormaps for subspecies
    subspp_colors = dict()
    for subsp, cmap_name in zip(subspp, colormap_names):
        cmap = matplotlib.colormaps.get_cmap(cmap_name)
        vmax = subsp_containing_rows[subsp].max()
        vmin = 0
        norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
        subspp_colors[subsp] = (cmap, norm)

    folium.GeoJson(
        geojson_result,
        style_function=lambda feature: style_function(feature, subspp_colors),
        name = f'{sp} Subspecies Map'
    ).add_to(initial_map)

    # Add legend
    legend_html = generate_legend_html(subspp_colors, common_name=common_name, sci_name=sp)
    legend_element = folium.Element(legend_html)
    initial_map.get_root().html.add_child(legend_element)

    # Calculate bounds and adjust the map's view
    bounds = get_bounds(geojson_result)
    initial_map.fit_bounds(bounds)
    print(bounds)

    return initial_map



choropleth_map(sp_cell_df, common_name="Canada Jay", colormap_names=colormap_names)

[[14.78097, -152.596164], [64.947214, -52.522921]]
