In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the local `lib` helper) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from os import path, environ
from lib import get_mysql_conn, get_sqlite_conn
from IPython.display import display
from datetime import datetime

In [3]:
# Location of the MySQL option file (~/.my.cnf) handed to get_mysql_conn;
# presumably it holds the connection credentials -- confirm against lib.
# expanduser("~") yields $HOME when that variable is set and otherwise falls
# back to the platform's notion of the home directory, so a missing HOME
# variable no longer aborts the notebook with a KeyError.
SQL_CONFIG_FILE = path.join(path.expanduser("~"), ".my.cnf")

In [4]:
# Lower-case scientific names of the families whose species are exported.
TARGET_FAMILIES = [
    'andrenidae',
    'apidae',
    'colletidae',
    'halictidae',
    'megachilidae',
    'melittidae',
]

# Selects the taxon ids (tid) of the target families. The IN (...) list is
# rendered from the hard-coded TARGET_FAMILIES constant only, and the result
# is a complete statement that is executed without bound parameters.
SQL_GET_FAMILIES = """
    SELECT tid FROM taxa
    WHERE LOWER(sciName) IN ({}) 
""".format(','.join("'{}'".format(family) for family in TARGET_FAMILIES))

# Selects the rankID for a rank name; the single %s placeholder is bound by
# the database driver and compared against the lower-cased rankName.
SQL_GET_RANK_BY_NAME = """
    SELECT rankID from taxonunits
    WHERE LOWER(rankName) = %s
"""

# Selects the scientific names of taxa at a given rank (first %s) that have a
# taxaenumtree row whose parenttid is in a given collection (second %s).
# GROUP BY t.tid collapses the join to one row per taxon.
# NOTE(review): presumably taxaenumtree lists every ancestor of a taxon, so
# the family ids match species at any depth below them -- confirm with schema.
SQL_GET_ALL_BEES = """
    SELECT t.sciName FROM taxa t
    INNER JOIN taxaenumtree te
    ON t.tid = te.tid
    WHERE t.rankID = %s
    AND te.parenttid IN %s
    GROUP BY t.tid
"""

# Output CSV name prefixed with today's ISO date (YYYY-MM-DD). The explicit
# "%Y-%m-%d" is used instead of "%F" because "%F" is a C-library extension
# that is not available on every platform.
OUT_FILE = "{}_scan-bee-list.csv".format(datetime.now().strftime("%Y-%m-%d"))

In [5]:
# Resolve the taxon ids (tid) of the target families.
scan_conn = get_mysql_conn(SQL_CONFIG_FILE)

try:
    with scan_conn.cursor() as cursor:
        cursor.execute(SQL_GET_FAMILIES)
        # Rows are accessed by column name, so the cursor must return
        # mapping-style rows.
        family_tids = [f['tid'] for f in cursor.fetchall()]

finally:
    # Release the connection whether or not the query succeeded.
    scan_conn.close()

# Fail here with a clear message: an empty id list would otherwise only
# surface later as an empty IN clause in the species query.
if not family_tids:
    raise ValueError(
        "No taxa found for families: {}".format(", ".join(TARGET_FAMILIES))
    )

In [6]:
# Look up the rankID that the taxonunits table assigns to the 'species' rank.
scan_conn = get_mysql_conn(SQL_CONFIG_FILE)

try:
    with scan_conn.cursor() as cursor:
        cursor.execute(SQL_GET_RANK_BY_NAME, ('species',))
        # fetchone() returns None when the query matched no row.
        rank_row = cursor.fetchone()

finally:
    # Release the connection whether or not the query succeeded.
    scan_conn.close()

# Report a missing rank explicitly instead of failing with
# "'NoneType' object is not subscriptable".
if rank_row is None:
    raise LookupError("No rank named 'species' found in taxonunits")

species_rank_id = rank_row["rankID"]

In [7]:
# Fetch the names of all taxa at the species rank under the target families,
# keep one row per scientific name, and preview the first 40 rows.
scan_conn = get_mysql_conn(SQL_CONFIG_FILE)

try:
    species_df = pd.read_sql(
        sql=SQL_GET_ALL_BEES,
        con=scan_conn,
        params=(species_rank_id, family_tids),
    ).drop_duplicates(subset=["sciName"])
    display(species_df.head(40))

finally:
    # Release the connection whether or not the query succeeded.
    scan_conn.close()

Unnamed: 0,sciName
0,Alcidamea truncata
1,Anthocopa abjecta
2,Anthocopa anceyi
3,Anthocopa anthodyta
4,Anthocopa arizonensis
5,Anthocopa beameri
6,Anthocopa bidentata
7,Anthocopa compacta
8,Anthocopa copelandica
9,Anthocopa croatica


In [8]:
# Write the de-duplicated species list to the dated CSV file. The DataFrame
# index is written as the first (unnamed) column, which is the pandas default.
species_df.to_csv(path_or_buf=OUT_FILE, index=True)