# select hawkears detections to localize

summarize detections of all species in HawkEars into a 0/1 table with a set of species of interest

use a threshold of -1.0 (logit score) for all species

save sparse dataframes for space and memory efficiency

Note: the complete set of HawkEars scores used by this script is not provided in this repository; the script is provided for reference only.


# aggregate detections and store as sparse dataframe
let's do just one deployment to start

In [None]:
import pandas as pd
from glob import glob
from pathlib import Path
from scipy.sparse import csr_matrix

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize']=[12,8] #for big visuals
%config InlineBackend.figure_format = 'retina'

# paths
score_files_dir = "REDACTED" # matches out_dir of 1_detect_species_HawkEars.py
save_dir = "REDACTED" # matches 
audio_dir = "REDACTED" 

# set up variables

In [None]:
# Deployment to process first.
dpl = "SBT-6-76"

# Sort the glob results so the file order (and therefore the row order of the
# concatenated detections) is reproducible across runs and file systems.
score_files = sorted(glob(f"{score_files_dir}/{dpl}/*/*.csv"))
audio_files = sorted(glob(f"{audio_dir}/{dpl}/*/*.wav"))

# Point save_dir at this deployment's sub-folder. The guard keeps the cell
# idempotent: re-running it must not append the deployment name a second time.
if Path(save_dir).name != dpl:
    save_dir = f"{save_dir}/{dpl}"
# parents=True: the root output folder may not exist yet.
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Displayed as the cell output: (number of audio files, number of score files).
len(audio_files), len(score_files)

(2160, 50)

# Note: 
Audio folders have a few small files (1-10 MB) that seem to be trials, and then 160 MB files that seem to be the real deal.

For SBT-3-18, for example, only one day has the full set of recordings: 29-minute recordings starting on the half hour from 05:00 to 08:30

There are 4 short files at other miscellaneous times.

We should probably only use the longer, scheduled recordings. I'm guessing the others won't line up with recordings across the array and were not meant to be used as data.

Some grids have >1 day: e.g. SBT-6-83 has 5 days of complete recording schedule. 

## aggregate score dfs into sparse detection df
- Threshold detections
- make sparse df
- concatenate across all cards in the grid


In [None]:
# Detection threshold shared by every class: a score counts as a detection
# only when it is strictly greater than this value.
threshold = -1

# HawkEars output classes that are not species; these columns are dropped.
classes_to_skip = ["Noise", "Mashup", "Other"]
# Longer candidate list kept for reference (not applied):
# 'Philadelphia Vireo', 'Lazuli Bunting', "Bewick's Wren",
# "MacGillivray's Warbler", "Squirrel", "Purple Finch", "Canine", "Noise",
# "Other", "Gray Treefrog", 'Spring Peeper', 'American Goshawk',
# 'Pacific-slope Flycatcher', 'Mashup', 'Rooster', 'Black-crowned Night Heron'

# Make sure the output folder exists before anything is written to it;
# parents=True also creates missing parent folders instead of raising.
Path(save_dir).mkdir(parents=True, exist_ok=True)


def make_spars_df(path):
    """Load one score CSV and return it as a sparse detection table.

    The first three CSV columns become the row index, the columns named in
    the module-level ``classes_to_skip`` are dropped, and every remaining
    score is compared against the module-level ``threshold``.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts).

    Returns:
        A sparse-backed DataFrame with the same index and class columns as
        the CSV, holding True where the score is strictly above ``threshold``.

    Raises:
        KeyError: if a class listed in ``classes_to_skip`` is not a column
            of the CSV.
    """
    scores = pd.read_csv(path, index_col=[0, 1, 2]).drop(columns=classes_to_skip)
    # Only the cells above threshold are stored in the sparse matrix.
    above_threshold = csr_matrix(scores > threshold)
    return pd.DataFrame.sparse.from_spmatrix(
        above_threshold,
        index=scores.index,
        columns=scores.columns,
    )


# Threshold every score file and stack the per-file sparse tables.
sparse_detection_dfs = [make_spars_df(path) for path in score_files]
detections = pd.concat(sparse_detection_dfs)

# Optional sanity check that the sparse table survives a pickle round trip:
# detections.to_pickle('./sparse_detections_grid1.pkl')
# reloaded = pd.read_pickle('./sparse_detections_grid1.pkl')
# reloaded.shape
# (detections.astype(int).sum(0) == reloaded.astype(int).sum(0)).all()

# Rows where no class exceeds the threshold carry no information; drop them.
filtered = detections[detections.astype(bool).sum(axis=1) > 0]

# Join folder and file name with an explicit separator. Plain string
# concatenation glued the file name onto the folder name, so the pickle ended
# up next to the deployment folder instead of inside it.
dest = f"{save_dir}/dets_{dpl}_thresh{threshold}.pkl"
filtered.to_pickle(dest)
print(f"saved pickled dets to {dest}")

In [None]:
# Column-wise sum of the detection table: one count per class.
dpl_cnts = filtered.sum(axis=0)

In [None]:
# The 20 classes with the most detections, skipping classes never detected.
(
    dpl_cnts[dpl_cnts > 0]
    .sort_values(ascending=False)
    .head(20)
)

Red-eyed Vireo                  481445
White-throated Sparrow           80963
Ovenbird                         68431
Winter Wren                      45076
Yellow-rumped Warbler            44519
American Robin                   26629
Western Tanager                  20836
Black-throated Green Warbler     12854
Mourning Warbler                 11296
Blue-headed Vireo                10967
Rose-breasted Grosbeak           10738
Swainson's Thrush                10576
Purple Finch                      9764
Squirrel                          9697
Harris's Sparrow                  8536
Philadelphia Vireo                7584
Hermit Thrush                     6554
Mountain Chickadee                6182
Canada Warbler                    6153
Townsend's Solitaire              5341
dtype: Sparse[int64, 0]

# repeat for all arrays

In [None]:
# Every sub-folder of score_files_dir is treated as one deployment. Plain
# files are skipped, and the list is sorted so the processing order is stable.
all_dpl = sorted(
    Path(f).name for f in glob(f"{score_files_dir}/*") if Path(f).is_dir()
)

# save_dir may already point at one deployment's sub-folder (an earlier cell
# rebinds it to f"{save_dir}/{dpl}"). Step back up to the shared root so that
# outputs land in <root>/<dpl>/ rather than <root>/<first dpl>/<dpl>/.
save_root = save_dir
if Path(save_root).name in all_dpl:
    save_root = str(Path(save_root).parent)

for dpl in all_dpl:
    score_files = sorted(glob(f"{score_files_dir}/{dpl}/*/*.csv"))
    if not score_files:
        # Nothing to aggregate; do not leave an empty output folder behind.
        print(f"no dets for {dpl}")
        continue

    sparse_detection_dfs = [make_spars_df(path) for path in score_files]
    detections = pd.concat(sparse_detection_dfs)

    # Rows where no class exceeds the threshold carry no information; drop them.
    filtered = detections[detections.astype(bool).sum(axis=1) > 0]

    # parents=True: the root output folder may not exist yet.
    save_path = f"{save_root}/{dpl}"
    Path(save_path).mkdir(parents=True, exist_ok=True)

    dest = f"{save_path}/dets_{dpl}_thresh{threshold}.pkl"
    filtered.to_pickle(dest)
    print(f"saved pickled dets to {dest}")