# Filtrar detecciones 

In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import birdnames as bn
from pathlib import Path

# Read every standardized-detections CSV so they can be merged into a single frame.
folder_path = "./Detecciones_estandarizadas"

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sorting keeps
# the concatenated row order stable across machines, which matters because the clips
# are later drawn with seeded .sample() calls that depend on row position.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
if not csv_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No se encontraron archivos .csv en {folder_path}")

dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in csv_files]
df_concat = pd.concat(dfs, ignore_index=True)

# Number of distinct species and projects in the merged detections
print(f"Número de especies: {df_concat['speciesName'].nunique()}")
print(f"Número de proyectos: {df_concat['projectName'].nunique()}")


Número de especies: 631
Número de proyectos: 1


## Filtrar especies objetivo

In [26]:
# Target species lists: the general list plus the Putumayo-only list (IAvH data).
targets_df = pd.read_csv('./Especies_objetivo_escuchaton.csv')
spp_putumayo = pd.read_csv('./Especies_objetivo_putumayo.csv')
targets_merged = pd.concat([targets_df, spp_putumayo], ignore_index=True)

# Keep only the detections whose species appears in the merged target list.
target_names = targets_merged['Clements2024_Scientific_Name']
is_target = df_concat['speciesName'].isin(target_names)
df_concat = df_concat[is_target]

# Keep only species with at least 50 clips, then show the clip count per species.
clips_per_species = df_concat.groupby('speciesName')['speciesName'].transform('size')
df_concat = df_concat[clips_per_species >= 50]
df_concat['speciesName'].value_counts()

speciesName
Nothocrax urumutum          11874
Myrmothera campanisona       4002
Cyanocorax violaceus         3762
Liosceles thoracicus         2104
Ramphastos tucanus           1951
Piaya melanogaster           1852
Percnostola rufifrons         849
Hylophylax naevius            426
Akletos melanoceps            367
Thamnomanes caesius           144
Chaetura pelagica             110
Corythopis torquatus          101
Phlegopsis nigromaculata       85
Pandion haliaetus              76
Crax alector                   71
Formicarius colma              63
Name: count, dtype: int64

## Filtrar por puntaje de detección (confidence)

En este paso se seleccionan los clips para verificación siguiendo el protocolo de Navine et al. (2024) de 4 grupos (bins) logarítmicos: los puntajes de detección se convierten a logit y se selecciona un número igual de grabaciones en cada uno de los 4 grupos, definidos según estos criterios:

- El 50 % de los puntajes más bajos se asigna al grupo 1
- El 25 % de los siguientes puntajes más bajos se asigna al grupo 2
- El 12,5 % de los siguientes puntajes más bajos se asigna al grupo 3
- El 12,5 % restante (los puntajes más altos) se asigna al grupo 4


In [27]:

def safe_logit(p, eps=1e-6):
    """Return the logit (log-odds) of probability-like scores.

    Args:
        p: Scalar or array-like of scores; converted to a float numpy array.
        eps: Scores are clipped to ``[eps, 1 - eps]`` before the transform so
            that exact 0 and 1 give finite values instead of -inf / +inf.

    Returns:
        numpy float (scalar input) or ndarray with ``log(p / (1 - p))`` of the
        clipped scores.
    """
    clipped = np.clip(np.asarray(p, dtype=float), eps, 1 - eps)
    odds = clipped / (1 - clipped)
    return np.log(odds)

# Cumulative quantile edges of the four bins: lowest 50 %, next 25 %, next 12.5 %, top 12.5 %.
quantiles = [0, 0.50, 0.75, 0.875, 1.0]
# One label per interval between consecutive edges: 1 = lowest-score bin, 4 = highest-score bin.
labels = [1, 2, 3, 4]


In [28]:
def get_additional_selection_table(csv):
    """Build a Raven-style selection table from a frame of sampled detections.

    Rows are sorted by descending confidence and numbered in ``Selection``.
    Every clip is given a fixed 0-3 s selection window, and empty annotation
    columns (``verify``, ``song_type``, ``reference``, ``add2library``,
    ``notes``) are added for the reviewer to fill in.

    Args:
        csv: DataFrame containing at least the columns ``speciesName``,
            ``confidence``, ``segmentID``, ``filePath``, ``startTime``,
            ``endTime``, ``classifiedBy``, ``timestamp``, ``recorderID`` and
            ``Begin Path``. It is not modified.

    Returns:
        A new DataFrame whose columns follow the ``order`` list below; columns
        in that list that are not present in the table are skipped.

    Raises:
        KeyError: if any of the required input columns is missing.
    """
    # Each source column is listed once (the original list repeated
    # startTime/endTime and had to drop the duplicates afterwards).
    source_columns = [
        'speciesName', 'confidence', 'segmentID', 'filePath', 'startTime',
        'endTime', 'classifiedBy', 'timestamp', 'recorderID', 'Begin Path',
    ]
    raven = (
        csv[source_columns]
        .sort_values(by='confidence', ascending=False)
        .reset_index(drop=True)
    )

    raven['Selection'] = range(1, len(raven) + 1)
    raven['View'] = 'Spectrogram 1'
    raven['Channel'] = 1
    # Text before the first underscore of segmentID. Stored as 'group_id' so it
    # matches the name used in `order` and is not dropped by the final reindex.
    raven['group_id'] = raven['segmentID'].astype(str).str.split('_').str[0]
    raven['Begin Time (s)'] = 0
    raven['End Time (s)'] = 3
    raven['File Offset (s)'] = 0

    # Convert scientific names to common names.
    converter = bn.Converter(
        from_type="scientific_name",
        to_type="common_name",
        from_authority="ebird"
    )
    raven['Common Name'] = converter.convert(raven['speciesName'])

    # Empty annotation columns.
    raven['verify'] = 'NA'
    raven['song_type'] = 'NA'
    raven['reference'] = 'NA'
    raven['add2library'] = 0
    raven['notes'] = 'NA'

    def _classifier_name(classified_by):
        """Map the free-text classifier field to 'BirdNET', 'Perch' or None."""
        text = str(classified_by).lower()
        if 'birdnet' in text:
            return 'BirdNET'
        if 'perch' in text:
            return 'Perch'
        return None

    raven['method'] = raven['classifiedBy'].apply(_classifier_name)
    # First 10 characters of the timestamp; str() keeps this working when the
    # column holds non-string values.
    raven['Date'] = raven['timestamp'].apply(
        lambda ts: str(ts)[:10] if pd.notna(ts) else None
    )

    # Target names are exactly those listed in `order`. The original used
    # ' Orig_path' (leading space) and never renamed 'confidence', so both
    # columns were silently lost.
    raven = raven.rename(columns={
        'speciesName': 'Scientific Name',
        'confidence': 'Confidence',
        'filePath': 'Orig_path',
        'recorderID': 'Punto',
        'startTime': 'Orig_start',
        'endTime': 'Orig_end',
    })

    order = ['Selection', 'View', 'Channel', 'group_id', 'Begin Time (s)',
             'End Time (s)', 'File Offset (s)', 'Common Name', 'Scientific Name',
             'Confidence', 'Scientific Name2', 'Scientific Name3', 'Date', 'Punto',
             'Begin Path', 'Orig_path', 'Orig_start', 'Orig_end', 'verify',
             'song_type', 'reference', 'add2library', 'notes', 'method']

    # Keep only the columns named in `order` that actually exist, in that order.
    existing_columns = [col for col in order if col in raven.columns]
    raven = raven.reindex(columns=existing_columns)
    return raven
 

In [29]:
def _assign_logit_groups(group):
    """Return a copy of ``group`` with ``logits`` and ``logit_group`` columns added."""
    group = group.copy()
    group['logits'] = safe_logit(group['confidence'])
    try:
        group['logit_group'] = pd.qcut(group['logits'], q=quantiles, labels=labels)
    except ValueError:
        # Many identical scores make two quantile edges coincide. With a fixed
        # label list qcut then raises (also with duplicates="drop", because the
        # label count no longer matches the remaining bins). Ranking with
        # method="first" breaks the ties so the requested bins can be formed.
        tie_broken = group['logits'].rank(method='first')
        group['logit_group'] = pd.qcut(tie_broken, q=quantiles, labels=labels)
    return group


def _sample_two_subsets(group, target_per_group):
    """Draw two disjoint samples of ``target_per_group`` rows per logit group.

    Groups are visited from highest (4) to lowest (1). When a group has too few
    rows, the shortfall is carried over and requested from the next lower group.
    A shortfall still pending after group 1 is simply not filled.
    """
    sampled_rows_1 = []
    sampled_rows_2 = []
    extra_needed_1 = 0
    extra_needed_2 = 0

    for current_group in (4, 3, 2, 1):
        current_rows = group[group['logit_group'] == current_group]
        n_rows = len(current_rows)
        need_1 = target_per_group + extra_needed_1

        if n_rows >= need_1:
            # First subset: fixed seed so the selection is repeatable.
            sampled_current_1 = current_rows.sample(n=need_1, random_state=42)
            extra_needed_1 = 0
            # Second subset is drawn only from rows not used by the first one.
            remaining_rows = current_rows.drop(sampled_current_1.index)
            need_2 = target_per_group + extra_needed_2
            if len(remaining_rows) >= need_2:
                sampled_current_2 = remaining_rows.sample(n=need_2, random_state=99)
                extra_needed_2 = 0
            else:
                sampled_current_2 = remaining_rows
                extra_needed_2 = need_2 - len(remaining_rows)
        else:
            # Not enough rows even for subset 1: it takes them all and subset 2
            # gets nothing from this group.
            sampled_current_1 = current_rows
            extra_needed_1 = need_1 - n_rows
            sampled_current_2 = current_rows.iloc[0:0]
            extra_needed_2 += target_per_group

        sampled_rows_1.append(sampled_current_1)
        sampled_rows_2.append(sampled_current_2)

    sampled_1 = pd.concat(sampled_rows_1, ignore_index=True)
    sampled_2 = pd.concat(sampled_rows_2, ignore_index=True)
    return sampled_1, sampled_2


def get_selections(df, clips_por_grupo):
    """Select verification clips per species and write them to disk.

    For every species in ``df`` the confidence scores are converted to logits,
    split into the bins defined by the module-level ``quantiles`` / ``labels``,
    and two disjoint samples of ``clips_por_grupo`` rows per bin are drawn.

    Files written for each species:
        ./Detecciones_filtradas/<species>.csv
        ./Detecciones_filtradas_adicionales/<species>_adicionales.csv
        ./Datos/<species>/<species>_S1.txt                          (tab-separated)
        ./Datos_adicionales/<species>/<species>_adicionales_S1.txt  (tab-separated)

    Args:
        df: Detections with at least ``speciesName`` and ``confidence`` columns,
            plus the columns required by ``get_additional_selection_table``.
        clips_por_grupo: Number of clips requested from each bin for each subset.

    Returns:
        None. The results are the files written to disk.
    """
    output_folder = "./Detecciones_filtradas"
    output_additional = "./Detecciones_filtradas_adicionales"
    # The output folders do not depend on the species, so create them once.
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(output_additional, exist_ok=True)

    for species, group in df.groupby('speciesName'):
        group = _assign_logit_groups(group)
        sampled_1, sampled_2 = _sample_two_subsets(group, clips_por_grupo)

        # 'logits' is only a helper column; 'logit_group' stays in the output.
        sampled_1 = sampled_1.drop(columns='logits')
        sampled_2 = sampled_2.drop(columns='logits')
        sampled_1.to_csv(os.path.join(output_folder, f"{species}.csv"), index=False)
        sampled_2.to_csv(
            os.path.join(output_additional, f"{species}_adicionales.csv"), index=False
        )

        raven_1 = get_additional_selection_table(csv=sampled_1)
        raven_2 = get_additional_selection_table(csv=sampled_2)
        dir_tables = f"./Datos/{species}"
        dir_tables_additional = f"./Datos_adicionales/{species}"
        os.makedirs(dir_tables, exist_ok=True)
        os.makedirs(dir_tables_additional, exist_ok=True)
        raven_1.to_csv(
            os.path.join(dir_tables, f"{species}_S1.txt"), index=False, sep='\t'
        )
        raven_2.to_csv(
            os.path.join(dir_tables_additional, f"{species}_adicionales_S1.txt"),
            index=False, sep='\t'
        )


In [None]:
# Run the selection for every species in df_concat, requesting 12 clips per logit group for each subset.
get_selections(df=df_concat, clips_por_grupo=12)