# Filtrar detecciones 

In [10]:
import os

begin_path = '/Users/santiagoruiz/Downloads/Validation_1to5'

# Collect the full path of every .wav file (case-insensitive extension) found
# under begin_path, descending into all subfolders.
wav_files = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(begin_path)
    for filename in filenames
    if filename.lower().endswith('.wav')
]

len(wav_files)


2061

In [11]:
import pandas as pd
# Load the standardized detections subset; its segmentID column is what gets
# matched against the .wav file names collected in wav_files.
data = pd.read_csv(
    '/Users/santiagoruiz/Documents/Escuchaton/Detecciones_estandarizadas/nambi_std_subset.csv'
)

# Keep the segment IDs in a set so membership checks are constant time.
segment_ids = set(data['segmentID'].tolist())

def extract_segment_id_from_path(path):
    """Return the file name of ``path`` with its directory and final extension removed."""
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem

# Map each known segmentID to the .wav path whose file stem equals it.
# When several files share a stem, the last one in wav_files wins.
id_to_path = {
    stem: candidate
    for stem, candidate in (
        (extract_segment_id_from_path(wav_path), wav_path) for wav_path in wav_files
    )
    if stem in segment_ids
}

# Build the output dataframe in one shot: one row per detection, in the same
# order as `data`, with the matched .wav path (None when no file matched).
# Building from lists avoids growing the frame with pd.concat on every row,
# which is quadratic in the number of detections.
segment_df = pd.DataFrame({
    'segmentID': list(data['segmentID']),
    'Full_path': [id_to_path.get(seg_id) for seg_id in data['segmentID']],
})

# Show only the detections for which an audio file was found.
segment_df[segment_df['Full_path'].notna()]


Unnamed: 0,segmentID,Full_path


In [12]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import birdnames as bn
from pathlib import Path

# Read every standardized detection CSV so they can be merged into one table.
folder_path = "./Detecciones_estandarizadas"
begin_path='/Users/santiagoruiz/Downloads/Validation_1to5'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the merged table (and therefore the seeded sampling done later) reproducible.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in csv_files]
df_concat = pd.concat(dfs, ignore_index=True)

# Number of distinct species and projects in the merged detections.
print(f"Número de especies: {df_concat['speciesName'].nunique()}")
print(f"Número de proyectos: {df_concat['projectName'].nunique()}")


Número de especies: 32
Número de proyectos: 1


## Filtrar especies objetivo

In [13]:
targets_df = pd.read_csv('./especies_objetivo_escuchaton.csv')

# Keep only detections whose species appears in the target-species list.
target_names = targets_df['Clements2024_Scientific_Name']
df_concat = df_concat.loc[df_concat['speciesName'].isin(target_names)]


def _has_enough_clips(species_rows):
    """Return True when a species has at least 50 detections."""
    return len(species_rows) >= 50


# Drop species with fewer than 50 clips, then show the number of clips per species.
df_concat = df_concat.groupby('speciesName').filter(_has_enough_clips)
df_concat['speciesName'].value_counts()

speciesName
Grallaria alleni              28904
Cephalopterus penduliger      20783
Hypopyrrhus pyrohypogaster      926
Glaucidium nubicola             806
Henicorhina negreti             586
Vireo masteri                   430
Grallaria rufocinerea           420
Pyroderus scutatus              356
Odontophorus melanonotus        223
Galbula pastazae                163
Dysithamnus occidentalis        151
Sericossypha albocristata       132
Ammodramus savannarum            82
Name: count, dtype: int64

## Filtrar por puntaje de detección (confidence)

En este paso se seleccionan los clips para verificación siguiendo el protocolo de Navine et al. (2024) de 4 grupos (bins) logarítmicos: los puntajes de detección se convierten a logit y se selecciona un número igual de grabaciones en cada uno de los 4 grupos, de acuerdo con estos criterios:

- El 50 % de los puntajes más bajos se asignará al grupo 1
- El 25 % de los siguientes puntajes más bajos se asignará al grupo 2
- El 12,5 % de los siguientes puntajes más bajos se asignará al grupo 3
- El 12,5 % restante (los puntajes más altos) se asignará al grupo 4


In [14]:

# Target number of clips to sample from each logit group, per species.
clips_por_grupo = 12


def safe_logit(p, eps=1e-6):
    """Return the logit (log-odds) of ``p``, clipped away from 0 and 1.

    ``p`` is converted to a float NumPy array and clipped to the interval
    ``[eps, 1 - eps]`` before taking ``log(p / (1 - p))``, so exact 0 or 1
    scores give finite values instead of +/- infinity.
    """
    clipped = np.clip(np.asarray(p, dtype=float), eps, 1 - eps)
    odds = clipped / (1 - clipped)
    return np.log(odds)

# Quantile edges of the four logit bins: the lowest 50 % of scores, the next
# 25 %, the next 12.5 % and the top 12.5 %; labelled 1 (lowest) to 4 (highest).
quantiles = [0, 0.50, 0.75, 0.875, 1.0]
labels = [1, 2, 3, 4]

# Output folders, created once rather than on every species iteration.
output_folder = "./Detecciones_filtradas"
output_raven = './Tablas_de_seleccion'
os.makedirs(output_folder, exist_ok=True)
os.makedirs(output_raven, exist_ok=True)

# Column layout of the exported selection table. Names listed here that are
# never created (e.g. 'Scientific Name2') are skipped when the table is written.
order = ['Selection', 'View', 'Channel', 'group_id', 'Begin Time (s)',
         'End Time (s)', 'File Offset (s)', 'Common Name', 'Scientific Name',
         'Confidence', 'Scientific Name2', 'Scientific Name3', 'Date', 'Punto',
         'Begin Path', 'Orig_path', 'Orig_start', 'Orig_end', 'verify',
         'song_type', 'reference', 'add2library', 'notes', 'method']

# Detection columns carried over into the selection table.
source_columns = ['speciesName', 'confidence', 'segmentID', 'filePath', 'startTime',
                  'endTime', 'classifiedBy', 'timestamp', 'recorderID']

# Every file under begin_path, listed once (it does not change between species).
# Sorted so that the "first match" picked by find_full_path is deterministic.
all_files = sorted(str(path) for path in Path(begin_path).rglob('*') if path.is_file())
all_basenames = [os.path.basename(file) for file in all_files]

# Scientific name -> common name converter, built once for all species.
converter = bn.Converter(
    from_type="scientific_name",
    to_type="common_name",
    from_authority="ebird"
)


def find_full_path(snippet):
    """Return the first file under begin_path whose base name contains ``snippet``.

    Substring matching is used so that file names carrying an extension or an
    extra prefix still match. Returns None for a missing snippet or when no
    file matches.
    """
    if pd.isna(snippet):
        return None
    snippet = str(snippet)
    for file, basename in zip(all_files, all_basenames):
        if snippet in basename:
            return file
    return None


def _classifier_name(classified_by):
    """Return 'BirdNET' or 'Perch' according to the classifiedBy text, else None."""
    text = str(classified_by).lower()
    if 'birdnet' in text:
        return 'BirdNET'
    if 'perch' in text:
        return 'Perch'
    return None


def _clip_length(classified_by):
    """Return the clip end time in seconds: 3 for BirdNET, 5 for Perch, else None."""
    return {'BirdNET': 3, 'Perch': 5}.get(_classifier_name(classified_by))


def _assign_logit_groups(logits):
    """Bin a Series of logits into the four quantile groups defined above."""
    try:
        return pd.qcut(logits, q=quantiles, labels=labels, duplicates="drop")
    except ValueError:
        # Tied scores collapsed two bin edges, so fewer than four bins remain
        # and the labels no longer fit. Ranking (ties broken by row order)
        # yields unique values and therefore always four bins.
        return pd.qcut(logits.rank(method="first"), q=quantiles, labels=labels)


def _sample_by_logit_group(group, per_group, seed=42):
    """Sample ``per_group`` rows from each logit group, highest group first.

    When a group has fewer rows than requested, all of its rows are kept and
    the shortfall is added to the request for the next (lower) group.
    """
    sampled_rows = []
    extra_needed = 0
    for current_group in labels[::-1]:
        current_rows = group[group['logit_group'] == current_group]
        wanted = per_group + extra_needed
        if len(current_rows) >= wanted:
            sampled_rows.append(current_rows.sample(n=wanted, random_state=seed))
            extra_needed = 0
        else:
            sampled_rows.append(current_rows)
            extra_needed = wanted - len(current_rows)
    return pd.concat(sampled_rows, ignore_index=True)


def _build_raven_table(sampled):
    """Turn the sampled detections into a selection table laid out as ``order``."""
    raven = (
        sampled[source_columns]
        .sort_values(by='confidence', ascending=False)
        .reset_index(drop=True)
    )

    raven['Selection'] = range(1, len(raven) + 1)
    raven['View'] = 'Spectrogram 1'
    raven['Channel'] = 1
    # Text before the first underscore of the segment ID.
    raven['group_id'] = raven['segmentID'].astype(str).str.split('_').str[0]
    raven['Begin Time (s)'] = 0
    raven['End Time (s)'] = raven['classifiedBy'].apply(_clip_length)
    raven['File Offset (s)'] = 0

    # Look up the audio file of each segment under begin_path.
    raven['Begin Path'] = raven['segmentID'].apply(find_full_path)

    raven['verify'] = 'NA'
    raven['song_type'] = 'NA'
    raven['reference'] = 'NA'
    raven['add2library'] = 0
    raven['notes'] = 'NA'
    raven['method'] = raven['classifiedBy'].apply(_classifier_name)
    # First 10 characters of the timestamp (presumably YYYY-MM-DD — TODO confirm).
    raven['Date'] = raven['timestamp'].apply(lambda x: str(x)[:10] if pd.notna(x) else None)

    # Rename to the exact names listed in `order`; a column whose name does not
    # match is silently dropped by the final reindex.
    raven = raven.rename(columns={
        'speciesName': 'Scientific Name',
        'confidence': 'Confidence',
        'filePath': 'Orig_path',
        'recorderID': 'Punto',
        'startTime': 'Orig_start',
        'endTime': 'Orig_end',
    })

    # Convert scientific names to common names.
    raven['Common Name'] = converter.convert(raven['Scientific Name'])

    existing_columns = [col for col in order if col in raven.columns]
    return raven.reindex(columns=existing_columns)


for species, group in df_concat.groupby('speciesName'):
    group = group.copy()
    group['logits'] = safe_logit(group['confidence'])
    group['logit_group'] = _assign_logit_groups(group['logits'])

    # Filtered detections for this species (logit_group is kept, logits dropped).
    sampled = _sample_by_logit_group(group, clips_por_grupo).drop(columns='logits')
    output_path = os.path.join(output_folder, f"{species}.csv")
    sampled.to_csv(output_path, index=False)

    # Tab-separated selection table for this species.
    raven = _build_raven_table(sampled)
    output_raven_path = os.path.join(output_raven, f"{species}_merged.txt")
    raven.to_csv(output_raven_path, index=False, sep='\t')