# If running in SageMaker, set up Python environment / install dependencies

In [None]:
import os
import sys
import importlib.util

def is_package_installed(package_name):
    """
    Reports whether a package can be imported in the current Python environment.

    Args:
        package_name (str): The import name of the package (e.g. "librosa").
            Dotted names (e.g. "package.submodule") are accepted as well.

    Returns:
        bool: True if the package is already loaded in sys.modules or an import
            spec can be found for it, False otherwise.
    """
    # Anything that has already been imported is installed by definition.
    if package_name in sys.modules:
        return True
    try:
        return importlib.util.find_spec(package_name) is not None
    except ModuleNotFoundError:
        # For a dotted name, find_spec() imports the parent package first and
        # raises (instead of returning None) when that parent is missing.
        return False

running_on_aws = True if os.environ.get("AWS_DEFAULT_REGION") else False
if not running_on_aws:
    print("Not running on AWS -- Assume that the Python environment is already set up.")
else:
    print("Running on AWS -- Making sure needed packages are installed...")
    if is_package_installed("soundfile"):
        print("  Soundfile already installed.")
    else:
        print("\n\nInstalling soundfile...")
        #%pip install pysoundfile
        %conda install -c conda-forge pysoundfile
    if is_package_installed("librosa"):
        print("  Librosa already installed.")
    else:
        print("\n\nInstalling librosa (this may take a while)...")
        #%pip install librosa
        %conda install -c conda-forge librosa
    if is_package_installed("audiot"):
        print("  AudioT package already installed.")
    else:
        print("\n\nInstalling AudioT package in development / editable mode...")
        # Use the relative path to the folder containing setup.py, which is the parent folder ../ in this case
        %pip install -e ../

### <font color='red'>**If any new packages were installed above, restart the kernel before running the remainder of this notebook**</font>

# Imports

In [None]:
# Standard Python
from datetime import datetime, timedelta
import io
import random
import re
import pickle
import logging
import logging.handlers
from pathlib import Path

# ML / numeric / plotting
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# AWS
import boto3

# AudioT
from audiot.audio_signal import AudioSignal
from audiot.dataset import Dataset
from audiot.signal_processing.functions import calc_signal_strength_features, compute_pitch_upsweep_downsweep, compute_chirp_features_for_segments
from audiot.signal_processing.auto_segmenter import AutoSegmenter
from audiot.signal_processing.pitch_tracker import PitchTracker

# Show matplotlib plots inline in Jupyter notebooks without having to call show()
%matplotlib inline

# Configure logging

This code will likely take a long time to process the full data for a flock (I estimate about 20 hours to process 50 days worth of data).  Your web login to SageMaker will probably time out if you're not present / showing activity during that entire time.  When it times out, the link from stdout to the output visible in this notebook will be broken, so it will look like nothing is happening even though it is actually still running.  To provide a way to still be able to monitor the processing progress, this cell sets up a logger that outputs to both stdout and to a log file.  That way you can monitor the contents of the log file to check if things are still running after logging back in to SageMaker's web interface.

In [None]:
# Set up a logger that sends every message to both a rotating log file and stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Detach AND close any handlers left over from a previous run of this cell.  Just clearing the
# list would avoid duplicate output, but would leave the old log file handle open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

log_file = Path("analyze_chirps.log")
# Check for output from an earlier run BEFORE creating the file handler.  The handler opens (and
# therefore creates) the log file as soon as it is constructed, so checking afterwards would
# always find a file and would rotate an empty log on a fresh run.
previous_log_exists = log_file.exists() and log_file.stat().st_size > 0

log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=2**20, backupCount=3)
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler(stream=sys.stdout)
console_handler.setFormatter(log_formatter)
logger.addHandler(console_handler)

# Start each run in a fresh log file, keeping the previous run's output as a numbered backup.
if previous_log_exists:
    file_handler.doRollover()
logger.info("New log file started.")

# Define utility functions

In [None]:
def load_audio_signal_from_s3(bucket, key):
    """
    Downloads an audio object from S3 into memory and wraps it in an AudioSignal.

    The object's bytes are streamed into an in-memory buffer, so nothing is
    written to disk.

    Args:
        bucket (S3.Bucket): The Bucket object to download from.
        key (str): The key (path within the bucket) of the audio file object
            to read in.

    Returns:
        The result of AudioSignal.from_bytes_io() for the downloaded bytes.
    """
    in_memory_file = io.BytesIO()
    bucket.download_fileobj(key, in_memory_file)
    loaded_signal = AudioSignal.from_bytes_io(in_memory_file)
    return loaded_signal

def save_results(pickle_file):
    """
    Saves out results so far to the specified pickle file path.

    The results are first written to a temporary file next to the target
    ("<pickle_file>.tmp") and then swapped into place with os.replace().  That
    way, if the kernel is shut down while a save is in progress, the previously
    saved partial results are not left truncated under the real file name.

    Note that this function assumes that all the module-level variables below are
    defined, and thus will throw errors if it is called before those variables
    have actually been defined.

    Args:
        pickle_file (str or path-like): Path of the pickle file to write.
    """
    import os  # Local import so this function doesn't depend on which other cells have been run

    logger.info(f"Saving results so far to {pickle_file}.")
    # Put all the variables we want to save into a dictionary
    results_dict = {
        # Dataset location and information
        "house_number": house_number,
        "mic_number": mic_number,
        "bucket_name": bucket_name,
        "flock_prefix": flock_prefix,
        "recorder_number": recorder_number,
        "dataset_prefix": dataset_prefix,
        "flock_start_date": flock_start_date,
        "flock_end_date": flock_end_date,
        # Time and frequency axes for results
        "flock_hours": flock_hours,
        "flock_minutes": flock_minutes,
        "frequency_axis": frequency_axis,
        # Minute resolution metrics
        "recording_duration_by_minute": recording_duration_by_minute,
        "segment_count_by_minute": segment_count_by_minute,
        "total_segment_duration_by_minute": total_segment_duration_by_minute,
        "chirp_count_by_minute": chirp_count_by_minute,
        "total_chirp_duration_by_minute": total_chirp_duration_by_minute,
        # Hour resolution metrics
        "chirp_max_frequency_histogram_by_hour": chirp_max_frequency_histogram_by_hour,
        "chirp_min_frequency_histogram_by_hour": chirp_min_frequency_histogram_by_hour,
        "chirp_median_frequency_histogram_by_hour": chirp_median_frequency_histogram_by_hour,
        "chirp_upsweep_histogram_by_hour": chirp_upsweep_histogram_by_hour,
        "chirp_downsweep_histogram_by_hour": chirp_downsweep_histogram_by_hour,
        "chirp_frequency_histogram_by_hour": chirp_frequency_histogram_by_hour,
        # Info on where it currently is in the processing
        "hour_index": hour_index,
        # Info on how long it took to process
        "processing_start_time": processing_start_time,
        "processing_end_time": processing_end_time,
    }
    # Write the dictionary to a temporary file, then move it over the real file in one step
    temp_file = f"{os.fspath(pickle_file)}.tmp"
    with open(temp_file, "wb") as file_out:
        pickle.dump(results_dict, file_out)
    os.replace(temp_file, pickle_file)

# Parameters / dataset info / S3 connection

Update the `house_number` and `mic_number` variables below to the data you want to run on.  (If you want to run on data from a different flock, you'll need to update the `bucket_name`, `flock_prefix`, `flock_start_date`, and `flock_end_date` as well.)
    
The `debugging` variable below is set to `True` for test runs, and will limit the amount of data that actually gets processed.  It should be set to `False` when you're ready to actually process the full flock's data.  Keep in mind that it will likely take a long time (~20 hours for 50 days' worth of data) and that the output below will stop working if the web interface times out, even though the code is still running and processing the data.  At that point, you'll need to check the log file if you want to see the output and be able to monitor its progress in processing the data.  Also, you might not be able to use other notebooks while it's running since the Python kernel will be tied up.  (Maybe you still could use other notebooks if you use different kernels?  I'm not sure on that.)

If you need to interrupt / stop the processing at some point (especially after the web interface has timed out), then the stop button above might not work either.  But you can use `Kernel --> Shut Down All Kernels...` from the menu above to kill it (and any other running kernels).  Then you should be able to run things in this and other notebooks again once the kernel has restarted.  The partial results from the data that did get processed should still be available in the output pickle file.  Currently, the code below is not set up to resume processing where it left off though.

In [None]:
# Set a flag indicating if we are debugging / testing or not.  When set to true, it limits the amount of data that will be processed.
debugging = True
max_recordings_to_process_per_hour_if_debugging = 2

# Dataset location and information
house_number = 3  # Should be 0, 1, 2, or 3 for TRF0 through TRF3
mic_number = 0
bucket_name = "audiot-disk03"
flock_prefix = f"TRF{house_number}_2020-12/"
recorder_number = mic_number // 4
dataset_prefix = f"{flock_prefix}trf{house_number}-recorder-{recorder_number}/"

flock_start_date = datetime(2020, 12, 14)  # The date the birds were placed
flock_end_date = datetime(2021, 2, 3)      # The date the birds were caught


def _date_range_excluding_end(start, end, step):
    """
    Builds a DatetimeIndex running from start (inclusive) up to end (exclusive) in increments of step.

    This does the same job as date_range(..., closed="left"), but without the `closed` argument
    (removed in pandas 2.0) or its replacement `inclusive` (added in pandas 1.4), so it works on
    either side of that API change.

    Args:
        start (datetime): First timestamp of the range.
        end (datetime): Upper bound of the range; dropped from the result if the range lands on it.
        step (timedelta): Spacing between consecutive timestamps.

    Returns:
        pandas.DatetimeIndex: The timestamps start, start + step, ... that are before end.
    """
    timestamps = pd.date_range(start, end, freq=step)
    if len(timestamps) > 0 and timestamps[-1] == end:
        timestamps = timestamps[:-1]
    return timestamps


# Construct a list of each hour (and each minute) of time during the flock.
# (Add a day to the end date to go through the end of that day.)
# The step sizes are given as timedeltas rather than alias strings, since the "H" alias is deprecated in newer pandas.
flock_range_end = flock_end_date + timedelta(days=1)
flock_hours = _date_range_excluding_end(flock_start_date, flock_range_end, timedelta(hours=1))
flock_minutes = _date_range_excluding_end(flock_start_date, flock_range_end, timedelta(minutes=1))
if debugging:
    # If we're debugging / testing, limit the amount of data to process
    flock_hours = flock_hours[500:510]
n_hours = len(flock_hours)
n_minutes = len(flock_minutes)

# Connect to S3
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)

# Get information about the frequency axis by loading and computing features for the first file in the dataset.
# We need to know what the frequency axis looks like so that we can pre-allocate arrays to store histograms with
# bins that match up with the frequency axis.
# We assume that the frequency axis is the same for all other files (this is true if they all have the same
# sampling rate, which they should).
first_file = next(iter(bucket.objects.filter(Prefix=dataset_prefix).limit(1)), None)
if first_file is None:
    # Fail with an explanation rather than an opaque IndexError when the prefix matches nothing
    raise RuntimeError(
        f"No objects found in bucket '{bucket_name}' under prefix '{dataset_prefix}'.  "
        "Check house_number, mic_number, bucket_name, and flock_prefix."
    )
audio_signal = load_audio_signal_from_s3(bucket, first_file.key)
signal_strength_features = calc_signal_strength_features(audio_signal)
frequency_axis = signal_strength_features.frequency_axis
n_frequency_bins = len(frequency_axis)

# Get signal processing objects

In [None]:
# Segmenter in AutoSegmenter's default configuration; used in the processing loop to find
# segments in each recording's signal strength features.
segmenter = AutoSegmenter.get_default_segmenter()
# Pitch tracker (constructed with no arguments); applied in the processing loop to each
# segment that was classified as a chirp.
pitch_tracker = PitchTracker()

# Pre-allocate variables to store the results

In [None]:
# Per-minute metrics: one entry for every minute in flock_minutes
recording_duration_by_minute = np.zeros(n_minutes, dtype=float)
segment_count_by_minute = np.zeros(n_minutes, dtype=int)
total_segment_duration_by_minute = np.zeros(n_minutes, dtype=float)
chirp_count_by_minute = np.zeros(n_minutes, dtype=int)
total_chirp_duration_by_minute = np.zeros(n_minutes, dtype=float)

# Per-hour histograms: one row per frequency bin, one column per hour in flock_hours
chirp_max_frequency_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)
chirp_min_frequency_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)
chirp_median_frequency_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)
chirp_upsweep_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)
chirp_downsweep_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)
chirp_frequency_histogram_by_hour = np.zeros(shape=(n_frequency_bins, n_hours), dtype=int)

# Process all the data and save results to disk

Currently, the code below does not run an actual chirp detector, but just assigns classifications randomly or calls everything a chirp.  Actual chirp detector code needs to be plugged in to the section marked (replacing the dummy chirp detection code).

In [None]:
# Walks through every hour in flock_hours, processes that hour's recordings from S3, accumulates the
# per-minute and per-hour metrics into the pre-allocated arrays, and periodically saves the results.

# Define how often output should occur (both printing/logging progress and saving out partial results)
hours_to_process_per_output = 5

# Variables to track processing time
processing_start_time = datetime.now()
processing_end_time = None  # Define this now so that results can be saved out along the way before it has actually finished
logger.info("Starting processing of audio files.")

# Build a pickle file name to output results to.  Make it unique based on the house/mic numbers and when this was executed to prevent overwriting previous results.
timestamp_str = processing_start_time.strftime("%Y-%m-%d_%H.%M.%S")
pickle_file = f"TRF{house_number}_mic{mic_number:02d}_chirp_results_{timestamp_str}.pickle"
if debugging:
    pickle_file = "DEBUG_" + pickle_file  # Name outputs from debugging runs differently

# === Process the data =======================================================================================
for hour_index, hour in enumerate(flock_hours):
    date_str = hour.strftime("%Y-%m-%d")
    hour_str = hour.strftime("%H")
    recordings_for_hour = bucket.objects.filter(Prefix=f"{dataset_prefix}{date_str}/{hour_str}/TRF{house_number}_mic{mic_number:02d}")

    # If debugging, limit the number of files to process per hour
    if debugging:
        recordings_for_hour = list(recordings_for_hour)[:max_recordings_to_process_per_hour_if_debugging]

    # Process the recordings for each hour
    for recording in recordings_for_hour:
        # The mic number parsed from the file name goes into its own variable.  Unpacking it into
        # `mic_number` would overwrite the module-level parameter that the S3 prefix above and
        # save_results() rely on.
        dataset_tag, file_datetime, file_mic_number = Dataset.parse_file_path(recording.key)
        minute_index = flock_minutes.get_loc(file_datetime)

        # Load the file and compute features
        # NOTE(review): the per-minute metrics are assigned (not accumulated), which assumes at most
        # one recording per minute for this mic -- confirm.
        audio_signal = load_audio_signal_from_s3(bucket, recording.key)
        recording_duration_by_minute[minute_index] = audio_signal.duration
        signal_strength_features = calc_signal_strength_features(audio_signal)

        # Extract segment information
        segment_index_list = segmenter.segment_signal_strength_features(signal_strength_features)
        if len(segment_index_list) == 0:
            continue  # No further processing to be done if no segments detected
        segment_time_list = segmenter.convert_segments_from_indexes_to_seconds(segment_index_list, signal_strength_features)
        segment_count_by_minute[minute_index] = len(segment_index_list)
        total_segment_duration_by_minute[minute_index] = sum(seg_end - seg_start for seg_start, seg_end, _ in segment_time_list)

        # Extract chirp information
        chirp_features = compute_chirp_features_for_segments(signal_strength_features, segment_index_list)

        # ============================================================================================================================================
        # TODO: The code delimited by the above and below lines doesn't actually do chirp detection.  Instead, it either randomly classifies
        # segments as chirps, or classifies them all as chirps (depending on which line is uncommented).  Replace this code with your actual chirp
        # detector.  The expected output is just a list of strings (corresponding to the list of segments) that are either "chirp" or "non-chirp",
        # stored in the chirp_classifications variable.

        #chirp_classifications = random.choices(["chirp", "non-chirp"], k=len(chirp_features))
        chirp_classifications = ["chirp"] * len(chirp_features)
        # ============================================================================================================================================

        # Separate out the info for the segments classified as chirps
        chirp_segment_index_list = [segment for segment, classification in zip(segment_index_list, chirp_classifications) if classification == "chirp"]
        if len(chirp_segment_index_list) == 0:
            continue  # No further processing to be done if no chirps detected
        chirp_segment_time_list = [segment for segment, classification in zip(segment_time_list, chirp_classifications) if classification == "chirp"]
        chirp_count_by_minute[minute_index] = len(chirp_segment_index_list)
        # Store into this minute's slot.  Assigning to the bare name would replace the whole
        # pre-allocated array with a single number.
        total_chirp_duration_by_minute[minute_index] = sum(seg_end - seg_start for seg_start, seg_end, _ in chirp_segment_time_list)

        # Iterate through the segments classified as chirps and apply pitch tracking
        for segment_start_idx, segment_end_idx, segment_signal_strength in chirp_segment_index_list:
            segment_features = signal_strength_features.features[:, segment_start_idx:segment_end_idx]
            pitch_indexes, peak_strength = pitch_tracker.track_pitch_across_segment(segment_features)
            max_pitch_index = np.max(pitch_indexes)
            min_pitch_index = np.min(pitch_indexes)
            median_pitch_index = int(np.median(pitch_indexes))
            total_upsweep, total_downsweep = compute_pitch_upsweep_downsweep(pitch_indexes)

            # Add this chirp's info into the hour metric tallies
            chirp_max_frequency_histogram_by_hour[max_pitch_index, hour_index] += 1
            chirp_min_frequency_histogram_by_hour[min_pitch_index, hour_index] += 1
            chirp_median_frequency_histogram_by_hour[median_pitch_index, hour_index] += 1
            # Sweep totals beyond the last bin are clamped into the last bin
            chirp_upsweep_histogram_by_hour[min(total_upsweep, n_frequency_bins-1), hour_index] += 1
            chirp_downsweep_histogram_by_hour[min(total_downsweep, n_frequency_bins-1), hour_index] += 1
            # Tally up every tracked pitch across the full duration of the chirp and add them to this histogram.
            idx_values, idx_counts = np.unique(pitch_indexes, return_counts=True)
            chirp_frequency_histogram_by_hour[idx_values, hour_index] += idx_counts

    # Periodically print the progress, estimate completion time, and save results so far.
    if hour_index % hours_to_process_per_output == hours_to_process_per_output - 1:
        save_results(pickle_file)
        current_time = datetime.now()
        duration_per_hour = (current_time - processing_start_time) / (hour_index + 1)
        estimated_remaining_duration = (n_hours - hour_index - 1) * duration_per_hour
        logger.info(f"Finished processing {hour_index+1} / {n_hours} hours.  Estimated time remaining = {estimated_remaining_duration}.")

# Save out the final results
processing_end_time = datetime.now()
save_results(pickle_file)
logger.info(f"Finished processing all files.  Total processing duration = {processing_end_time - processing_start_time}.")

# Visualize some of the results

### Plot minute metrics

In [None]:
# Figure size shared by the plots in this notebook (reused by the hour-metric plots)
figure_size = [30, 5]

# Overlay the per-minute segment and chirp counts on a single set of axes
plt.figure(figsize=figure_size)
plt.plot(flock_minutes, segment_count_by_minute, label="segment_count")
plt.plot(flock_minutes, chirp_count_by_minute, label="chirp_count")
plt.legend()

### Plot hour metrics

In [None]:
def _plot_hourly_histogram(histogram, title, halve_y_range=False):
    """
    Draws one by-hour histogram array as an image in a new figure.

    The image's x axis runs from 0 to n_hours/24 (labelled "Day") and its y axis
    spans the frequency axis (labelled "Frequency (Hz)").

    Args:
        histogram (numpy.ndarray): One of the *_histogram_by_hour arrays.
        title (str): Title to put on the figure.
        halve_y_range (bool): If True, only show the lower half of the frequency
            range on the y axis.
    """
    plt.figure(figsize=figure_size)
    plt.imshow(histogram, aspect="auto", origin="lower", interpolation="none", extent=[0, n_hours/24, frequency_axis[0], frequency_axis[-1]])
    plt.title(title)
    plt.xlabel("Day")
    plt.ylabel("Frequency (Hz)")
    if halve_y_range:
        plt.ylim([frequency_axis[0], frequency_axis[-1]/2])
    plt.colorbar()


# One figure per hour-resolution metric, in the same order as they were accumulated
_plot_hourly_histogram(chirp_frequency_histogram_by_hour, "Frequency histograms of all chirps, by hour")
_plot_hourly_histogram(chirp_max_frequency_histogram_by_hour, "Histogram of maximum frequencies of chirps, by hour")
_plot_hourly_histogram(chirp_min_frequency_histogram_by_hour, "Histogram of minimum frequencies of chirps, by hour")
_plot_hourly_histogram(chirp_median_frequency_histogram_by_hour, "Histogram of median frequencies of chirps, by hour")
# The sweep histograms are shown over only the lower half of the frequency range
_plot_hourly_histogram(chirp_upsweep_histogram_by_hour, "Histogram of chirp upsweep, by hour", halve_y_range=True)
_plot_hourly_histogram(chirp_downsweep_histogram_by_hour, "Histogram of chirp downsweep, by hour", halve_y_range=True)