# Required Libraries

In [1]:
import copy
import pandas as pd
import numpy as np
import os.path
from IPython.display import Audio
# import pafy
import subprocess as sp

# Go get the data
The goal of this section is to serve up recordings. Those can be in video format or audio format - it doesn't really matter. The output is simply some manner of audio file we can tackle with `ffmpeg`

In [2]:
!which ffmpeg
ffmpeg = '/usr/local/bin/ffmpeg'

/usr/local/bin/ffmpeg


# Convert the audio data to .wav audio data
We need our audio files to be in 16Khz .wav format. This section is about taking the inputs in whatever format they are currently found and standardizing them to .wav.

In [3]:
def convert_to_wav(label, uri, output_dir):
    """ Given a label, a uniform resource id, and an output directory
        relative to the current function,
        convert the file from its existing format to
        a .wav file at 16000 Khz like we need for our models
    """
    input_filepath = uri
    output_filepath = os.path.join(output_dir, label, '.wav')
    
#     ffmpeg_path = sp.Popen('which ffmpeg', stdout=sp.PIPE, stderr=sp.PIPE)
    ffmpeg_path = sp.run(['which', 'ffmpeg'],
                         capture_output=True,
                         text=True).stdout.replace('\n','')
    audio_codec = 'pcm_s16le'
    
    
    # This is the command we use to convert our mp3s to wav elsewhere
    # ffmpeg  -hide_banner
    #         -nostats
    #         -loglevel fatal
    #         -nostdin
    #         -i ./ferris-bueller.mp3
    #         -acodec pcm_s16le
    #         -ac 1
    #         -ar 16000
    #         -y ./ferris-bueller.wav
    
    # Define the args as a dict to make them easier to deal with
    audio_dl_args = [ffmpeg_path,
    #     '-ss', str(ts_start),    # The beginning of the trim window if any
    #     '-t', str(duration),     # Specify the duration of the output
        '-hide_banner',
        '-nostats',
        '-loglevel', 'fatal',
        '-nostdin',
        '-i', input_filepath,    # Specify the input video URL
        '-vn',                   # Suppress the video stream
        '-ac', '1',              # Set the number of channels
        '-acodec', audio_codec,  # Specify the output encoding
        '-ar', '16000',          # Specify the audio sample rate
        '-y', output_filepath]
    
    proc = sp.Popen(audio_dl_args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        print(stderr)
    else:
        print(f'Exported converted audio to {output_filepath}')

# Run the .wav data through the model
We need to pipe our .wav files through the predictions models to generate `.rttm` outputs.

# Processing the model's prediction outputs

The output of our model's predictions is an `.rttm` file. We want to load that into a DataFrame to be able to derive the analyses we need for our visualizations and reporting. There are several steps in this process.
1. Get a [pandas] DataFrame from the RTTM
1. Transform the DataFrame. This can be either 
  1. a very simple segment-indexed DataFrame that filters out `SPEECH` segments and only leaves `FEM`, `MAL`, and `CHI` segments, or
  1. a time-indexed DataFrame that retains `SPEECH`, `MAL`, `FEM`, `CHI`, etc. labels as one-hot encoded features. (The current version is indexed at the millisecond level.)
1. Generate a CSV from the transformed data.

## Defining the functions we're going to need for this pipeline

In [4]:
# Step 1: Get a pandas DF from the RTTM
def df_from_rttm(rttm):
    """ Given an RTTM file, parses it into a Pandas DataFrame.
    """
    df = pd.read_csv(rttm,
                     sep=' ',
                     names=['task','inputFile','one','start','duration',
                     'NA_1','NA_2','class','NA_3', 'NA_4'])
    return df

In [5]:
# Step 2A: Generate an utterance-indexed pandas DF
def rttm_to_utterance_indexed_speaker_activity(df, outfile=None):
    """ Given an RTTM input file, generate a dataframe structured
        to support a visualization of type 'Speaker Activity' and optionally
        export to a csv located at {outfile}

        df = Pandas DataFrame containing a standard .rttm file
        outfile = destination for exported CSV (path, filename, extension)
    """

    # Check whether an outfile has been defined
    if outfile is not None:
        export = True

    # Drop the columns we don't care about from a base RTTM
    vizframe = copy.deepcopy(df) \
        .drop(
        columns=[
            'task',
            'inputFile',
            'one',
            'NA_1',
            'NA_2',
            'NA_3',
            'NA_4'])

    # Rename columns for our viz's purposes
    vizframe = vizframe.rename(columns={
        'start': 'START',
        'duration': 'DUR',
        'class': 'LABEL'
    })

    # Remap the model classes for this viz's purposes
    vizframe['LABEL'] = vizframe['LABEL'].map({
        'KCHI': 'CHILD',
        'CHI': 'CHILD',
        'FEM': 'ADULT',
        'MAL': 'ADULT'
    })

    # Filter the dataframe to just the 'clean' (non-'SPEECH') classes
    vizframe = vizframe[vizframe['LABEL'].isin(['CHILD', 'ADULT'])]
    vizframe['LABEL_NUM'] = vizframe['LABEL'] \
        .apply(lambda x: 1 if x == 'CHILD'
               else (-1 if x == 'ADULT' else NaN))
    vizframe['DUR_TRANS'] = vizframe['LABEL_NUM'] * vizframe['DUR']
    vizframe['COUNT'] = 1

    if export:
        vizframe.to_csv(outfile)

    return vizframe

In [6]:
# Step 2B: Generate a time-indexed (curr. millisecond-indexed) DF

# We need to know the latest timestamp in a given label-subsetted
# DF in order to fill in the appropriate timestamps

def get_latest_timestamp_needed(input_df):
    """ Given an RTTM-derived dataframe,
        extract the last timestamp we'll need
        as a scalar
    """
    last_row = input_df[input_df['start']==input_df['start'].max()][['start', 'duration']]
    last_row.reset_index(drop=True, inplace=True)
    latest_timestamp = last_row.at[0,'start'] + last_row.at[0,'duration']
    return round(latest_timestamp, 1)


# We want a time-indexed range. In this case, we're working with a millisecond
# level of resolution b/c that's how our data comes out in the RTTM. Collapsing
# to a less-granular resolution should be done in a later step.

def build_millisecond_range(start, duration, value='', valname='value', verbose=False):
    """ Given a start time, and end time, and a value,
        create a dataframe with a timedelta index containing
        that value for the range between the endpoints
    """
    
    # Turn the endpoints into a millisecond-denominated target
    low_end = pd.to_timedelta(round(start, 1), unit='milliseconds')
    span = pd.to_timedelta(round(duration, 1), unit='milliseconds')
    
    if verbose:
        print(f"Low end: {low_end}\nDuration: {span}\n")
    
    # Create a range between them
    rng = pd.timedelta_range(low_end, low_end+span, freq='L')

    # Turn that series into a DataFrame and rename the index for clarity
    df = pd.Series(value, index=rng).to_frame(name=valname)
    df.index.name='milliseconds'
    if verbose:
        print(df.head(3))
    return df


# Given an .rttm-derived DF of the format defined above, we can generate
# millisecond-indexed DF with one-hot encoded labels for our classes of interest

def build_millisecond_indexed_df(input_df, label_list=[], verbose=True):
    """ Given an RTTM-generated DataFrame, generate a pivoted DF
        containing all of the labels of interest, one-hot encoded
    """
    
    # To generalize, we're taking in a second-denominated timestamp from our
    # RTTM. We need to multiply by 1000 to get our milliseconds
    input_df[['start', 'duration']] = input_df[['start', 'duration']]*1000
    
    max_seconds_needed = get_latest_timestamp_needed(input_df)
    if verbose:
        print(f'max_seconds_needed is of type {type(max_seconds_needed)} and equal to {max_seconds_needed}')
    outer_df = build_millisecond_range(
                0,
                max_seconds_needed,
                np.nan,
                'base',
                verbose=True)
    
    if verbose:
        print(f'The outer_df frame will contain {len(outer_df)} records.')
    
    # Loop through labels, subsetting the original DF so
    # we can merge it back into the main outer DF
    for label in label_list:
        print(f'Processing label: {label}\n')

        if label not in input_df['class'].unique():
            print(f'Label {label} not found in this dataset')
            label_base_df = pd.DataFrame(columns=[label])
            continue
        
        # Gotta avoid errors from accidentally manipulating original DFs
        temp_df = copy.deepcopy(input_df)

        # Generate a temp_df that contains only records for the label of interest
        temp_df = input_df[['start','duration','class']][input_df['class']==label]

        if verbose:
            print(f'The temp_df subset for label {label} contains {len(temp_df)} rows')
            print(temp_df.head(5))
        
        # The subsetted DF retains the original index unless you reset it
        temp_df.reset_index(drop=True, inplace=True)

        if verbose:
            print(f'The temp_df frame is as follows:\n{temp_df}')

        # Creating the base DF for this label - ranges from 0 to the earliest record
        label_base_df = build_millisecond_range(0, temp_df['start'].min(),
                                     np.nan, label, verbose=verbose)
        if verbose:
            # The head() will always be the same, so we need to look at the tail() to verify
            print(f'\n>>> The last few rows of the label_base_df for label {label} are:\n{label_base_df.tail()}')
            print(f"\n>>> Base DF size for label {label}: {len(temp_df)}\n")

        for i in range(1, len(temp_df)):
            label_base_df = label_base_df.append(
                build_millisecond_range(
                    temp_df.loc[i]['start'],
                    temp_df.loc[i]['duration'],
                    str(label), str(label),
                    verbose=verbose
                ))

            if verbose:
                print(f'\n>>> Base DF size after {i} rounds: {len(label_base_df)}')
                print(f'\n>>> The head:\n{label_base_df.head(10)}\n>>> The tail:\n{label_base_df.tail(10)}')
                print(f'\n>>> A few of its contents:\n{label_base_df[~label_base_df[label].isna()].head(5)}')

        # Creating a placeholder for the update call
        outer_df[label] = np.nan
        
        # When attempting the update method:
        print(f'Attempting update with DF from label {label}')
        outer_df.update(label_base_df, overwrite=True)
        
        if verbose:
            try:
                print(outer_df[~outer_df[label].isna()].head())
            except:
                print(outer_df)
            
        # When attempting the merge method:
#         outer_df = pd.merge(left=outer_df, left_index=True,
#                             right=label_base_df, right_index=True,
#                             how='inner', suffixes=('_base',''),
#                             indicator=True, validate='1:1')
#         if verbose:
#             print(outer_df[~outer_df[f'{label}_y'].isna()].head(5))

    return outer_df

## Implementing the pipeline
Here we actually go ahead and use the functions defined above to compose an actual pipeline of transformation for our files.

In [57]:
oftf_dict = {
    'oftf-aj':'OneFishTwoFish_AnnaJacobson.wav',
    'oftf-ts':'OneFishTwoFish_TimothySlade.wav',
    'oftf-ds':'OneFishTwoFish_DavidSlade.wav',
}
for k, v in oftf_dict.items():
    convert_to_wav(k, v, './')

b''
b''
b''


total 805680
drwxr-xr-x@ 65 tsslade  staff   2.0K Jun 28 15:10 [34m.[m[m
drwxr-xr-x@ 12 tsslade  staff   384B Jun 15 17:53 [34m..[m[m
drwxr-xr-x@ 10 tsslade  staff   320B Jun 19 17:27 [34m.boneyard[m[m
drwxr-xr-x@ 24 tsslade  staff   768B Jun 27 09:53 [34m.git[m[m
-rw-r--r--@  1 tsslade  staff    77B Jun 22 12:23 .gitignore
drwxr-xr-x@  7 tsslade  staff   224B Jun 27 10:07 [34m.ipynb_checkpoints[m[m
drwxr-xr-x@  6 tsslade  staff   192B Jun 15 17:53 [34m.secrets[m[m
drwxr-xr-x@  2 tsslade  staff    64B May 30 09:48 [34m.test[m[m
-rw-r--r--@  1 tsslade  staff   688K Jun 14 21:26 AR31_021108a_dampened.mp3
-rw-r--r--@  1 tsslade  staff   6.7M Jun 14 21:23 AR31_021108a_dampened.wav
-rw-r--r--@  1 tsslade  staff   2.6K Jun 22 12:23 AudioSetReference.md
drwxr-xr-x@  5 tsslade  staff   160B Jun 22 12:23 [34mDiViMe[m[m
-rw-r--r--@  1 tsslade  staff    16M Jun 27 10:03 OneFishTwoFish_AnnaJacobson.wav
-rw-r--r--@  1 tsslade  staff    20M Jun 27 10:04 OneFishT

In [54]:
ffmpeg = sp.run(['which', 'ffmpeg'], capture_output=True, text=True).stdout.replace('\n','')

In [52]:
ffmpeg = ffmpeg.replace('\n','')

In [55]:
ffmpeg

'/usr/local/bin/ffmpeg'

+ https://docs.python.org/3/library/subprocess.html