In [5]:
import copy
import pandas as pd
import numpy as np

## Header just to fold

In [1]:
def rttm_to_utterance_indexed_speaker_activity(df, outfile=None):
    """ Given an RTTM input file, generate a dataframe structured
        to support a visualization of type 'Speaker Activity' and optionally
        export to a csv located at {outfile}

        df = Pandas DataFrame containing a standard .rttm file
        outfile = destination for exported CSV (path, filename, extension)
    """

    # Check whether an outfile has been defined
    if outfile is not None:
        export = True

    # Drop the columns we don't care about from a base RTTM
    vizframe = copy.deepcopy(df) \
        .drop(
        columns=[
            'task',
            'inputFile',
            'one',
            'NA_1',
            'NA_2',
            'NA_3',
            'NA_4'])

    # Rename columns for our viz's purposes
    vizframe = vizframe.rename(columns={
        'start': 'START',
        'duration': 'DUR',
        'class': 'LABEL'
    })

    # Remap the model classes for this viz's purposes
    vizframe['LABEL'] = vizframe['LABEL'].map({
        'KCHI': 'CHILD',
        'CHI': 'CHILD',
        'FEM': 'ADULT',
        'MAL': 'ADULT'
    })

    # Filter the dataframe to just the 'clean' (non-'SPEECH') classes
    vizframe = vizframe[vizframe['LABEL'].isin(['CHILD', 'ADULT'])]
    vizframe['LABEL_NUM'] = vizframe['LABEL'] \
        .apply(lambda x: 1 if x == 'CHILD'
               else (-1 if x == 'ADULT' else NaN))
    vizframe['DUR_TRANS'] = vizframe['LABEL_NUM'] * vizframe['DUR']
    vizframe['COUNT'] = 1

    if export:
        vizframe.to_csv(outfile)

    return vizframe

In [2]:
def df_from_rttm(rttm):
    """ Given an RTTM file, parses it into a Pandas DataFrame.
    """
    df = pd.read_csv(rttm,
                     sep=' ',
                     names=['task','inputFile','one','start','duration',
                     'NA_1','NA_2','class','NA_3', 'NA_4'])
    return df

In [3]:
def get_latest_timestamp_needed(input_df):
    """ Given an RTTM-derived dataframe,
        extract the last timestamp we'll need
        as a scalar
    """
    last_row = input_df[input_df['start']==input_df['start'].max()][['start', 'duration']]
    last_row.reset_index(drop=True, inplace=True)
    latest_timestamp = last_row.at[0,'start'] + last_row.at[0,'duration']
    return round(latest_timestamp, 1)

## Pulling in our data to work with
We have an RTTM for a NY video. We need to load that up and convert it a DataFrame

In [6]:
ny_7759 = df_from_rttm('./ny_7759.rttm')
ny_7759

Unnamed: 0,task,inputFile,one,start,duration,NA_1,NA_2,class,NA_3,NA_4
0,SPEAKER,ny_7759,1,6.038,5.191,,,SPEECH,,
1,SPEAKER,ny_7759,1,6.198,0.813,,,KCHI,,
2,SPEAKER,ny_7759,1,7.773,0.145,,,KCHI,,
3,SPEAKER,ny_7759,1,9.560,0.380,,,MAL,,
4,SPEAKER,ny_7759,1,10.694,0.240,,,KCHI,,
...,...,...,...,...,...,...,...,...,...,...
143,SPEAKER,ny_7759,1,235.567,3.714,,,FEM,,
144,SPEAKER,ny_7759,1,238.321,0.120,,,MAL,,
145,SPEAKER,ny_7759,1,238.641,0.125,,,MAL,,
146,SPEAKER,ny_7759,1,238.888,0.351,,,KCHI,,


In [None]:
# We need times to be in milliseconds
# ny_7759[['start', 'duration']] = ny_7759[['start', 'duration']]*1000

In [None]:
# This is to make our lives easier later
ny_7759_bak = copy.deepcopy(ny_7759)
# ny_7759 = copy.deepcopy(ny_7759_bak)

+ https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html?highlight=time%20range#timeseries-offset-aliases
+ https://analyzingalpha.com/time-series-analysis-with-python
https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html?highlight=timedelta%20range
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.timedelta_range.html#pandas.timedelta_range
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.to_frame.html

We'll want a function to handle creating the millisecond-indexed dataframes for us...

In [11]:
def build_millisecond_range(start, duration, value='', valname='value', verbose=False):
    """ Given a start time, and end time, and a value,
        create a dataframe with a timedelta index containing
        that value for the range between the endpoints
    """
    
    # Turn the endpoints into a millisecond-denominated target
    low_end = pd.to_timedelta(round(start, 1), unit='milliseconds')
    span = pd.to_timedelta(round(duration, 1), unit='milliseconds')
    
    if verbose:
        print(f"Low end: {low_end}\nDuration: {span}\n")
    
    # Create a range between them
    rng = pd.timedelta_range(low_end, low_end+span, freq='L')

    # Turn that series into a DataFrame and rename the index for clarity
    df = pd.Series(value, index=rng).to_frame(name=valname)
    df.index.name='milliseconds'
    if verbose:
        print(df.head(3))
    return df

In [None]:
ny_7759.tail(5)

In [7]:
# We only care about a subset of the RTTM's columns. We'll ignore the others.
# Also, for now, we only want to worry about the 'SPEECH' class.
ny_7759_short = ny_7759[['start', 'duration', 'class']][ny_7759['class']=='SPEECH'][0:10]
ny_7759_short.reset_index(drop=True, inplace=True)

In [None]:
# When we eventually merge our data back onto one DataFrame, the timestamp columns (start, duration, etc.)
# need to have a dtype that plays nicely with timedelta dtypes. So we convert it here.
# ny_7759['start'] = pd.to_timedelta(ny_7759['start'], unit='L')

# Consider using approach here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reindex.html#pandas.DataFrame.reindex
# To cast the index of the RTTM-extracted DF to milliseconds and then update. NaNs will be created in-between; we could
# then fill down using .fillna()


# # Here we'll append everything to see if it worked as we'd expect
# print(f"Base DF size: {len(left_df)}")
# for i in range(1, len(ny_7759_short)):
#     left_df = left_df.append(
#         build_millisecond_range(
#             ny_7759_short.loc[i]['start'],
#             ny_7759_short.loc[i]['duration'],
#             'SPEECH', 'class',
# #             verbose=True
#             verbose=False
#         ))
# #     print(left_df.index.max(),'\n')
#     print(f"Base DF size after {i} rounds: {len(left_df)}")

In [None]:
def build_millisecond_indexed_df(input_df, label_list=[], verbose=True):
    """ Given an RTTM-generated DataFrame, generate a pivoted DF
        containing all of the labels of interest, one-hot encoded
    """
    
    max_seconds_needed = get_latest_timestamp_needed(input_df)
    if verbose:
        print(f'max_seconds_needed is of type {type(max_seconds_needed)} and equal to {max_seconds_needed}')
    outer_df = build_millisecond_range(
                0,
                max_seconds_needed,
                np.nan,
                'base',
                verbose=True)
    
    if verbose:
        print(f'The outer_df frame will contain {len(outer_df)} records.')
    
    # Loop through labels, subsetting the original DF so
    # we can merge it back into the main outer DF
    for label in label_list:
        print(f'Processing label: {label}\n')

        if label not in input_df['class'].unique():
            print(f'Label {label} not found in this dataset')
            label_base_df = pd.DataFrame(columns=[label])
            continue
        
        # Gotta avoid errors from accidentally manipulating original DFs
        temp_df = copy.deepcopy(input_df)

        # Generate a temp_df that contains only records for the label of interest
        temp_df = input_df[['start','duration','class']][input_df['class']==label]

        if verbose:
            print(f'The temp_df subset for label {label} contains {len(temp_df)} rows')
            print(temp_df.head(5))
        
        # The subsetted DF retains the original index unless you reset it
        temp_df.reset_index(drop=True, inplace=True)

        if verbose:
            print(f'The temp_df frame is as follows:\n{temp_df}')

        # Creating the base DF for this label - ranges from 0 to the earliest record
        label_base_df = build_millisecond_range(0, temp_df['start'].min(),
                                     np.nan, label, verbose=verbose)
        if verbose:
            # The head() will always be the same, so we need to look at the tail() to verify
            print(f'\n>>> The last few rows of the label_base_df for label {label} are:\n{label_base_df.tail()}')
            print(f"\n>>> Base DF size for label {label}: {len(temp_df)}\n")

        for i in range(1, len(temp_df)):
            label_base_df = label_base_df.append(
                build_millisecond_range(
                    temp_df.loc[i]['start'],
                    temp_df.loc[i]['duration'],
                    str(label), str(label),
                    verbose=verbose
                ))

            if verbose:
                print(f'\n>>> Base DF size after {i} rounds: {len(label_base_df)}')
                print(f'\n>>> The head:\n{label_base_df.head(10)}\n>>> The tail:\n{label_base_df.tail(10)}')
                print(f'\n>>> A few of its contents:\n{label_base_df[~label_base_df[label].isna()].head(5)}')

        # Creating a placeholder for the update call
        outer_df[label] = np.nan
        
        # When attempting the update method:
        print(f'Attempting update with DF from label {label}')
        outer_df.update(label_base_df, overwrite=True)
        
        if verbose:
            try:
                print(outer_df[~outer_df[label].isna()].head())
            except:
                print(outer_df)
            
        # When attempting the merge method:
#         outer_df = pd.merge(left=outer_df, left_index=True,
#                             right=label_base_df, right_index=True,
#                             how='inner', suffixes=('_base',''),
#                             indicator=True, validate='1:1')
#         if verbose:
#             print(outer_df[~outer_df[f'{label}_y'].isna()].head(5))

    return outer_df
        

In [12]:
tester = df_from_rttm('./ny_7759.rttm')
tester[['start', 'duration']] = tester[['start', 'duration']]*1000
labels = ['MAL', 'CHI', 'KCHI', 'FEM', 'SPEECH']
result = build_millisecond_indexed_df(tester, label_list=labels, verbose=True)

max_seconds_needed is of type <class 'numpy.float64'> and equal to 239147.0
Low end: 0 days 00:00:00
Duration: 0 days 00:03:59.147000

                 base
milliseconds         
00:00:00          NaN
00:00:00.001000   NaN
00:00:00.002000   NaN
The outer_df frame will contain 239148 records.
Processing label: MAL

The temp_df subset for label MAL contains 26 rows
      start  duration class
3    9560.0     380.0   MAL
5   12565.0    1787.0   MAL
8   18416.0    1573.0   MAL
15  25397.0     118.0   MAL
16  25822.0     250.0   MAL
The temp_df frame is as follows:
       start  duration class
0     9560.0     380.0   MAL
1    12565.0    1787.0   MAL
2    18416.0    1573.0   MAL
3    25397.0     118.0   MAL
4    25822.0     250.0   MAL
5    55991.0    2678.0   MAL
6    59493.0    1637.0   MAL
7    72441.0     417.0   MAL
8    75013.0    2190.0   MAL
9    79323.0     552.0   MAL
10  101017.0     969.0   MAL
11  102122.0     120.0   MAL
12  106812.0     245.0   MAL
13  120992.0     815.0   MA

00:01:19.325000  MAL

>>> Base DF size after 9 rounds: 20772

>>> The head:
                 MAL
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 MAL
milliseconds        
00:01:19.866000  MAL
00:01:19.867000  MAL
00:01:19.868000  MAL
00:01:19.869000  MAL
00:01:19.870000  MAL
00:01:19.871000  MAL
00:01:19.872000  MAL
00:01:19.873000  MAL
00:01:19.874000  MAL
00:01:19.875000  MAL

>>> A few of its contents:
                 MAL
milliseconds        
00:00:12.565000  MAL
00:00:12.566000  MAL
00:00:12.567000  MAL
00:00:12.568000  MAL
00:00:12.569000  MAL
Low end: 0 days 00:01:41.017000
Duration: 0 days 00:00:00.969000

                 MAL
milliseconds        
00:01:41.017000  MAL
00:01:41.018000  MAL
00:01:41.019000  MAL

>>> Base DF size after 10 rounds: 21742

>>> The head:
    

                 base  MAL
milliseconds              
00:00:12.565000   NaN  MAL
00:00:12.566000   NaN  MAL
00:00:12.567000   NaN  MAL
00:00:12.568000   NaN  MAL
00:00:12.569000   NaN  MAL
Processing label: CHI

The temp_df subset for label CHI contains 24 rows
      start  duration class
19  33011.0     363.0   CHI
22  37144.0     257.0   CHI
28  41050.0     167.0   CHI
29  42356.0     142.0   CHI
30  42992.0     353.0   CHI
The temp_df frame is as follows:
       start  duration class
0    33011.0     363.0   CHI
1    37144.0     257.0   CHI
2    41050.0     167.0   CHI
3    42356.0     142.0   CHI
4    42992.0     353.0   CHI
5    45259.0    1752.0   CHI
6    48011.0     294.0   CHI
7    51104.0     424.0   CHI
8    54010.0     351.0   CHI
9    80048.0     110.0   CHI
10   80377.0    3134.0   CHI
11   97991.0     429.0   CHI
12  113034.0     199.0   CHI
13  114171.0     208.0   CHI
14  125992.0     132.0   CHI
15  134011.0     116.0   CHI
16  134480.0     481.0   CHI
17  159602.0   

00:00:37.148000  CHI
Low end: 0 days 00:01:54.171000
Duration: 0 days 00:00:00.208000

                 CHI
milliseconds        
00:01:54.171000  CHI
00:01:54.172000  CHI
00:01:54.173000  CHI

>>> Base DF size after 13 rounds: 40845

>>> The head:
                 CHI
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 CHI
milliseconds        
00:01:54.370000  CHI
00:01:54.371000  CHI
00:01:54.372000  CHI
00:01:54.373000  CHI
00:01:54.374000  CHI
00:01:54.375000  CHI
00:01:54.376000  CHI
00:01:54.377000  CHI
00:01:54.378000  CHI
00:01:54.379000  CHI

>>> A few of its contents:
                 CHI
milliseconds        
00:00:37.144000  CHI
00:00:37.145000  CHI
00:00:37.146000  CHI
00:00:37.147000  CHI
00:00:37.148000  CHI
Low end: 0 days 00:02:05.992000
Duration: 0 days 00:00:00.1

                 base  MAL  CHI
milliseconds                   
00:00:37.144000   NaN  NaN  CHI
00:00:37.145000   NaN  NaN  CHI
00:00:37.146000   NaN  NaN  CHI
00:00:37.147000   NaN  NaN  CHI
00:00:37.148000   NaN  NaN  CHI
Processing label: KCHI

The temp_df subset for label KCHI contains 26 rows
      start  duration class
1    6198.0     813.0  KCHI
2    7773.0     145.0  KCHI
4   10694.0     240.0  KCHI
7   16661.0     319.0  KCHI
13  23492.0     505.0  KCHI
The temp_df frame is as follows:
       start  duration class
0     6198.0     813.0  KCHI
1     7773.0     145.0  KCHI
2    10694.0     240.0  KCHI
3    16661.0     319.0  KCHI
4    23492.0     505.0  KCHI
5    34965.0    1856.0  KCHI
6    37511.0    1509.0  KCHI
7    40011.0     208.0  KCHI
8    41016.0     494.0  KCHI
9    66812.0    1475.0  KCHI
10   87458.0    1534.0  KCHI
11  113030.0     486.0  KCHI
12  115236.0     257.0  KCHI
13  132167.0     118.0  KCHI
14  133432.0    1509.0  KCHI
15  141077.0     986.0  KCHI
16  145

00:02:14.941000  KCHI

>>> A few of its contents:
                 KCHI
milliseconds         
00:00:07.773000  KCHI
00:00:07.774000  KCHI
00:00:07.775000  KCHI
00:00:07.776000  KCHI
00:00:07.777000  KCHI
Low end: 0 days 00:02:21.077000
Duration: 0 days 00:00:00.986000

                 KCHI
milliseconds         
00:02:21.077000  KCHI
00:02:21.078000  KCHI
00:02:21.079000  KCHI

>>> Base DF size after 15 rounds: 17855

>>> The head:
                KCHI
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 KCHI
milliseconds         
00:02:22.054000  KCHI
00:02:22.055000  KCHI
00:02:22.056000  KCHI
00:02:22.057000  KCHI
00:02:22.058000  KCHI
00:02:22.059000  KCHI
00:02:22.060000  KCHI
00:02:22.061000  KCHI
00:02:22.062000  KCHI
00:02:22.063000  KCHI

>>> A few of its contents:
      

                 KCHI
milliseconds         
00:03:58.888000  KCHI
00:03:58.889000  KCHI
00:03:58.890000  KCHI

>>> Base DF size after 25 rounds: 23012

>>> The head:
                KCHI
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 KCHI
milliseconds         
00:03:59.230000  KCHI
00:03:59.231000  KCHI
00:03:59.232000  KCHI
00:03:59.233000  KCHI
00:03:59.234000  KCHI
00:03:59.235000  KCHI
00:03:59.236000  KCHI
00:03:59.237000  KCHI
00:03:59.238000  KCHI
00:03:59.239000  KCHI

>>> A few of its contents:
                 KCHI
milliseconds         
00:00:07.773000  KCHI
00:00:07.774000  KCHI
00:00:07.775000  KCHI
00:00:07.776000  KCHI
00:00:07.777000  KCHI
Attempting update with DF from label KCHI
                 base  MAL  CHI  KCHI
milliseconds                         
00:0


                 FEM
milliseconds        
00:01:01.012000  FEM
00:01:01.013000  FEM
00:01:01.014000  FEM

>>> Base DF size after 10 rounds: 49310

>>> The head:
                 FEM
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 FEM
milliseconds        
00:01:01.121000  FEM
00:01:01.122000  FEM
00:01:01.123000  FEM
00:01:01.124000  FEM
00:01:01.125000  FEM
00:01:01.126000  FEM
00:01:01.127000  FEM
00:01:01.128000  FEM
00:01:01.129000  FEM
00:01:01.130000  FEM

>>> A few of its contents:
                 FEM
milliseconds        
00:00:27.471000  FEM
00:00:27.472000  FEM
00:00:27.473000  FEM
00:00:27.474000  FEM
00:00:27.475000  FEM
Low end: 0 days 00:01:01.317000
Duration: 0 days 00:00:02.486000

                 FEM
milliseconds        
00:01:01.317000  FEM
00:01:01.318000 

00:01:54.857000  FEM

>>> A few of its contents:
                 FEM
milliseconds        
00:00:27.471000  FEM
00:00:27.472000  FEM
00:00:27.473000  FEM
00:00:27.474000  FEM
00:00:27.475000  FEM
Low end: 0 days 00:01:54.974000
Duration: 0 days 00:00:07.037000

                 FEM
milliseconds        
00:01:54.974000  FEM
00:01:54.975000  FEM
00:01:54.976000  FEM

>>> Base DF size after 24 rounds: 102854

>>> The head:
                 FEM
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 FEM
milliseconds        
00:02:02.002000  FEM
00:02:02.003000  FEM
00:02:02.004000  FEM
00:02:02.005000  FEM
00:02:02.006000  FEM
00:02:02.007000  FEM
00:02:02.008000  FEM
00:02:02.009000  FEM
00:02:02.010000  FEM
00:02:02.011000  FEM

>>> A few of its contents:
                 FEM
milliseco


>>> Base DF size after 36 rounds: 134292

>>> The head:
                 FEM
milliseconds        
00:00:00         NaN
00:00:00.001000  NaN
00:00:00.002000  NaN
00:00:00.003000  NaN
00:00:00.004000  NaN
00:00:00.005000  NaN
00:00:00.006000  NaN
00:00:00.007000  NaN
00:00:00.008000  NaN
00:00:00.009000  NaN
>>> The tail:
                 FEM
milliseconds        
00:02:54.093000  FEM
00:02:54.094000  FEM
00:02:54.095000  FEM
00:02:54.096000  FEM
00:02:54.097000  FEM
00:02:54.098000  FEM
00:02:54.099000  FEM
00:02:54.100000  FEM
00:02:54.101000  FEM
00:02:54.102000  FEM

>>> A few of its contents:
                 FEM
milliseconds        
00:00:27.471000  FEM
00:00:27.472000  FEM
00:00:27.473000  FEM
00:00:27.474000  FEM
00:00:27.475000  FEM
Low end: 0 days 00:02:55.265000
Duration: 0 days 00:00:07.271000

                 FEM
milliseconds        
00:02:55.265000  FEM
00:02:55.266000  FEM
00:02:55.267000  FEM

>>> Base DF size after 37 rounds: 141564

>>> The head:
                 FEM
m

                 base  MAL  CHI KCHI  FEM
milliseconds                             
00:00:27.471000   NaN  NaN  NaN  NaN  FEM
00:00:27.472000   NaN  NaN  NaN  NaN  FEM
00:00:27.473000   NaN  NaN  NaN  NaN  FEM
00:00:27.474000   NaN  NaN  NaN  NaN  FEM
00:00:27.475000   NaN  NaN  NaN  NaN  FEM
Processing label: SPEECH

The temp_df subset for label SPEECH contains 25 rows
      start  duration   class
0    6038.0    5191.0  SPEECH
6   14011.0    3144.0  SPEECH
9   18488.0     209.0  SPEECH
10  19064.0     209.0  SPEECH
11  20993.0    3537.0  SPEECH
The temp_df frame is as follows:
       start  duration   class
0     6038.0    5191.0  SPEECH
1    14011.0    3144.0  SPEECH
2    18488.0     209.0  SPEECH
3    19064.0     209.0  SPEECH
4    20993.0    3537.0  SPEECH
5    25323.0   38575.0  SPEECH
6    64095.0    5543.0  SPEECH
7    69879.0   14924.0  SPEECH
8    85357.0    1947.0  SPEECH
9    87413.0   25204.0  SPEECH
10  112992.0    9766.0  SPEECH
11  123491.0    1509.0  SPEECH
12  125280.

00:02:03.493000  SPEECH

>>> Base DF size after 11 rounds: 110617

>>> The head:
                SPEECH
milliseconds          
00:00:00           NaN
00:00:00.001000    NaN
00:00:00.002000    NaN
00:00:00.003000    NaN
00:00:00.004000    NaN
00:00:00.005000    NaN
00:00:00.006000    NaN
00:00:00.007000    NaN
00:00:00.008000    NaN
00:00:00.009000    NaN
>>> The tail:
                 SPEECH
milliseconds           
00:02:04.991000  SPEECH
00:02:04.992000  SPEECH
00:02:04.993000  SPEECH
00:02:04.994000  SPEECH
00:02:04.995000  SPEECH
00:02:04.996000  SPEECH
00:02:04.997000  SPEECH
00:02:04.998000  SPEECH
00:02:04.999000  SPEECH
00:02:05         SPEECH

>>> A few of its contents:
                 SPEECH
milliseconds           
00:00:14.011000  SPEECH
00:00:14.012000  SPEECH
00:00:14.013000  SPEECH
00:00:14.014000  SPEECH
00:00:14.015000  SPEECH
Low end: 0 days 00:02:05.280000
Duration: 0 days 00:00:03.664000

                 SPEECH
milliseconds           
00:02:05.280000  SPEECH
00:02:0


>>> A few of its contents:
                 SPEECH
milliseconds           
00:00:14.011000  SPEECH
00:00:14.012000  SPEECH
00:00:14.013000  SPEECH
00:00:14.014000  SPEECH
00:00:14.015000  SPEECH
Low end: 0 days 00:02:55.249000
Duration: 0 days 00:00:11.407000

                 SPEECH
milliseconds           
00:02:55.249000  SPEECH
00:02:55.250000  SPEECH
00:02:55.251000  SPEECH

>>> Base DF size after 20 rounds: 166117

>>> The head:
                SPEECH
milliseconds          
00:00:00           NaN
00:00:00.001000    NaN
00:00:00.002000    NaN
00:00:00.003000    NaN
00:00:00.004000    NaN
00:00:00.005000    NaN
00:00:00.006000    NaN
00:00:00.007000    NaN
00:00:00.008000    NaN
00:00:00.009000    NaN
>>> The tail:
                 SPEECH
milliseconds           
00:03:06.647000  SPEECH
00:03:06.648000  SPEECH
00:03:06.649000  SPEECH
00:03:06.650000  SPEECH
00:03:06.651000  SPEECH
00:03:06.652000  SPEECH
00:03:06.653000  SPEECH
00:03:06.654000  SPEECH
00:03:06.655000  SPEECH
00:03:0

In [14]:
result.loc['00:03:30.840000':'00:03:30.860000']

Unnamed: 0_level_0,base,MAL,CHI,KCHI,FEM,SPEECH
milliseconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00:03:30.840000,,MAL,,,FEM,SPEECH
00:03:30.841000,,MAL,,,FEM,SPEECH
00:03:30.842000,,MAL,,,FEM,SPEECH
00:03:30.843000,,MAL,,,FEM,SPEECH
00:03:30.844000,,MAL,,,FEM,SPEECH
00:03:30.845000,,MAL,,,FEM,SPEECH
00:03:30.846000,,MAL,,,FEM,SPEECH
00:03:30.847000,,MAL,,,FEM,SPEECH
00:03:30.848000,,MAL,,,FEM,SPEECH
00:03:30.849000,,MAL,,,FEM,SPEECH


In [23]:
for col in result.columns:
    print(f""" This DataFrameresult[col].describe())

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: base, dtype: float64
count     19728
unique        1
top         MAL
freq      19728
Name: MAL, dtype: object
count     19408
unique        1
top         CHI
freq      19408
Name: CHI, dtype: object
count     16721
unique        1
top        KCHI
freq      16721
Name: KCHI, dtype: object
count     167996
unique         1
top          FEM
freq      167996
Name: FEM, dtype: object
count     211447
unique         1
top       SPEECH
freq      211447
Name: SPEECH, dtype: object
