In [1]:
import copy
import pandas as pd
import numpy as np

# The strategy

We have data that comes in as positionally-indexed tags (`CHI`, `FEM`, `MAL`, `SPEECH`) of varying durations. As a result, the data from multiple different indexes overlaps, but the endpoints are inconsistent. This makes it difficult to calculate things like relative durations, overlaps, pauses, etc.

The goal of this series of functions is twofold. Both goals begin with an `.rttm` file generated by our model classification.

**Goal 1**: generate a dataframe with the simplified structure required for our existing visualizations. This dataframe is positionally-indexed, with variable-length durations mapped to specific output classes.

**Goal 2**: generate a dataframe that is time-indexed with one-hot-encoded classes.

The intermediate product for **Goal 2** is granular at the millisecond range. The final output for **Goal 2** has that same granularity collapsed to a human timescale of 200 ms.

## Goal 1: generate a positionally-indexed dataframe

In [2]:
def RttmToUtteranceIndexedSpeakerActivity(df, outfile=None):
    """ Given an RTTM input file, generate a dataframe structured
        to support a visualization of type 'Speaker Activity' and optionally
        export to a csv located at {outfile}

        df = Pandas DataFrame containing a standard .rttm file
        outfile = destination for exported CSV (path, filename, extension)
    """

    # Check whether an outfile has been defined
    if outfile is not None:
        export = True

    # Drop the columns we don't care about from a base RTTM
    vizframe = copy.deepcopy(df) \
        .drop(
        columns=[
            'task',
            'inputFile',
            'one',
            'NA_1',
            'NA_2',
            'NA_3',
            'NA_4'])

    # Rename columns for our viz's purposes
    vizframe = vizframe.rename(columns={
        'start': 'START',
        'duration': 'DUR',
        'class': 'LABEL'
    })

    # Remap the model classes for this viz's purposes
    vizframe['LABEL'] = vizframe['LABEL'].map({
        'KCHI': 'CHILD',
        'CHI': 'CHILD',
        'FEM': 'ADULT',
        'MAL': 'ADULT'
    })

    # Filter the dataframe to just the 'clean' (non-'SPEECH') classes
    vizframe = vizframe[vizframe['LABEL'].isin(['CHILD', 'ADULT'])]
    vizframe['LABEL_NUM'] = vizframe['LABEL'] \
        .apply(lambda x: 1 if x == 'CHILD'
               else (-1 if x == 'ADULT' else NaN))
    vizframe['DUR_TRANS'] = vizframe['LABEL_NUM'] * vizframe['DUR']
    vizframe['COUNT'] = 1

    if export:
        vizframe.to_csv(outfile)

    return vizframe

## Generate a time-indexed dataframe

To execute this strategy we proceed as follows:
1. Load the `.rttm` into a dataframe that is an unaltered representation of the modeling output
2. Identify the latest timestamp that needs to be represented in our time-indexed dataframe.
   1. We actually want to bookend our dataframe, but the early endpoint is `t = 0.00`, so we can ignore it at this point.  
   1. The late endpoint is the maximum combined value of the `start` column and the `duration` column. (**N.B.**: it doesn't necessarily have to be the last timestamp. An earlier timestamp could run until later if it has a larger duration.)  
3. Build out a `reference` dataframe that runs from `t = 0.00` to `t = max(start+duration)` and populate it with `np.nan` values.
4. Generate subsetted dataframes from the original `rttm` dataframe that contain only `start` and `duration` values for each of the unique labels
5. Convert those dataframes from positionally-indexed to time-indexed with the same granularity and resolution as the `reference` dataframe
6. Add to the `reference` dataframe a column for each unique `class` containing `np.nan` values.
7. Merge (or update in place) the `reference` dataframe's `class` column with the subsetted `class` dataframe

### Steps 1-2: Loading DF and getting latest timestamp

In [3]:
# Step 1: Load the .rttm into a dataframe
def DfFromRttm(rttm):
    """ Given an RTTM file, parses it into a Pandas DataFrame.
    """
    df = pd.read_csv(rttm,
                     sep=' ',
                     names=['task','inputFile','one','start','duration',
                     'NA_1','NA_2','class','NA_3', 'NA_4'])    
    return df

In [4]:
## Step 2: Identify the latest timestamp we need.
def GetLatestTimestampNeeded(input_df, verbose=True):
    """ Given an RTTM-derived dataframe,
        extract the last timestamp we'll need
        as a scalar. It will be the maximum value of
        the `start` + `duration` columns.
    """
    input_df['end_time'] = input_df['start'] + input_df['duration']
    end_row = input_df['end_time'].idxmax()
    latest_timestamp = input_df.at[end_row,'start'] + input_df.at[end_row,'duration']
    if verbose:
        print(f'''
        >> This DF has data that runs until {latest_timestamp}.
        >> That value was found at row {end_row} and is the sum of
           {input_df.at[end_row, 'start']} and {input_df.at[end_row, 'duration']}
        ''')
    return latest_timestamp

In [5]:
# Testing our Steps 1 and 2
ny_7759_short = DfFromRttm('./ny_7759.rttm')[0:10]
GetLatestTimestampNeeded(ny_7759_short, True)
ny_7759_short.head(10)


        >> This DF has data that runs until 19.989.
        >> That value was found at row 8 and is the sum of
           18.416 and 1.5730000000000002
        


Unnamed: 0,task,inputFile,one,start,duration,NA_1,NA_2,class,NA_3,NA_4,end_time
0,SPEAKER,ny_7759,1,6.038,5.191,,,SPEECH,,,11.229
1,SPEAKER,ny_7759,1,6.198,0.813,,,KCHI,,,7.011
2,SPEAKER,ny_7759,1,7.773,0.145,,,KCHI,,,7.918
3,SPEAKER,ny_7759,1,9.56,0.38,,,MAL,,,9.94
4,SPEAKER,ny_7759,1,10.694,0.24,,,KCHI,,,10.934
5,SPEAKER,ny_7759,1,12.565,1.787,,,MAL,,,14.352
6,SPEAKER,ny_7759,1,14.011,3.144,,,SPEECH,,,17.155
7,SPEAKER,ny_7759,1,16.661,0.319,,,KCHI,,,16.98
8,SPEAKER,ny_7759,1,18.416,1.573,,,MAL,,,19.989
9,SPEAKER,ny_7759,1,18.488,0.209,,,SPEECH,,,18.697


### Step 3: Generate reference DF from `t = 0 ... latest_timestamp`

In [6]:
def MakeMillisecondIntegerIndexedDf(
        start,
        duration,
        colname='value',
        colvalue='',
        decimals=4,
        verbose=False):
    """ Create an integer-indexed dataframe covering a 
            {duration} from
            {start} having a single column with the name
            {colname} that contains a default value of
            {colvalue} for that named column.
        By default it runs quietly rather than {verbose}.
    """

    if verbose:
        print(f' >>> From [{start}] for [{duration}] seconds' \
              f' until [{start+duration}] the col [{colname}]' \
              f' will contain the value [{colvalue}]')
    
    # Create a range between the start and stop
    rng = pd.RangeIndex(start = int(round(start, decimals)*1000),
                        stop  = int(round(start+duration, decimals)*1000),
                        step  = 1)
    
    # Turn that series into a DataFrame and rename the index for clarity
    df = pd.Series(colvalue, index=rng).to_frame(name=colname)
    df.index.name=f'millisecond_ints'
    if verbose:
        print(df.head(3), df.tail(3))
    return df

In [7]:
# Testing our Step 3
base_df = MakeMillisecondIntegerIndexedDf(
           start     = 0,
           duration  = 19.989,
           colname   = 'base_col',
           colvalue  = np.nan,
           decimals  = 4,
           verbose   = True)

 >>> From [0] for [19.989] seconds until [19.989] the col [base_col] will contain the value [nan]
                  base_col
millisecond_ints          
0                      NaN
1                      NaN
2                      NaN                   base_col
millisecond_ints          
19986                  NaN
19987                  NaN
19988                  NaN


### Step 4: Generate subsetted DFs

In [8]:
def SubsetDfByLabel(df, column_list, key_col, value):
    """ Given a dataframe {df}, return the subset
        of the dataframe defined by {column_list}
        containing {value} in the {key_col} column
    """
    return df[df[key_col] == value][[x for x in column_list]]

In [9]:
# Testing our Step 4
ny_7759_MAL = SubsetDfByLabel(ny_7759_short, column_list=['start', 'duration', 'class'], key_col='class', value='MAL')

In [10]:
# Testing with Step 5
base_df['MAL'] = np.nan
for i in ny_7759_MAL.index:
    print('*' * 10 + f'i = {i}' + '*' * 10)
    s, d, cn = ny_7759_MAL.loc[i, ['start', 'duration', 'class']]
    time_indexed_df = MakeMillisecondIntegerIndexedDf(
                start      = s,
                duration   = d,
                colname    = cn,
                colvalue   = 1,
                decimals   = 4,
                verbose    = True)
    base_df.update(other = time_indexed_df,
               overwrite = False)

**********i = 3**********
 >>> From [9.56] for [0.38] seconds until [9.940000000000001] the col [MAL] will contain the value [1]
                  MAL
millisecond_ints     
9560                1
9561                1
9562                1                   MAL
millisecond_ints     
9937                1
9938                1
9939                1
**********i = 5**********
 >>> From [12.565] for [1.787] seconds until [14.352] the col [MAL] will contain the value [1]
                  MAL
millisecond_ints     
12565               1
12566               1
12567               1                   MAL
millisecond_ints     
14349               1
14350               1
14351               1
**********i = 8**********
 >>> From [18.416] for [1.5730000000000002] seconds until [19.989] the col [MAL] will contain the value [1]
                  MAL
millisecond_ints     
18416               1
18417               1
18418               1                   MAL
millisecond_ints     
19986               1


In [11]:
# Verifying the test outcomes
base_df['MAL'].sum()

3740.0

In [12]:
# This is to make our lives easier later
ny_7759 = DfFromRttm('./ny_7759.rttm')
ny_7759_bak = copy.deepcopy(ny_7759)
# ny_7759 = copy.deepcopy(ny_7759_bak)

+ https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html?highlight=time%20range#timeseries-offset-aliases
+ https://analyzingalpha.com/time-series-analysis-with-python
https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html?highlight=timedelta%20range
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.timedelta_range.html#pandas.timedelta_range
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.to_frame.html

In [13]:
def BuildOneHotEncodedDf(
    input_df,
    label_list=[],
    verbose=True
    ):
    """ Given an RTTM-generated DataFrame, generate a DF containing
        all of the labels of interest in one-hot encoded format
        against an integer-indexed DF representing milliseconds from
        the beginning of the recording
    """
    
    max_seconds_needed = GetLatestTimestampNeeded(input_df)
    if verbose:
        print(f'max_seconds_needed is equal to {max_seconds_needed}')

    outer_df = MakeMillisecondIntegerIndexedDf(
                start      = 0,
                duration   = max_seconds_needed,
                colname    = 'base_col',
                colvalue   = np.nan,
                decimals   = 4,
                verbose    = True)

    if verbose:
        print(f'The outer_df frame will contain {outer_df.shape[0]} records.')
    
    # Loop through labels, subsetting the original DF so
    # we can merge it back into the main outer DF
    for label in label_list:
        print(f'>>> Processing label: {label}\n')

        if label not in input_df['class'].unique():
            print(f'Label {label} not found in this dataset')
            label_base_df = pd.DataFrame(columns=[label])
            continue
        
        # Gotta avoid errors from accidentally manipulating original DFs
        temp_df = copy.deepcopy(input_df)

        # Replace the deep-copied DF with a subset of itself
        # that contains only records for the label of interest
        temp_df = SubsetDfByLabel(
            input_df,
            column_list = ['start', 'duration', 'class'],
            key_col     = 'class',
            value       = label
        )

        if verbose:
            print(f'The temp_df subset for label {label}' \
                  f' contains {temp_df.shape[0]} rows')
            print(temp_df.head(5),
                  temp_df.tail(5))
        
        # The subsetted DF retains the original index unless you reset it
        temp_df.reset_index(drop=True,
                            inplace=True)

        if verbose:
            print(f'The temp_df frame is as follows:\n{temp_df}')

        # Creating the base DF for this label - ranges from 0 to the earliest record
        label_base_df = MakeMillisecondIntegerIndexedDf(
                start      = 0,
                duration   = temp_df['start'].min(),
                colname    = label,
                colvalue   = np.nan,
                decimals   = 4,
                verbose    = verbose)
        
        if verbose:
            # The head() will always be the same, so we need to look at the tail() to verify
            print('\n>>> The last few rows of the label_base_df for label' \
                  f' {label} are:\n{label_base_df.tail()}' \
                  f'\n>>> Base DF size for label {label}: {len(temp_df)}\n')

        for i in temp_df.index:
            if verbose:
                print('*' * 10 + f'{label}: i = {i}' + '*' * 10)
            s, d, cn = temp_df.loc[i, ['start', 'duration', 'class']]
            label_base_df = label_base_df.append(
                MakeMillisecondIntegerIndexedDf(
                    start    = s,
                    duration = d,
                    colname  = cn,
                    colvalue = 1,
                    decimals = 4,
                    verbose  = verbose
                ))
            
            if verbose:
                print(f'\n>>> Base DF size after {i}' \
                      ' rounds: {len(label_base_df)}')
                print(f'\n>>> The head:\n{label_base_df.head(10)}'\
                      f'\n>>> The tail:\n{label_base_df.tail(10)}')
                print('\n>>> A few of its contents:'\
                      f'\n{label_base_df[~label_base_df[label].isna()].head(5)}')

        # Creating a placeholder for the update call
        outer_df[label] = np.nan
        
        # When attempting the update method:
        print(f'Attempting update with DF from label {label}')
        outer_df.update(
            other     = label_base_df,
            overwrite = True)

        if verbose:
            try:
                print(outer_df[~outer_df[label].isna()].head())
            except:
                print(outer_df)
            
    return outer_df

In [14]:
tester = DfFromRttm('./ny_7759.rttm')
# tester[['start', 'duration']] = round(tester[['start', 'duration']], 4)
# tester[['start', 'duration']] = tester[['start', 'duration']]
labels = ['MAL', 'CHI', 'KCHI', 'FEM', 'SPEECH']
tester[['start', 'duration']].loc[2]

start       7.773
duration    0.145
Name: 2, dtype: float64

In [15]:
result = BuildOneHotEncodedDf(tester, label_list=labels, verbose=True)


        >> This DF has data that runs until 239.67000000000002.
        >> That value was found at row 142 and is the sum of
           235.49200000000002 and 4.178
        
max_seconds_needed is equal to 239.67000000000002
 >>> From [0] for [239.67000000000002] seconds until [239.67000000000002] the col [base_col] will contain the value [nan]
                  base_col
millisecond_ints          
0                      NaN
1                      NaN
2                      NaN                   base_col
millisecond_ints          
239667                 NaN
239668                 NaN
239669                 NaN
The outer_df frame will contain 239670 records.
>>> Processing label: MAL

The temp_df subset for label MAL contains 26 rows
     start  duration class
3    9.560     0.380   MAL
5   12.565     1.787   MAL
8   18.416     1.573   MAL
15  25.397     0.118   MAL
16  25.822     0.250   MAL        start  duration class
131  209.993     0.857   MAL
132  210.978     1.256   MAL
138  225.

75015               1                   MAL
millisecond_ints     
77200               1
77201               1
77202               1

>>> Base DF size after 8 rounds: {len(label_base_df)}

>>> The head:
                  MAL
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  MAL
millisecond_ints     
77193             1.0
77194             1.0
77195             1.0
77196             1.0
77197             1.0
77198             1.0
77199             1.0
77200             1.0
77201             1.0
77202             1.0

>>> A few of its contents:
                  MAL
millisecond_ints     
9560              1.0
9561              1.0
9562              1.0
9563              1.0
9564              1.0
**********MAL: i = 9**********
 >>> From [79.32300000000001] for [0.552] s


>>> Base DF size after 16 rounds: {len(label_base_df)}

>>> The head:
                  MAL
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  MAL
millisecond_ints     
139579            1.0
139580            1.0
139581            1.0
139582            1.0
139583            1.0
139584            1.0
139585            1.0
139586            1.0
139587            1.0
139588            1.0

>>> A few of its contents:
                  MAL
millisecond_ints     
9560              1.0
9561              1.0
9562              1.0
9563              1.0
9564              1.0
**********MAL: i = 17**********
 >>> From [139.696] for [0.113] seconds until [139.809] the col [MAL] will contain the value [1]
                  MAL
millisecond_ints     
139696              1
139697    

238765            1.0

>>> A few of its contents:
                  MAL
millisecond_ints     
9560              1.0
9561              1.0
9562              1.0
9563              1.0
9564              1.0
Attempting update with DF from label MAL
                  base_col  MAL
millisecond_ints               
9560                   NaN  1.0
9561                   NaN  1.0
9562                   NaN  1.0
9563                   NaN  1.0
9564                   NaN  1.0
>>> Processing label: CHI

The temp_df subset for label CHI contains 24 rows
     start  duration class
19  33.011     0.363   CHI
22  37.144     0.257   CHI
28  41.050     0.167   CHI
29  42.356     0.142   CHI
30  42.992     0.353   CHI        start  duration class
127  203.359     1.488   CHI
136  220.264     1.228   CHI
137  223.351     5.385   CHI
141  234.272     1.019   CHI
147  239.007     0.140   CHI
The temp_df frame is as follows:
      start  duration class
0    33.011     0.363   CHI
1    37.144     0.257   CHI
2

48304             1.0

>>> A few of its contents:
                  CHI
millisecond_ints     
33011             1.0
33012             1.0
33013             1.0
33014             1.0
33015             1.0
**********CHI: i = 7**********
 >>> From [51.104] for [0.424] seconds until [51.528] the col [CHI] will contain the value [1]
                  CHI
millisecond_ints     
51104               1
51105               1
51106               1                   CHI
millisecond_ints     
51525               1
51526               1
51527               1

>>> Base DF size after 7 rounds: {len(label_base_df)}

>>> The head:
                  CHI
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  CHI
millisecond_ints     
51518             1.0
51519             1.0
51520         


>>> A few of its contents:
                  CHI
millisecond_ints     
33011             1.0
33012             1.0
33013             1.0
33014             1.0
33015             1.0
**********CHI: i = 22**********
 >>> From [234.27200000000002] for [1.0190000000000001] seconds until [235.29100000000003] the col [CHI] will contain the value [1]
                  CHI
millisecond_ints     
234272              1
234273              1
234274              1                   CHI
millisecond_ints     
235288              1
235289              1
235290              1

>>> Base DF size after 22 rounds: {len(label_base_df)}

>>> The head:
                  CHI
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  CHI
millisecond_ints     
235281            1.0
235282            1

113032               1                   KCHI
millisecond_ints      
113513               1
113514               1
113515               1

>>> Base DF size after 11 rounds: {len(label_base_df)}

>>> The head:
                  KCHI
millisecond_ints      
0                  NaN
1                  NaN
2                  NaN
3                  NaN
4                  NaN
5                  NaN
6                  NaN
7                  NaN
8                  NaN
9                  NaN
>>> The tail:
                  KCHI
millisecond_ints      
113506             1.0
113507             1.0
113508             1.0
113509             1.0
113510             1.0
113511             1.0
113512             1.0
113513             1.0
113514             1.0
113515             1.0

>>> A few of its contents:
                  KCHI
millisecond_ints      
6198               1.0
6199               1.0
6200               1.0
6201               1.0
6202               1.0
**********KCHI: i = 12**********
 >>

170514               1                   KCHI
millisecond_ints      
171489               1
171490               1
171491               1

>>> Base DF size after 20 rounds: {len(label_base_df)}

>>> The head:
                  KCHI
millisecond_ints      
0                  NaN
1                  NaN
2                  NaN
3                  NaN
4                  NaN
5                  NaN
6                  NaN
7                  NaN
8                  NaN
9                  NaN
>>> The tail:
                  KCHI
millisecond_ints      
171482             1.0
171483             1.0
171484             1.0
171485             1.0
171486             1.0
171487             1.0
171488             1.0
171489             1.0
171490             1.0
171491             1.0

>>> A few of its contents:
                  KCHI
millisecond_ints      
6198               1.0
6199               1.0
6200               1.0
6201               1.0
6202               1.0
**********KCHI: i = 21**********
 >>

46  235.567     3.714   FEM
 >>> From [0] for [21.991999999999997] seconds until [21.991999999999997] the col [FEM] will contain the value [nan]
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN                   FEM
millisecond_ints     
21989             NaN
21990             NaN
21991             NaN

>>> The last few rows of the label_base_df for label FEM are:
                  FEM
millisecond_ints     
21987             NaN
21988             NaN
21989             NaN
21990             NaN
21991             NaN
>>> Base DF size for label FEM: 47

**********FEM: i = 0**********
 >>> From [21.991999999999997] for [0.445] seconds until [22.436999999999998] the col [FEM] will contain the value [1]
                  FEM
millisecond_ints     
21992               1
21993               1
21994               1                   FEM
millisecond_ints     
22434               1
22435               1
22436               1

>>> Base DF

56643               1

>>> Base DF size after 7 rounds: {len(label_base_df)}

>>> The head:
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  FEM
millisecond_ints     
56634             1.0
56635             1.0
56636             1.0
56637             1.0
56638             1.0
56639             1.0
56640             1.0
56641             1.0
56642             1.0
56643             1.0

>>> A few of its contents:
                  FEM
millisecond_ints     
21992             1.0
21993             1.0
21994             1.0
21995             1.0
21996             1.0
**********FEM: i = 8**********
 >>> From [58.381] for [0.33899999999999997] seconds until [58.72] the col [FEM] will contain the value [1]
                  FEM
millisecond_ints     
5


>>> A few of its contents:
                  FEM
millisecond_ints     
21992             1.0
21993             1.0
21994             1.0
21995             1.0
21996             1.0
**********FEM: i = 23**********
 >>> From [113.493] for [1.364] seconds until [114.857] the col [FEM] will contain the value [1]
                  FEM
millisecond_ints     
113493              1
113494              1
113495              1                   FEM
millisecond_ints     
114854              1
114855              1
114856              1

>>> Base DF size after 23 rounds: {len(label_base_df)}

>>> The head:
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  FEM
millisecond_ints     
114847            1.0
114848            1.0
114849            1.0
114850    

141285              1                   FEM
millisecond_ints     
147352              1
147353              1
147354              1

>>> Base DF size after 31 rounds: {len(label_base_df)}

>>> The head:
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  FEM
millisecond_ints     
147345            1.0
147346            1.0
147347            1.0
147348            1.0
147349            1.0
147350            1.0
147351            1.0
147352            1.0
147353            1.0
147354            1.0

>>> A few of its contents:
                  FEM
millisecond_ints     
21992             1.0
21993             1.0
21994             1.0
21995             1.0
21996             1.0
**********FEM: i = 32**********
 >>> From [147.968] for [3.385] seconds u

186652            1.0

>>> A few of its contents:
                  FEM
millisecond_ints     
21992             1.0
21993             1.0
21994             1.0
21995             1.0
21996             1.0
**********FEM: i = 39**********
 >>> From [188.104] for [2.957] seconds until [191.061] the col [FEM] will contain the value [1]
                  FEM
millisecond_ints     
188104              1
188105              1
188106              1                   FEM
millisecond_ints     
191058              1
191059              1
191060              1

>>> Base DF size after 39 rounds: {len(label_base_df)}

>>> The head:
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  FEM
millisecond_ints     
191051            1.0
191052            1.0
191053    


>>> Base DF size after 46 rounds: {len(label_base_df)}

>>> The head:
                  FEM
millisecond_ints     
0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
>>> The tail:
                  FEM
millisecond_ints     
239271            1.0
239272            1.0
239273            1.0
239274            1.0
239275            1.0
239276            1.0
239277            1.0
239278            1.0
239279            1.0
239280            1.0

>>> A few of its contents:
                  FEM
millisecond_ints     
21992             1.0
21993             1.0
21994             1.0
21995             1.0
21996             1.0
Attempting update with DF from label FEM
                  base_col  MAL  CHI  KCHI  FEM
millisecond_ints                               
21992                  NaN  NaN  NaN   NaN  1.0
21993               

                  SPEECH
millisecond_ints        
123491                 1
123492                 1
123493                 1                   SPEECH
millisecond_ints        
124997                 1
124998                 1
124999                 1

>>> Base DF size after 11 rounds: {len(label_base_df)}

>>> The head:
                  SPEECH
millisecond_ints        
0                    NaN
1                    NaN
2                    NaN
3                    NaN
4                    NaN
5                    NaN
6                    NaN
7                    NaN
8                    NaN
9                    NaN
>>> The tail:
                  SPEECH
millisecond_ints        
124990               1.0
124991               1.0
124992               1.0
124993               1.0
124994               1.0
124995               1.0
124996               1.0
124997               1.0
124998               1.0
124999               1.0

>>> A few of its contents:
                  SPEECH
millisecond_

168673               1.0

>>> A few of its contents:
                  SPEECH
millisecond_ints        
6038                 1.0
6039                 1.0
6040                 1.0
6041                 1.0
6042                 1.0
**********SPEECH: i = 19**********
 >>> From [169.47099999999998] for [4.754] seconds until [174.22499999999997] the col [SPEECH] will contain the value [1]
                  SPEECH
millisecond_ints        
169471                 1
169472                 1
169473                 1                   SPEECH
millisecond_ints        
174222                 1
174223                 1
174224                 1

>>> Base DF size after 19 rounds: {len(label_base_df)}

>>> The head:
                  SPEECH
millisecond_ints        
0                    NaN
1                    NaN
2                    NaN
3                    NaN
4                    NaN
5                    NaN
6                    NaN
7                    NaN
8                    NaN
9                  

In [16]:
result.tail(20)

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
239650,,,,,,1.0
239651,,,,,,1.0
239652,,,,,,1.0
239653,,,,,,1.0
239654,,,,,,1.0
239655,,,,,,1.0
239656,,,,,,1.0
239657,,,,,,1.0
239658,,,,,,1.0
239659,,,,,,1.0


In [17]:
result.loc[6190:6400,]

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6190,,,,,,1.0
6191,,,,,,1.0
6192,,,,,,1.0
6193,,,,,,1.0
6194,,,,,,1.0
...,...,...,...,...,...,...
6396,,,,1.0,,1.0
6397,,,,1.0,,1.0
6398,,,,1.0,,1.0
6399,,,,1.0,,1.0


In [18]:
result.loc[9550:9600,]

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9550,,,,,,1.0
9551,,,,,,1.0
9552,,,,,,1.0
9553,,,,,,1.0
9554,,,,,,1.0
9555,,,,,,1.0
9556,,,,,,1.0
9557,,,,,,1.0
9558,,,,,,1.0
9559,,,,,,1.0


In [19]:
result.loc[21980:22000,]

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21980,,,,,,1.0
21981,,,,,,1.0
21982,,,,,,1.0
21983,,,,,,1.0
21984,,,,,,1.0
21985,,,,,,1.0
21986,,,,,,1.0
21987,,,,,,1.0
21988,,,,,,1.0
21989,,,,,,1.0


In [20]:
result.loc[33005:33100,]

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
33005,,,,,1.0,1.0
33006,,,,,1.0,1.0
33007,,,,,1.0,1.0
33008,,,,,1.0,1.0
33009,,,,,1.0,1.0
...,...,...,...,...,...,...
33096,,,1.0,,1.0,1.0
33097,,,1.0,,1.0,1.0
33098,,,1.0,,1.0,1.0
33099,,,1.0,,1.0,1.0


In [25]:
result['overlap'] = np.where(result[['MAL', 'CHI', 'KCHI', 'FEM']].sum(axis = 1) > 1, 1, 0)
# result['concurrent'] = result.sum(result[['MAL', 'CHI', 'KCHI', 'FEM']].sum(axis = 1) > 1 else 'False')

In [22]:
result

Unnamed: 0_level_0,base_col,MAL,CHI,KCHI,FEM,SPEECH,overlap
millisecond_ints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,,,,,,,0
1,,,,,,,0
2,,,,,,,0
3,,,,,,,0
4,,,,,,,0
...,...,...,...,...,...,...,...
239665,,,,,,1.0,0
239666,,,,,,1.0,0
239667,,,,,,1.0,0
239668,,,,,,1.0,0


In [24]:
result.describe()

Unnamed: 0,base_col,MAL,CHI,KCHI,FEM,SPEECH,overlap
count,0.0,20084.0,19748.0,17601.0,168531.0,217135.0,239670.0
mean,,1.0,1.0,1.0,1.0,1.0,0.126274
std,,0.0,0.0,0.0,0.0,0.0,0.332158
min,,1.0,1.0,1.0,1.0,1.0,0.0
25%,,1.0,1.0,1.0,1.0,1.0,0.0
50%,,1.0,1.0,1.0,1.0,1.0,0.0
75%,,1.0,1.0,1.0,1.0,1.0,0.0
max,,1.0,1.0,1.0,1.0,1.0,1.0
