In [1]:
import pandas as pd
import os
from tqdm import tqdm

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None


input_file_directory = "./data/input_data/"
preprocessed_file_directory = "./data/preprocessed_data/"
output_file_directory = "./data/result_data/"
tss_data_directory = "./data/tss_data/"
processed_file_directory = "./data/processed_data/"


In [2]:
speaker_cols = []

def prep(df):
    filtered_columns = df[speaker_cols]
    df['speech_pattern'] = filtered_columns.astype(str).apply(lambda x: ','.join(x), axis=1)
    
    return df
    
    
def identify_tss(df):
    
    queue = []
    df['tss'] = 0
    df['queue'] = ''
    
    previous_speaker_set = set()
    previous_speaker_pattern = ''
    
    for index, row in df.iterrows():
        df.at[index, 'queue'] = str(queue)
        
            
        if previous_speaker_pattern and row['speech_pattern'] != previous_speaker_pattern :

            new_speaker_set = set()
            for speaker in speaker_cols:
                if row[speaker]:
                    new_speaker_set.add(speaker)
                    
            difference_set = new_speaker_set - previous_speaker_set 
            if len(difference_set):
                queue += list(difference_set)

                while len(queue)>3:
                    queue.pop(0)

                if len(set(queue)) == 3:
                    df.at[index, 'tss'] = 1

            previous_speaker_set = new_speaker_set
            
        previous_speaker_pattern = row['speech_pattern']
        
    return df

def identify_tss_sp(df, MEDIAN_SP_THRESHOLD):
    
    queue = []
    
    df[f'tss_sp_{MEDIAN_SP_THRESHOLD}'] = 0
    df[f'queue_sp_{MEDIAN_SP_THRESHOLD}'] = ''
    
    previous_speaker_set = set()
    previous_speaker_pattern = ''
    
    pause_frame_count = 0
    
    for index, row in df.iterrows():
        df.at[index, f'queue_sp_{MEDIAN_SP_THRESHOLD}'] = str(queue)
        
        if row['Active Speaker Count'] == 0:
            pause_frame_count += 1
            
        if previous_speaker_pattern and row['speech_pattern'] != previous_speaker_pattern:
            
            if MEDIAN_SP_THRESHOLD is not None and pause_frame_count > MEDIAN_SP_THRESHOLD:
                queue = []
                pause_frame_count = 0

            new_speaker_set = set()
            for speaker in speaker_cols:
                if row[speaker]:
                    new_speaker_set.add(speaker)
                    
            difference_set = new_speaker_set - previous_speaker_set 
            if len(difference_set):
                queue += list(difference_set)

                while len(queue)>3:
                    queue.pop(0)

                if len(set(queue)) == 3:
                    df.at[index, f'tss_sp_{MEDIAN_SP_THRESHOLD}'] = 1

            previous_speaker_set = new_speaker_set
            
        previous_speaker_pattern = row['speech_pattern']
        
        if row['Active Speaker Count'] != 0:
            pause_frame_count = 0
            
    return df
    

In [3]:
def identify_tss_disjoint(df):
    
    queue = []
    df['tss_disjoint'] = 0
    df['queue_disjoint'] = ''

    previous_speaker_set = set()
    previous_speaker_pattern = ''
    
    for index, row in df.iterrows():
        df.at[index, 'queue_disjoint'] = str(queue)
        
            
        if previous_speaker_pattern and row['speech_pattern'] != previous_speaker_pattern :

            new_speaker_set = set()
            for speaker in speaker_cols:
                if row[speaker]:
                    new_speaker_set.add(speaker)
                    
            difference_set = new_speaker_set - previous_speaker_set 
            if len(difference_set):
                queue += list(difference_set)

                while len(queue)>3:
                    queue.pop(0)

                if len(set(queue)) == 3:
                    df.at[index, 'tss_disjoint'] = 1
                    queue = []

            previous_speaker_set = new_speaker_set
            
        previous_speaker_pattern = row['speech_pattern']
        
            
            
    return df


def identify_tss_disjoint_sp(df, MEDIAN_SP_THRESHOLD):
    
    queue = []
    df[f'tss_disjoint_sp_{MEDIAN_SP_THRESHOLD}'] = 0
    df[f'queue_disjoint_sp_{MEDIAN_SP_THRESHOLD}'] = ''

    previous_speaker_set = set()
    previous_speaker_pattern = ''
    
    pause_frame_count = 0
    
    for index, row in df.iterrows():
        df.at[index, f'queue_disjoint_sp_{MEDIAN_SP_THRESHOLD}'] = str(queue)
        
        if row['Active Speaker Count'] == 0:
            pause_frame_count += 1
            
            
        if previous_speaker_pattern and row['speech_pattern'] != previous_speaker_pattern :
            
            if MEDIAN_SP_THRESHOLD is not None and pause_frame_count > MEDIAN_SP_THRESHOLD:
                queue = []
                pause_frame_count = 0

            new_speaker_set = set()
            for speaker in speaker_cols:
                if row[speaker]:
                    new_speaker_set.add(speaker)
                    
            difference_set = new_speaker_set - previous_speaker_set 
            if len(difference_set):
                queue += list(difference_set)

                while len(queue)>3:
                    queue.pop(0)

                if len(set(queue)) == 3:
                    df.at[index, f'tss_disjoint_sp_{MEDIAN_SP_THRESHOLD}'] = 1
                    queue = []

            previous_speaker_set = new_speaker_set
            
        previous_speaker_pattern = row['speech_pattern']
        
            
            
    return df
    
    

In [4]:
tss_list = []

MEDIAN_SP_THRESHOLDS = [2, 32, 34.9, 78.5]
MEDIAN_SP_THRESHOLDS.sort(reverse=True)

for filename in tqdm(os.listdir(preprocessed_file_directory)):
    if '.csv' not in filename:
        continue
    file_path = preprocessed_file_directory +'/'+ filename
    df = pd.read_csv(file_path)
    
    speaker_cols = []
    for col in df.columns:
        if 'Sub Id' in col:
            speaker_cols.append(col)
    
    df = prep(df)
    df = identify_tss(df)
    df = identify_tss_disjoint(df)
    
    for MEDIAN_SP_THRESHOLD in MEDIAN_SP_THRESHOLDS:
        df = identify_tss_sp(df, MEDIAN_SP_THRESHOLD)
        df = identify_tss_disjoint_sp(df, MEDIAN_SP_THRESHOLD)
    
    df.to_csv(f'{tss_data_directory}/{filename}', index=False)
    group, speakers = filename[:-4].split('_', 1)
    
    data_dict = {'Group':group,'Speakers':speakers,'TSS Count':df['tss'].sum(), 'Disjoint TSS Count':df['tss_disjoint'].sum()}
    for MEDIAN_SP_THRESHOLD in MEDIAN_SP_THRESHOLDS:
        data_dict[f'TSS Count (<={MEDIAN_SP_THRESHOLD} Frame Pauses)'] = df[f'tss_sp_{MEDIAN_SP_THRESHOLD}'].sum()
        data_dict[f'Disjoint TSS Count (<={MEDIAN_SP_THRESHOLD} Frame Pauses)'] = df[f'tss_disjoint_sp_{MEDIAN_SP_THRESHOLD}'].sum()
        
    tss_list.append(data_dict)
    
    
tss_df = pd.DataFrame(tss_list)

tss_df.to_csv(f'{output_file_directory}/tss_results_sp.csv',index=False)

100%|██████████| 129/129 [1:29:39<00:00, 41.70s/it]


In [5]:
tss_df

Unnamed: 0,Group,Speakers,TSS Count,Disjoint TSS Count,TSS Count (<=78.5 Frame Pauses),Disjoint TSS Count (<=78.5 Frame Pauses),TSS Count (<=34.9 Frame Pauses),Disjoint TSS Count (<=34.9 Frame Pauses),TSS Count (<=32 Frame Pauses),Disjoint TSS Count (<=32 Frame Pauses),TSS Count (<=2 Frame Pauses),Disjoint TSS Count (<=2 Frame Pauses)
0,100,A2627_B2621_C2625,162,81,141,71,122,65,121,65,78,49
1,101,A2640_B2644_C2636,186,104,178,95,165,91,164,89,118,76
2,102,A2639_B2645_C2662,169,91,157,75,138,71,136,69,99,59
3,103,A2651_B2631_C2657,120,66,111,58,99,54,99,53,67,47
4,104,A2656_B2650_C2653,88,53,85,47,66,43,66,42,31,26
5,105,A2675_B2674_C2663,75,47,65,35,50,32,47,32,23,17
6,106,A2673_B2664_C2654,179,94,169,82,152,75,152,77,86,53
7,107,A2677_B2678_C2649,72,42,71,38,67,39,67,39,48,33
8,108,A2679_B2660_C2676,164,86,156,77,140,75,140,75,110,67
9,109,A2682_B2647_C2684,113,58,98,49,77,45,76,45,51,36
