In [1]:
import pandas as pd
import os
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Render DataFrames without truncation: None removes the row / column limit.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Disable pandas' chained-assignment (SettingWithCopy) warning notebook-wide.
pd.options.mode.chained_assignment = None


# Data directories, given relative to the working directory; each ends with '/'.
# Of these, only `preprocessed_file_directory` is read by the code visible in
# this notebook section (the processing loop); `tss_data_directory` appears
# only in its commented-out export lines.
input_file_directory = "./data/input_data/"
preprocessed_file_directory = "./data/preprocessed_data/"
output_file_directory = "./data/result_data/"
tss_data_directory = "./data/custom_binned_tss_data/"
processed_file_directory = "./data/processed_data/"


In [2]:
def prep(df, speaker_columns=None):
    """Add a ``speech_pattern`` column summarising each row's speaker columns.

    The values of the speaker columns are converted to strings and joined
    with commas, giving one string per row (e.g. ``"1,0,0"``).  ``df`` is
    modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the speaker columns.
    speaker_columns : list of str, optional
        Columns to combine, in order.  When omitted, the notebook-level
        ``speaker_cols`` list is used, so existing ``prep(df)`` calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``speech_pattern`` added.
    """
    if speaker_columns is None:
        speaker_columns = speaker_cols

    speaker_values = df[list(speaker_columns)].astype(str)
    # Joining over the underlying array avoids a per-row ``apply`` and also
    # yields an empty column (instead of an empty frame) when ``df`` has no rows.
    df['speech_pattern'] = [','.join(values) for values in speaker_values.to_numpy()]

    return df

In [3]:
def identify_tss_disjoint(df, speaker_columns=None):
    """Flag rows that complete a disjoint triadic speech sequence (TSS).

    Rows are scanned in order.  Whenever a row's ``speech_pattern`` differs
    from the previous row's, the speakers that are active on this row (truthy
    value in their column) but were not active at the previous pattern change
    are appended to a queue that keeps only its three most recent entries.
    When those three entries are three different speakers, the row gets
    ``tss_disjoint = 1`` and the queue is emptied.

    Two columns are written to ``df`` in place:

    * ``tss_disjoint`` -- 1 on rows that complete a sequence, else 0.
    * ``queue_disjoint`` -- ``str()`` of the queue as it stood *before* the
      row was processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``speech_pattern`` and the speaker columns.
    speaker_columns : list of str, optional
        Speaker columns to inspect.  Defaults to the notebook-level
        ``speaker_cols`` so existing ``identify_tss_disjoint(df)`` calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    if speaker_columns is None:
        speaker_columns = speaker_cols

    df['tss_disjoint'] = 0
    df['queue_disjoint'] = ''

    queue = []
    # NOTE(review): the first row is never compared against anything and this
    # set starts empty, so a speaker who is active only on the first row is
    # never enqueued -- confirm that is intended.
    previous_speaker_set = set()
    previous_speaker_pattern = ''

    for index, row in df.iterrows():
        df.at[index, 'queue_disjoint'] = str(queue)

        current_pattern = row['speech_pattern']
        if previous_speaker_pattern and current_pattern != previous_speaker_pattern:
            active_speakers = [speaker for speaker in speaker_columns if row[speaker]]

            # Enqueue in speaker-column order.  Iterating a set difference
            # here would make the order depend on string hashing, which
            # changes between interpreter runs and can alter the results.
            new_speakers = [
                speaker for speaker in active_speakers
                if speaker not in previous_speaker_set
            ]
            if new_speakers:
                # Keep only the three most recent entries.
                queue = (queue + new_speakers)[-3:]

                if len(set(queue)) == 3:
                    df.at[index, 'tss_disjoint'] = 1
                    queue = []

            previous_speaker_set = set(active_speakers)

        previous_speaker_pattern = current_pattern

    return df

def identify_tss_disjoint_sp(df, MEDIAN_SP_THRESHOLD, speaker_columns=None):
    """Flag disjoint triadic speech sequences, resetting after long pauses.

    Works like ``identify_tss_disjoint``: at every change of
    ``speech_pattern`` the newly active speakers are appended to a queue of
    at most three entries, and a row is flagged when the queue holds three
    different speakers.  In addition, rows whose ``Active Speaker Count`` is
    0 are counted; if that count exceeds ``MEDIAN_SP_THRESHOLD`` when a
    pattern change is reached, the queue and the count are cleared before the
    new speakers are added.

    Two columns, suffixed with the threshold value, are written to ``df`` in
    place:

    * ``tss_disjoint_sp_<threshold>`` -- 1 on rows that complete a sequence.
    * ``queue_disjoint_sp_<threshold>`` -- ``str()`` of the queue as it stood
      *before* the row was processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``speech_pattern``, ``Active Speaker Count`` and the
        speaker columns.
    MEDIAN_SP_THRESHOLD : number or None
        Pause-row count above which the queue is reset; ``None`` disables
        the reset.
    speaker_columns : list of str, optional
        Speaker columns to inspect.  Defaults to the notebook-level
        ``speaker_cols`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    if speaker_columns is None:
        speaker_columns = speaker_cols

    flag_column = f'tss_disjoint_sp_{MEDIAN_SP_THRESHOLD}'
    queue_column = f'queue_disjoint_sp_{MEDIAN_SP_THRESHOLD}'
    df[flag_column] = 0
    df[queue_column] = ''

    queue = []
    previous_speaker_set = set()
    previous_speaker_pattern = ''

    # NOTE(review): this counter is only cleared when it exceeds the
    # threshold.  It therefore adds up separate pauses and carries over after
    # a sequence has been flagged -- confirm that a cumulative count, rather
    # than the length of the latest pause, is what is wanted.
    pause_frame_count = 0

    for index, row in df.iterrows():
        df.at[index, queue_column] = str(queue)

        if row['Active Speaker Count'] == 0:
            pause_frame_count += 1

        current_pattern = row['speech_pattern']
        if previous_speaker_pattern and current_pattern != previous_speaker_pattern:

            if MEDIAN_SP_THRESHOLD is not None and pause_frame_count > MEDIAN_SP_THRESHOLD:
                queue = []
                pause_frame_count = 0

            active_speakers = [speaker for speaker in speaker_columns if row[speaker]]

            # Enqueue in speaker-column order.  Iterating a set difference
            # here would make the order depend on string hashing, which
            # changes between interpreter runs and can alter the results.
            new_speakers = [
                speaker for speaker in active_speakers
                if speaker not in previous_speaker_set
            ]
            if new_speakers:
                # Keep only the three most recent entries.
                queue = (queue + new_speakers)[-3:]

                if len(set(queue)) == 3:
                    df.at[index, flag_column] = 1
                    queue = []

            previous_speaker_set = set(active_speakers)

        previous_speaker_pattern = current_pattern

    return df
    
    

In [4]:
def binning(df, bin_size=1800):
    """Sum every column of ``df`` over consecutive bins of ``bin_size`` rows.

    The last bin may be shorter than ``bin_size``.  Each bin is labelled with
    its 1-based row range (e.g. ``"1-1800"``) under ``'Frame'``; that label
    replaces any summed ``Frame`` value, is used as a temporary index and is
    then dropped, so the result never has a ``Frame`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summed per bin.
    bin_size : int, optional
        Rows per bin (default 1800).

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed 0..n-1, with the columns of ``df`` except
        ``Frame``.  An empty ``df`` gives an empty result.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to sum; pd.concat would raise on an empty list of bins.
        return df.drop(columns='Frame', errors='ignore').reset_index(drop=True)

    bin_sums = []
    for start_index in range(0, total_rows, bin_size):
        end_index = start_index + bin_size
        # Cast to object before storing the text label: writing a string into
        # a numeric Series relies on implicit upcasting, which pandas has
        # deprecated.
        bin_sum = df.iloc[start_index:end_index].sum().astype(object)
        bin_sum['Frame'] = f"{start_index + 1}-{min(end_index, total_rows)}"
        bin_sums.append(bin_sum)

    # Each Series becomes one row of the result.
    result_df = pd.concat(bin_sums, axis=1).T

    return result_df.set_index('Frame').reset_index(drop=True)


def binning_plus(df, first_bin_size=5400, bin_size=1800):
    """Sum every column of ``df`` over one large leading bin, then regular bins.

    The first bin covers rows ``0 .. first_bin_size - 1``; the remaining rows
    are split into consecutive bins of ``bin_size`` rows (the last one may be
    shorter).  Bin boundaries are tracked with a running offset so the
    regular bins start where the leading bin ends.

    Each bin is labelled with its 1-based row range under ``'Frame'``; that
    label replaces any summed ``Frame`` value, is used as a temporary index
    and is then dropped, so the result never has a ``Frame`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summed per bin.
    first_bin_size : int, optional
        Rows in the leading bin (default 5400).
    bin_size : int, optional
        Rows in every following bin (default 1800).

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed 0..n-1, with the columns of ``df`` except
        ``Frame``.  The leading bin is always present, even when ``df`` has
        fewer than ``first_bin_size`` rows.
    """
    total_rows = len(df)
    remaining_rows = max(total_rows - first_bin_size, 0)
    trailing_bins = -(-remaining_rows // bin_size)  # ceiling division
    bin_sizes = [first_bin_size] + [bin_size] * trailing_bins

    bin_sums = []
    start_index = 0
    for size in bin_sizes:
        end_index = start_index + size
        # Cast to object before storing the text label: writing a string into
        # a numeric Series relies on implicit upcasting, which pandas has
        # deprecated.
        bin_sum = df.iloc[start_index:end_index].sum().astype(object)
        bin_sum['Frame'] = f"{start_index + 1}-{min(end_index, total_rows)}"
        bin_sums.append(bin_sum)
        start_index = end_index

    # Each Series becomes one row of the result.
    result_df = pd.concat(bin_sums, axis=1).T

    return result_df.set_index('Frame').reset_index(drop=True)



def save_dataframe_as_bar_graph(df, save_filepath, title, ylabel, xlabel="Minute"):
    """Save ``df`` as a grouped bar chart, one bar group per ``Minute`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Minute`` column, used as the x-axis categories;
        every other column is drawn as one bar series.  The frame is not
        modified.
    save_filepath : str
        Destination passed to ``Figure.savefig``.
    title : str
        Figure title.
    ylabel : str
        Label of the y axis.
    xlabel : str, optional
        Label of the x axis (default ``"Minute"``).

    Notes
    -----
    Sets the seaborn colour palette as a global side effect.
    """
    # Custom colour palette for the bars.
    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
    sns.set_palette(colors)

    bar_width = 0.8

    # Plot a re-indexed copy: an in-place set_index would strip 'Minute' from
    # the caller's frame and make a second call with the same frame fail.
    plot_df = df.set_index('Minute')

    ax = plot_df.plot(kind='bar', figsize=(10, 6), width=bar_width)
    fig = ax.get_figure()
    try:
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)

        ax.legend(title='Columns', labels=plot_df.columns)

        # Drop the top and right spines for a cleaner look.
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        fig.savefig(save_filepath)
    finally:
        # Always release the figure, even if saving fails, so figures do not
        # pile up when this is called once per input file.
        plt.close(fig)


    

In [7]:
tss_list = []
speaker_cols = []

MEDIAN_SP_THRESHOLDS = [2, 32, 34.9, 78.5]
MEDIAN_SP_THRESHOLDS.sort(reverse=True)

# Flag columns written by the identify_* functions and the labels they get
# before binning.  Both are derived from MEDIAN_SP_THRESHOLDS so the lists
# cannot drift apart when a threshold is added or changed.
tss_flag_columns = ['tss_disjoint'] + [
    f'tss_disjoint_sp_{threshold}' for threshold in MEDIAN_SP_THRESHOLDS
]
tss_count_labels = ['Disjoint TSS Count'] + [
    f'Disjoint TSS Count (Pause <= {threshold})' for threshold in MEDIAN_SP_THRESHOLDS
]

# Sorted so the processing order does not depend on the file system.
for filename in tqdm(sorted(os.listdir(preprocessed_file_directory))):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(preprocessed_file_directory, filename)
    df = pd.read_csv(file_path)

    # prep() and the identify_* functions read this notebook-level list.
    speaker_cols = [col for col in df.columns if 'Sub Id' in col]

    df = prep(df)
    df = identify_tss_disjoint(df)

    for MEDIAN_SP_THRESHOLD in MEDIAN_SP_THRESHOLDS:
        df = identify_tss_disjoint_sp(df, MEDIAN_SP_THRESHOLD)

    df = df[['Frame'] + tss_flag_columns].copy()
    df.columns = ['Frame'] + tss_count_labels
    df = binning_plus(df)

    # The first bin spans 5400 rows and is labelled "0-3"; the following
    # 1800-row bins are labelled "4", "5", ...
    df.insert(0, 'Minute', ["0-3" if i == 3 else str(i) for i in range(3, len(df) + 3)])

    group, speakers = filename[:-4].split('_', 1)
#     df.to_csv(f"{tss_data_directory}/binned_tss_{filename}", index=False)
#     save_dataframe_as_bar_graph(df, f"{tss_data_directory}/binned_tss{filename[:-4]}_graph", f"Group: {group} ({speakers})", "Triadic Speech Seq. Count")
#     display(df.head(20))

    # NOTE(review): stops after the first CSV.  With the export lines above
    # commented out, this cell only leaves that file's `df` for inspection;
    # remove the break (and restore the exports) to process every file.
    break
    
    

  0%|          | 0/129 [00:17<?, ?it/s]


ValueError: Length of values (31) does not match length of index (33)

In [None]:
# Preview the first rows of `df` as left by the processing loop above.
df.head()

In [None]:
# Preview the last rows of `df` as left by the processing loop above.
df.tail()