In [1]:
import pandas as pd
import os
from tqdm import tqdm

# Render frames in full: no limit on the number of rows or columns displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Disable pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)


# Data folders, relative to the notebook's working directory.
input_file_directory = "./data/input_data/"                # read by the (disabled) preprocessing cell
preprocessed_file_directory = "./data/preprocessed_data/"  # per-file CSVs read by the processing loop
output_file_directory = "./data/result_data/"              # summary statistics CSVs
processed_file_directory = "./data/processed_data/"        # per-file CSVs with the added pause columns


## Data Preprocessing
- Keep only the relevant columns (`Sub_ID`, `Frame`, `Speak`)
- Trim every file to a common length (the first 188,811 rows)

In [2]:
# desired_columns = ['Sub_ID', 'Frame', 'Speak']
# alternate_columns = ['Unnamed: 0','1','44']

# length_data = []

# def pre_process():
#     for filename in tqdm(os.listdir(input_file_directory)):
#         if '.csv' not in filename:
#             continue

#         file_path = input_file_directory +'/'+ filename
#         df = pd.read_csv(file_path)
        
#         if all(col in df.columns for col in desired_columns):
#             df = df[desired_columns]
#             print(f'File has different column names: ' + filename)
#         elif all(col in df.columns for col in alternate_columns):
#             df = df[alternate_columns]
#             df.columns = ['Sub_ID','Frame','Speak']
#         else:
#             print('CHECK THIS DF OUT')
#             break
            
#         # But do we want to do this?
# #         df = df.drop_duplicates(subset = ['Sub_ID','Frame']).reset_index(drop=True)
        
#         length_info_dict = {'Filename' : filename, 'Original Length':len(df)}
#         df = df[:188811]
#         length_info_dict['Final Length'] = len(df)
#         length_data.append(length_info_dict)
#         if len(df)<188811:
#             print(f'Skipping file (length issue): {filename}')
#             continue
        
        
        
#         # Convert all columns to integers
#         try:
#             df = df.applymap(int)
#         except:
#             print(f'Skipping file (int issue): {filename}')
# #             print(df.shape)
# #             display(df.head())
# #             display(df.tail())
#             continue
        
#         sub_ids = list(df['Sub_ID'].unique())
    
#         # Grouping by frame
#         try:
#             grouped_df = df.pivot(index='Frame', columns='Sub_ID', values='Speak')
#             grouped_df = grouped_df.reset_index()

#             # Renaming columns
#             grouped_df = grouped_df.rename(columns=lambda x: 'Sub Id: ' + str(x) if x in sub_ids else x)

#             # Adding a column that represents the number of speakers speaking in this frame.
#             grouped_df['Active Speaker Count'] = grouped_df.iloc[:, 1:].sum(axis=1)
            
#             grouped_df.to_csv(f'{preprocessed_file_directory}/{filename}',index=False)
#         except Exception as e:
#             print(f"An error occurred for file: {filename}:", e)

# pre_process()
# length_data_df = pd.DataFrame(length_data).sort_values(by=['Original Length']).reset_index(drop=True)
# display(length_data_df.head())

### Logic Driving Helper Functions

In [3]:
def prepare(df: pd.DataFrame):
    """Add a ``speech_pattern`` column describing who is speaking in each row.

    Every column whose name contains ``'Sub Id:'`` is converted to text and
    the values of a row are joined with commas (for example ``'0,1,0'``).

    The frame is modified in place and also returned.
    """
    speaker_flags = df.filter(like='Sub Id:').astype(str)
    df['speech_pattern'] = speaker_flags.apply(','.join, axis=1)
    return df


def identify_pauses(df: pd.DataFrame, frame_rate: float = 30):
    """Record the length of each silence on the row where speech resumes.

    A silence is a run of rows whose ``'Active Speaker Count'`` is not greater
    than 0. When a row with at least one speaker follows such a run, the run
    length is written to that row. Silence that precedes the first speaking
    row is not recorded.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain an ``'Active Speaker Count'`` column. Rows whose index
        label is not greater than 0 are skipped, so the first row of a
        default ``0..n-1`` index never takes part.
    frame_rate : float, default 30
        Divisor used to convert a frame count into seconds. The default keeps
        the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        The same object, modified in place, with two added columns:
        ``'pause_frames'`` and ``'pause_seconds'`` (rounded to 3 decimals).
        Rows without a recorded pause hold the empty string ``''``.
    """
    df['pause_frames'] = ''
    df['pause_seconds'] = ''

    speech_seen = False
    silent_run = 0

    # Only one column is needed, so iterate over a snapshot of it instead of
    # building a full row Series per frame with iterrows().
    speaker_counts = df['Active Speaker Count'].tolist()

    for index, speaker_count in zip(df.index, speaker_counts):
        if not index > 0:
            continue

        if speaker_count > 0:
            if speech_seen and silent_run > 0:
                df.at[index, 'pause_frames'] = silent_run
                df.at[index, 'pause_seconds'] = round(silent_run / frame_rate, 3)
            else:
                speech_seen = True

            silent_run = 0
        else:
            silent_run += 1

    return df


def identify_sp_ignore_multi_speakers(df: pd.DataFrame, frame_rate: float = 30):
    """Record the silence before each speaker change, looking only at single-speaker rows.

    Only rows whose ``'Active Speaker Count'`` equals 1 can start or end a
    switch. When such a row has a ``'speech_pattern'`` different from the
    previous single-speaker row, the number of silent rows (count equal to 0)
    seen since that previous row is written to it; this can be 0. Rows with
    more than one speaker are skipped entirely: they neither reset the silence
    counter nor update the remembered pattern.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``'Active Speaker Count'`` and ``'speech_pattern'``
        columns. Rows whose index label is not greater than 0 are skipped.
    frame_rate : float, default 30
        Divisor used to convert a frame count into seconds. The default keeps
        the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        The same object, modified in place, with two added columns:
        ``'sp_no_zero_frames'`` and ``'sp_no_zero_seconds'`` (rounded to
        3 decimals). Rows without a recorded switch hold ``''``.
    """
    df['sp_no_zero_frames'] = ''
    df['sp_no_zero_seconds'] = ''

    silent_run = 0
    last_pattern = None

    # Snapshot the two columns that are read; this avoids the per-row Series
    # that iterrows() would build for every frame.
    speaker_counts = df['Active Speaker Count'].tolist()
    patterns = df['speech_pattern'].tolist()

    for index, speaker_count, pattern in zip(df.index, speaker_counts, patterns):
        if not index > 0:
            continue

        if speaker_count == 1:
            if last_pattern and pattern != last_pattern:
                df.at[index, 'sp_no_zero_frames'] = silent_run
                df.at[index, 'sp_no_zero_seconds'] = round(silent_run / frame_rate, 3)

            last_pattern = pattern
            silent_run = 0

        elif speaker_count == 0:
            silent_run += 1

    return df



def identify_sp_zero_multi_speakers(df: pd.DataFrame, frame_rate: float = 30):
    """Record the silence before each change of speech pattern, including multi-speaker rows.

    Every row whose ``'Active Speaker Count'`` is greater than 0 is compared
    with the previous such row. When the ``'speech_pattern'`` differs, the
    number of silent rows (count equal to 0) seen in between is written to the
    row; this is 0 when the pattern changes without any silence.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``'Active Speaker Count'`` and ``'speech_pattern'``
        columns. Rows whose index label is not greater than 0 are skipped.
    frame_rate : float, default 30
        Divisor used to convert a frame count into seconds. The default keeps
        the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        The same object, modified in place, with two added columns:
        ``'sp_zero_frames'`` and ``'sp_zero_seconds'`` (rounded to
        3 decimals). Rows without a recorded change hold ``''``.
    """
    df['sp_zero_frames'] = ''
    df['sp_zero_seconds'] = ''

    silent_run = 0
    last_pattern = None

    # Snapshot the two columns that are read; this avoids the per-row Series
    # that iterrows() would build for every frame.
    speaker_counts = df['Active Speaker Count'].tolist()
    patterns = df['speech_pattern'].tolist()

    for index, speaker_count, pattern in zip(df.index, speaker_counts, patterns):
        if not index > 0:
            continue

        if speaker_count > 0:
            if last_pattern and pattern != last_pattern:
                df.at[index, 'sp_zero_frames'] = silent_run
                df.at[index, 'sp_zero_seconds'] = round(silent_run / frame_rate, 3)

            last_pattern = pattern
            silent_run = 0

        elif speaker_count == 0:
            silent_run += 1

    return df


In [4]:

# Notebook-level accumulators. Only `all_df_list` is filled by this cell; the
# other two are kept because they are existing notebook globals.
cum_stats_secs = []
cum_stats_frames = []
all_df_list = []

# Columns carried over into the per-file summary, and the subset of them that
# holds whole frame counts.
concise_columns = ['Frame', 'pause_frames', 'pause_seconds', 'sp_no_zero_frames', 'sp_no_zero_seconds', 'sp_zero_frames', 'sp_zero_seconds']
frame_count_columns = ['pause_frames', 'sp_no_zero_frames', 'sp_zero_frames']

# Create the output folder on first run so to_csv below cannot fail on it.
os.makedirs(processed_file_directory, exist_ok=True)

# sorted(): os.listdir returns names in arbitrary order; sorting makes the row
# order of the combined frame reproducible between runs.
for filename in tqdm(sorted(os.listdir(preprocessed_file_directory))):
    # We only want to process csv files. endswith() (rather than a substring
    # test) guarantees that filename[:-4] below strips exactly the extension.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(preprocessed_file_directory, filename)
    df = pd.read_csv(file_path)

    # Each helper adds its columns to the frame in place and returns it.
    res_df = prepare(df)
    res_df = identify_pauses(res_df)
    res_df = identify_sp_ignore_multi_speakers(res_df)
    res_df = identify_sp_zero_multi_speakers(res_df)

    # .copy() so the edits below act on an independent frame, not on a slice
    # of res_df.
    concise_df = res_df[concise_columns].copy()

    # Rows without an event hold '' in the event columns; coerce those to NaN.
    for col in concise_columns:
        concise_df[col] = pd.to_numeric(concise_df[col], errors='coerce')

    # Keep only the rows on which at least one kind of event was recorded.
    has_event = (concise_df[frame_count_columns] >= 0).any(axis=1)
    concise_df = concise_df[has_event].reset_index(drop=True)

    # Store frame counts as nullable integers. The original used `==` here,
    # which only compared the columns and discarded the result.
    for col in frame_count_columns:
        concise_df[col] = concise_df[col].astype('Int64')

    # Split the filename on the first underscore. partition() leaves
    # `speakers` empty instead of raising when a name has no underscore.
    group, _, speakers = filename[:-4].partition('_')
    concise_df.insert(0, 'Group', group)
    concise_df.insert(1, 'Speakers', speakers)
    all_df_list.append(concise_df)

    res_df.to_csv(os.path.join(processed_file_directory, filename), index=False)
    

#     display(res_df.head(5))
    
#     print('*'*80)
#     print('*'*80)
    
    
    

100%|██████████| 129/129 [16:40<00:00,  7.76s/it]


In [5]:
# Stack the per-file summaries into one frame. ignore_index=True gives the
# result a single continuous 0..n-1 index; without it every file's own
# 0..k labels are repeated, so index labels in the combined frame are not unique.
combined_df = pd.concat(all_df_list, ignore_index=True)
display(combined_df.head())

Unnamed: 0,Group,Speakers,Frame,pause_frames,pause_seconds,sp_no_zero_frames,sp_no_zero_seconds,sp_zero_frames,sp_zero_seconds
0,100,A2627_B2621_C2625,227,,,,,0.0,0.0
1,100,A2627_B2621_C2625,269,,,0.0,0.0,0.0,0.0
2,100,A2627_B2621_C2625,285,,,,,0.0,0.0
3,100,A2627_B2621_C2625,316,,,,,0.0,0.0
4,100,A2627_B2621_C2625,483,103.0,3.433,103.0,3.433,103.0,3.433


In [6]:
# # Get frequency of Speaker#

# # Grouped frequencies
# display(combined_df.groupby('Triad_Id')['active_speaker_count'].value_counts().reset_index(name='Frequency'))

# # Cumulative freqencies
# display(combined_df['active_speaker_count'].value_counts().reset_index(name='Frequency'))

In [7]:
def stats_plus(df, group_by_columns, stats_column):
    """Summarise one column per group and append an overall summary row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``group_by_columns`` and ``stats_column``.
        ``stats_column`` is coerced to numeric in place (values that cannot be
        parsed become NaN), so the caller's frame is modified.
    group_by_columns : list of str
        Columns to group by.
    stats_column : str
        Column to summarise.

    Returns
    -------
    pd.DataFrame
        One row per group with the columns ``median``, ``count``, ``mean``,
        ``std``, ``min`` and ``max``, followed by a final row computed over
        the whole column, whose group columns are all set to ``'Overall'``.
    """
    stat_names = ['median', 'count', 'mean', 'std', 'min', 'max']

    # Coerce the column of the frame that was passed in. The original read the
    # notebook-global `combined_df` here, which ignored the `df` argument.
    df[stats_column] = pd.to_numeric(df[stats_column], errors='coerce')

    grouped_stats = df.groupby(group_by_columns)[stats_column].agg(stat_names).reset_index()

    # Series.agg returns one value per statistic; to_frame().T turns that into
    # a single row whose columns are the statistic names.
    overall_stats = df[stats_column].agg(stat_names).to_frame().T
    for group_by_column in group_by_columns:
        overall_stats[group_by_column] = 'Overall'

    return pd.concat([grouped_stats, overall_stats], ignore_index=True)
    

In [8]:
# Pause-length statistics per (Group, Speakers), in frames and in seconds.
p_frame = stats_plus(combined_df, ['Group', 'Speakers'], 'pause_frames')
p_secs = stats_plus(combined_df, ['Group', 'Speakers'], 'pause_seconds')
display(p_frame)

# Create the results folder on first run so to_csv does not fail when it is missing.
os.makedirs(output_file_directory, exist_ok=True)
p_frame.to_csv(os.path.join(output_file_directory, 'summary_pauses_frames.csv'), index=False)
p_secs.to_csv(os.path.join(output_file_directory, 'summary_pauses_seconds.csv'), index=False)

Unnamed: 0,Group,Speakers,median,count,mean,std,min,max
0,1,A2007_B2010_C2013,38.0,200.0,98.635,315.232931,1.0,3681.0
1,10,A2031_B2049_C2072,24.0,107.0,84.971963,352.918518,1.0,3015.0
2,100,A2627_B2621_C2625,29.0,168.0,64.982143,210.910217,1.0,2176.0
3,101,A2640_B2644_C2636,25.0,157.0,58.140127,237.584141,1.0,2302.0
4,102,A2639_B2645_C2662,24.0,149.0,61.563758,233.935927,1.0,2424.0
5,103,A2651_B2631_C2657,22.0,157.0,64.197452,269.435817,1.0,2559.0
6,104,A2656_B2650_C2653,28.5,148.0,63.283784,263.965367,1.0,2630.0
7,105,A2675_B2674_C2663,32.0,261.0,58.965517,196.885389,1.0,2480.0
8,106,A2673_B2664_C2654,26.0,202.0,55.277228,213.474881,1.0,2321.0
9,107,A2677_B2678_C2649,9.5,58.0,94.034483,394.183053,1.0,2171.0


In [11]:
# Statistics of the `sp_no_zero_*` columns per (Group, Speakers), in frames
# and in seconds.
sp_frame = stats_plus(combined_df, ['Group', 'Speakers'], 'sp_no_zero_frames')
sp_secs = stats_plus(combined_df, ['Group', 'Speakers'], 'sp_no_zero_seconds')

display(sp_frame)
display(sp_secs)

# Create the results folder on first run so to_csv does not fail when it is missing.
os.makedirs(output_file_directory, exist_ok=True)
sp_frame.to_csv(os.path.join(output_file_directory, 'summary_ps_frames.csv'), index=False)
sp_secs.to_csv(os.path.join(output_file_directory, 'summary_ps_seconds.csv'), index=False)

Unnamed: 0,Group,Speakers,median,count,mean,std,min,max
0,1,A2007_B2010_C2013,13.0,195.0,45.041026,92.888018,0.0,869.0
1,10,A2031_B2049_C2072,0.0,214.0,36.523364,252.079328,0.0,3015.0
2,100,A2627_B2621_C2625,0.0,276.0,30.822464,166.88483,0.0,2176.0
3,101,A2640_B2644_C2636,0.0,334.0,16.221557,127.07532,0.0,2302.0
4,102,A2639_B2645_C2662,0.0,312.0,25.528846,164.085224,0.0,2424.0
5,103,A2651_B2631_C2657,0.0,272.0,22.823529,157.585208,0.0,2559.0
6,104,A2656_B2650_C2653,0.5,214.0,34.046729,221.4872,0.0,2630.0
7,105,A2675_B2674_C2663,5.0,272.0,33.705882,193.153119,0.0,2480.0
8,106,A2673_B2664_C2654,0.0,355.0,20.971831,125.443031,0.0,2321.0
9,107,A2677_B2678_C2649,0.0,144.0,4.659722,18.7076,0.0,199.0


Unnamed: 0,Group,Speakers,median,count,mean,std,min,max
0,1,A2007_B2010_C2013,0.433,195.0,1.501385,3.096296,0.0,28.967
1,10,A2031_B2049_C2072,0.0,214.0,1.217421,8.402628,0.0,100.5
2,100,A2627_B2621_C2625,0.0,276.0,1.027384,5.56282,0.0,72.533
3,101,A2640_B2644_C2636,0.0,334.0,0.540731,4.23583,0.0,76.733
4,102,A2639_B2645_C2662,0.0,312.0,0.850949,5.469516,0.0,80.8
5,103,A2651_B2631_C2657,0.0,272.0,0.760768,5.252843,0.0,85.3
6,104,A2656_B2650_C2653,0.0165,214.0,1.134907,7.382909,0.0,87.667
7,105,A2675_B2674_C2663,0.167,272.0,1.123529,6.438466,0.0,82.667
8,106,A2673_B2664_C2654,0.0,355.0,0.69907,4.181454,0.0,77.367
9,107,A2677_B2678_C2649,0.0,144.0,0.155319,0.623555,0.0,6.633


In [10]:
# Statistics of the `sp_zero_*` columns per (Group, Speakers), in frames and
# in seconds.
sp_z_frame = stats_plus(combined_df, ['Group', 'Speakers'], 'sp_zero_frames')
sp_z_secs = stats_plus(combined_df, ['Group', 'Speakers'], 'sp_zero_seconds')

# Create the results folder on first run so to_csv does not fail when it is missing.
os.makedirs(output_file_directory, exist_ok=True)
sp_z_frame.to_csv(os.path.join(output_file_directory, 'summary_psz_frames.csv'), index=False)
sp_z_secs.to_csv(os.path.join(output_file_directory, 'summary_psz_seconds.csv'), index=False)