In [1]:
import pandas as pd
import os
from tqdm import tqdm
from multiprocessing import Pool
import time
import seaborn as sns
import matplotlib.pyplot as plt
import traceback

# Show complete frames when displaying them: no cap on rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Turn off pandas' chained-assignment check for the whole session.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Every folder used by this notebook sits under one research root.
research_root_directory = "D:/Program Files/Documents/CMU/Alcohol_Research"

# Inputs: per-file base columns and the matching binned TSS files.
input_file_directory = f"{research_root_directory}/base_columns_data"
tss_file_directory = f"{research_root_directory}/binned_tss_data"

# Outputs holding all variables, binned per minute and with the custom bin.
minute_binned_data_directory = f"{research_root_directory}/Binned_By_Minute_all_variables"
custom_binned_data_directory = f"{research_root_directory}/Custom Bin_all_variables"

# Output holding only the group-level variables, binned per minute.
group_minute_binned_data_directory = f"{research_root_directory}/Binned By Minute_Only Group Variables"
# group_custom_binned_data_directory = "D:/Program Files/Documents/CMU/Alcohol_Research/Custom Bin_Only Group Variables"



In [3]:
def bin_on_columns(df, bin_sizes: list, columns: list = None):
    """Sum consecutive row bins of ``df`` and return one row of totals per bin.

    Rows are consumed in order: the first ``bin_sizes[0]`` rows form the first
    bin, the next ``bin_sizes[1]`` rows the second bin, and so on. A bin that
    reaches past the end of ``df`` sums whatever rows are left (possibly none).

    Args:
        df: DataFrame whose rows are binned by position.
        bin_sizes: Number of rows in each successive bin.
        columns: Optional subset of columns to sum; ``None`` sums every column.

    Returns:
        DataFrame with one row per entry of ``bin_sizes`` on a fresh 0..n-1
        index. A ``'Frame'`` column is never part of the result, and an empty
        ``bin_sizes`` yields an empty frame.
    """
    source = df if columns is None else df[columns]

    bin_totals = []
    start_index = 0
    for bin_size in bin_sizes:
        # Offsets accumulate so that bins of different sizes stay contiguous.
        end_index = start_index + bin_size
        bin_totals.append(source.iloc[start_index:end_index].sum())
        start_index = end_index

    if not bin_totals:
        # pd.concat raises on an empty list, so build the empty result directly.
        return pd.DataFrame(columns=[col for col in source.columns if col != 'Frame'])

    # Each bin total is a Series; stack them as rows.
    result_df = pd.concat(bin_totals, axis=1).T
    result_df = result_df.drop(columns=['Frame'], errors='ignore')
    return result_df.reset_index(drop=True)

In [4]:
# Columns totalled separately for each participant (Sub_ID) and minute.
single_bin_columns = [
    'AU6', 'AU9', 'AU12', 'AU14', 'AU15', 'AU20',
    'Speak', 'Sip', 'Laugh',
    'DuchSmile_bin', 'SmileCon1_bin', 'SmileCon2_bin', 'SmileCon3_bin',
    'SmileCon_bin_any', 'NegAff_bin_any',
    'Silence',
]

# Columns reduced once per frame and then totalled per minute.
dyad_bin_columns = [
    'DyadSpeaking',
    'DyadLaughing',
]

triad_bin_columns = [
    'GoldenMoment',
    'TriadSpeaking',
    'TriadLaughing',
]

In [11]:
def _sum_leading_minutes(df, gc_col, last_minute):
    """Return one summed row per ``gc_col`` group for rows with ``minute <= last_minute``."""
    leading_rows = df[df['minute'] <= last_minute].reset_index(drop=True)
    summed = leading_rows.groupby(gc_col).sum().reset_index()
    # The summed 'minute' values carry no meaning; replace them with a range
    # label ('1-3' for the default cutoff).
    summed['minute'] = f'1-{last_minute}'
    return summed


def get_minute_and_custom_binned_data(df, gc_col = 'Sub_ID', last_minute = 3):
    """Prepend the summed leading-minutes rows to the unchanged minute-level rows.

    Args:
        df: Minute-level frame with a numeric 'minute' column plus ``gc_col``.
        gc_col: Column name (or list of names) identifying each group to sum.
        last_minute: Integer cutoff; minutes up to and including it are summed.

    Returns:
        DataFrame with the summed rows (labelled ``'1-<last_minute>'`` in
        'minute') first, followed by every row of ``df``, on a fresh index.
    """
    leading_sum = _sum_leading_minutes(df, gc_col, last_minute)
    return pd.concat([leading_sum, df]).reset_index(drop=True)


def get_custom_binned_data(df, gc_col = 'Sub_ID', last_minute = 3):
    """Replace the leading minutes with their per-group sum and keep later minutes.

    Args:
        df: Minute-level frame with a numeric 'minute' column plus ``gc_col``.
        gc_col: Column name (or list of names) identifying each group to sum.
        last_minute: Integer cutoff; minutes up to and including it are summed.

    Returns:
        DataFrame with the summed rows (labelled ``'1-<last_minute>'`` in
        'minute') first, followed by the rows of ``df`` whose minute is greater
        than ``last_minute``, on a fresh index.
    """
    leading_sum = _sum_leading_minutes(df, gc_col, last_minute)
    later_rows = df[df['minute'] > last_minute].reset_index(drop=True)
    return pd.concat([leading_sum, later_rows]).reset_index(drop=True)
    

In [6]:
# Divisor applied to every per-minute total before rounding.
# NOTE(review): 30 is presumably the video frame rate (frame counts -> seconds) — confirm.
frame_divisor = 30

for filename in tqdm(os.listdir(input_file_directory)):
    # Match on the extension itself; a substring test would also accept names like 'x.csv.bak'.
    if not filename.endswith('.csv'):
        continue

    try:
        # Both reads sit inside the try so an unreadable input file is skipped
        # the same way as a missing TSS file instead of aborting the whole run.
        df = pd.read_csv(f'{input_file_directory}/{filename}')
        tss_df = pd.read_csv(f'{tss_file_directory}/{filename}')
        tss_df = tss_df.rename(columns={'Minute':'minute'})

        # Individual variables: total per participant and minute, then scale.
        single_df = df[['Sub_ID', 'minute'] + single_bin_columns]
        single_df = single_df.groupby(['Sub_ID', 'minute']).sum().reset_index()
        single_df = single_df.sort_values(by=['minute', 'Sub_ID']).reset_index(drop=True)
        single_df[single_bin_columns] = (single_df[single_bin_columns] / frame_divisor).round(3)

        # Group variables: reduce to one row per frame (max), total per minute,
        # scale, then attach the TSS columns for the same minute.
        dyad_triad_columns = dyad_bin_columns + triad_bin_columns
        bi_tri_df = df[['Frame', 'minute'] + dyad_triad_columns]
        bi_tri_df = bi_tri_df.groupby(['Frame', 'minute']).max().reset_index().groupby('minute').sum().reset_index()
        bi_tri_df = bi_tri_df.drop(columns=['Frame'])
        bi_tri_df[dyad_triad_columns] = (bi_tri_df[dyad_triad_columns] / frame_divisor).round(3)
        bi_tri_df = bi_tri_df.merge(tss_df, on=['minute']).reset_index(drop=True)

        # Every participant row receives the group-level values of its minute.
        all_df = single_df.merge(bi_tri_df, on=['minute']).reset_index(drop=True)
        cp_all = get_custom_binned_data(all_df)

        # Write only after every table was built, so a failure above does not
        # leave a group-level file without its companions.
        bi_tri_df.to_csv(f'{group_minute_binned_data_directory}/{filename}', index=False)
        all_df.to_csv(f'{minute_binned_data_directory}/{filename}', index=False)
        cp_all.to_csv(f'{custom_binned_data_directory}/{filename}', index=False)
    except Exception as e:
        # Best effort: report the problem and carry on with the next file.
        print(e)
        print(f'Skipping file: {filename}')
        traceback.print_exc()
        
    

100%|██████████| 129/129 [00:00<00:00, 406.13it/s]


In [6]:
# Collect every minute-binned file, tagging its rows with the group taken
# from the part of the file name before the first underscore.
df_list = []
for filename in tqdm(os.listdir(minute_binned_data_directory)):
    # Match on the extension itself; a substring test would also accept names like 'x.csv.bak'.
    if not filename.endswith('.csv'):
        continue
    group = filename.split('_')[0]
    # The name `df` stays bound to the last frame read once the loop ends;
    # keep it, since other cells may read it.
    df = pd.read_csv(f'{minute_binned_data_directory}/{filename}')
    df['Group'] = group
    df_list.append(df)

100%|██████████| 129/129 [00:01<00:00, 78.07it/s]


In [9]:
# Stack the per-file frames into one table.
mega_df = pd.concat(df_list).reset_index(drop=True)
# Move 'Group' to the front. The column names are taken from mega_df itself,
# not from whichever frame a loop variable happened to hold last, so columns
# present in only some files are kept.
cols = ['Group'] + [col for col in mega_df.columns if col != 'Group']
mega_df = mega_df[cols]
# 'Group' was parsed from file names as text; store it as an integer.
mega_df['Group'] = mega_df['Group'].astype(int)
mega_df = mega_df.sort_values(by=['Group', 'minute', 'Sub_ID'])
mega_df.to_csv('D:/Program Files/Documents/CMU/Alcohol_Research/combined.csv',index=False)

### All Groups Different Variable Sets, Minute Binned

In [10]:
# 1) Every variable for every group.
mega_df.to_csv(
    'D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_all_variables_minute_binned.csv',
    index=False,
)

# 2) Participant-level variables only.
minute_individual_columns = ['Group', 'Sub_ID', 'minute'] + single_bin_columns
minute_individual_df = mega_df[minute_individual_columns].reset_index(drop=True)
minute_individual_df.to_csv(
    'D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_individual_variables_minute_binned.csv',
    index=False,
)

# 3) Group-level variables only, keeping distinct rows.
minute_group_columns = ['Group', 'minute'] + dyad_bin_columns + triad_bin_columns
minute_group_df = mega_df[minute_group_columns].drop_duplicates().reset_index(drop=True)
minute_group_df.to_csv(
    'D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_group_variables_minute_binned.csv',
    index=False,
)

In [13]:
def custom_sort_category(minute_str):
    """Return sort rank 1 for the '1-3' bin label and 2 for any other value."""
    return 1 if minute_str == '1-3' else 2


### All Groups Different Variable Sets, Minute Binned + Custom Binned

In [21]:
# Sum the leading minutes per (Group, Sub_ID). Grouping on both keys keeps
# participants of different groups apart even if a Sub_ID value repeats
# between groups, and 'Group' stays attached without a separate merge.
temp_df = get_minute_and_custom_binned_data(mega_df, gc_col=['Group', 'Sub_ID'])
# Column layout: Sub_ID, Group, then every remaining column.
temp_leading_columns = ['Sub_ID', 'Group']
temp_df = temp_df[temp_leading_columns + [col for col in temp_df.columns if col not in temp_leading_columns]]

# Within each group, list the '1-3' rows ahead of the single-minute rows.
temp_df['sort_category'] = temp_df['minute'].apply(custom_sort_category)
temp_df = temp_df.sort_values(by=['Group','sort_category','minute','Sub_ID']).reset_index(drop=True)
temp_df = temp_df.drop(columns=['sort_category'])
temp_df = temp_df.round(3)

# Every variable.
temp_df.to_csv('D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_all_variables_minute_and_custom_binned.csv',index=False)
# Participant-level variables only.
temp_df[['Group','Sub_ID', 'minute'] + single_bin_columns].reset_index(drop=True).to_csv('D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_individual_variables_minute_and_custom_binned.csv',index=False)
# Group-level variables only, keeping distinct rows.
temp_df[['Group', 'minute'] + dyad_bin_columns + triad_bin_columns].drop_duplicates().reset_index(drop=True).to_csv('D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_group_variables_minute_and_custom_binned.csv',index=False)

### All Groups Different Variable Sets, Custom Binned

In [10]:
# Replace the leading minutes with their sum per (Group, Sub_ID). Grouping on
# both keys keeps participants of different groups apart even if a Sub_ID
# value repeats between groups, and 'Group' stays attached without a merge.
test_df = get_custom_binned_data(mega_df, gc_col=['Group', 'Sub_ID'])
# Column layout: Sub_ID, Group, then every remaining column.
test_leading_columns = ['Sub_ID', 'Group']
test_df = test_df[test_leading_columns + [col for col in test_df.columns if col not in test_leading_columns]]

# Within each group, list the '1-3' rows ahead of the later single-minute rows.
test_df['sort_category'] = test_df['minute'].apply(custom_sort_category)
test_df = test_df.sort_values(by=['Group','sort_category','minute','Sub_ID']).reset_index(drop=True)
test_df = test_df.drop(columns=['sort_category'])
test_df = test_df.round(3)
test_df.to_csv('D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_all_variables_custom_binned.csv',index=False)

In [11]:
# Participant-level slice of the custom-binned table, written to its own CSV.
custom_individual_columns = ['Group', 'Sub_ID', 'minute'] + single_bin_columns
custom_binned_individual_variables_df = test_df[custom_individual_columns].reset_index(drop=True)
custom_binned_individual_variables_df.to_csv(
    'D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_individual_variables_custom_binned.csv',
    index=False,
)

In [12]:
# Group-level slice of the custom-binned table, reduced to distinct rows and
# written to its own CSV.
custom_group_columns = ['Group', 'minute'] + dyad_bin_columns + triad_bin_columns
custom_binned_group_variables_df = (
    test_df[custom_group_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
custom_binned_group_variables_df.to_csv(
    'D:/Program Files/Documents/CMU/Alcohol_Research/all_groups_group_variables_custom_binned.csv',
    index=False,
)