In [None]:
import pandas as pd

In [None]:
# Folder the per-dataset result CSVs are read from.
base = './rtgender_results'
# Model tag that appears inside every result file name.
model = 'roberta-base'

# Read the four result files in a fixed order and bind each frame to its
# dataset-specific name.
congress, wiki, fitoc, reddit = map(
    pd.read_csv,
    (
        f'{base}/facebook_congress_posts_clean_{model}_results.csv',
        f'{base}/facebook_wiki_posts_clean_{model}_results.csv',
        f'{base}/fitocracy_posts_clean_{model}_results.csv',
        f'{base}/reddit_posts_clean_{model}_results.csv',
    ),
)

In [None]:
def combine_data(df1, df2, df3, df4):
    """Stack four result frames row-wise into a single DataFrame.

    Args:
        df1, df2, df3, df4: DataFrames to append, in this order.

    Returns:
        A new DataFrame containing the rows of ``df1`` through ``df4`` in
        order. The inputs are not modified. Row index labels are carried
        over unchanged from the inputs, so labels may repeat in the result.
    """
    # A single concat builds the result in one pass; chaining pairwise
    # concats would allocate two throwaway intermediate frames and copy the
    # earliest rows three times.
    return pd.concat([df1, df2, df3, df4])

In [None]:
def clean_cols(df):
    """Return a copy of ``df`` with display-ready ``data``/``model``/``seed`` values.

    The following transformations are applied:

    * ``data``: every literal ``'data/'`` and ``'_posts_clean.csv'`` substring
      is removed, and the column is renamed to ``data_name``.
    * ``model``: known model identifiers are replaced by their display names;
      identifiers that are not in the lookup table are kept as they are
      instead of being turned into NaN.
    * ``seed``: each value ``x`` becomes the string ``'run{x+1}'``.

    Args:
        df: DataFrame with at least ``data``, ``model`` and ``seed`` columns.

    Returns:
        A new DataFrame with the same column order (``data`` renamed to
        ``data_name``). ``df`` itself is left untouched, so the function can
        safely be called again on the same input.
    """
    model_map = {
        'google/electra-base-discriminator': 'ELECTRA',
        'microsoft/deberta-base': 'DeBERTa-base',
        'roberta-base': 'RoBERTa-base',
    }

    data_name = (
        df['data']
        .str.replace('data/', '', regex=False)
        .str.replace('_posts_clean.csv', '', regex=False)
    )

    # Element-wise lookup with a fallback: no index alignment is involved, so
    # this also works when the frame's index contains repeated labels.
    model_label = df['model'].map(lambda name: model_map.get(name, name))

    run_label = df['seed'].map(lambda x: f'run{x+1}')

    # assign() builds a new frame rather than writing into the caller's one.
    cleaned = df.assign(data=data_name, model=model_label, seed=run_label)

    return cleaned.rename(columns={'data': 'data_name'})

In [None]:
def pivot_data(df):
    """Reshape per-seed scores into one row per score type, one column per seed.

    Args:
        df: DataFrame with ``data_name``, ``model``, ``layer``, ``seed``,
            ``test_f1`` and ``test_mdl`` columns.

    Returns:
        A DataFrame whose leading columns are ``data_name``, ``model``,
        ``layer`` and ``score_type`` (``'F1'`` or ``'MDL'``), followed by one
        column per distinct ``seed`` value holding the matching score.
    """
    id_cols = ['data_name', 'model', 'layer', 'seed']
    score_labels = {'test_f1': 'F1', 'test_mdl': 'MDL'}

    # Long format: one row per (id columns, score type) pair.
    long_scores = df.melt(
        id_vars=id_cols,
        value_vars=list(score_labels),
        var_name='score_type',
        value_name='score',
    )

    # Swap the raw column names for their display labels.
    long_scores = long_scores.assign(
        score_type=long_scores['score_type'].map(score_labels)
    )

    # Wide format: spread the seeds out into separate score columns.
    wide_scores = long_scores.pivot(
        index=['data_name', 'model', 'layer', 'score_type'],
        columns='seed',
        values='score',
    )

    return wide_scores.reset_index()

In [None]:
def postprocess(df1, df2, df3, df4):
    """Run the full pipeline on four result frames: combine, clean, pivot.

    Args:
        df1, df2, df3, df4: DataFrames handed to ``combine_data`` in this order.

    Returns:
        The output of ``pivot_data`` applied to the cleaned, combined frame.
    """
    stacked = combine_data(df1, df2, df3, df4)
    return pivot_data(clean_cols(stacked))

In [None]:
# Build the final pivoted table from the four loaded result frames.
df_all = postprocess(df1=congress, df2=wiki, df3=fitoc, df4=reddit)

In [None]:
# Destination for the pivoted table. An empty string here makes to_csv try
# to open a file with an empty name and fail, so default to a concrete file
# next to the input results; edit this line to write somewhere else.
# NOTE(review): the default file name is a suggestion — confirm it matches
# what downstream consumers expect.
output_path = f'{base}/rtgender_{model}_results_pivot.csv'

# The frame's row index is written as the first, unnamed CSV column.
df_all.to_csv(output_path)