# Step 3: Merge all prompts from all runs into one long-format dataset

Depends on: 

- 8 CSV files generated in  `processed_data/umg_parsed_queries_v2*.csv`

Outputs: 
- 1 large (395200 row) CSV file containing all results and metadata: `processed_data/umg_all_parsed_queries.csv`

In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd
import glob
import datetime
pd.set_option('display.max_columns', None)

In [2]:
start = datetime.datetime.now()

In [3]:
models = [ "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0125", "gpt-4-turbo-2024-04-09", "gpt-4o-2024-05-13"]

In [4]:
glob.glob("processed_data/umg_parsed_queries_v2*.csv")

['processed_data/umg_parsed_queries_v2_employee_gpt-3.5-turbo-0613.csv',
 'processed_data/umg_parsed_queries_v2_employer_gpt-4o-2024-05-13.csv',
 'processed_data/umg_parsed_queries_v2_employer_gpt-3.5-turbo-0125.csv',
 'processed_data/umg_parsed_queries_v2_employer_gpt-3.5-turbo-0613.csv',
 'processed_data/umg_parsed_queries_v2_employee_gpt-4o-2024-05-13.csv',
 'processed_data/umg_parsed_queries_v2_employee_gpt-4-turbo-2024-04-09.csv',
 'processed_data/umg_parsed_queries_v2_employee_gpt-3.5-turbo-0125.csv',
 'processed_data/umg_parsed_queries_v2_employer_gpt-4-turbo-2024-04-09.csv']

In [5]:
df = pd.DataFrame()
for filen in glob.glob("processed_data/umg_parsed_queries_v2*.csv"):
    df_temp = pd.read_csv(filen)
    if len(df) == 0:
        df = df_temp
    else:
        df = pd.concat([df,df_temp])

In [6]:
majors_dict = {
    'Education': 'Social Sciences',
    'Psychology': 'Social Sciences',
    'Nursing': 'STEM',
    'Literature': 'Arts and Humanities',
    'History': 'Arts and Humanities',
    'Gender Studies': 'Arts and Humanities',
    'Electrical Engineering': 'STEM',
    'Data Science': 'STEM',
    'Biology': 'STEM',
    'Philosophy': 'Arts and Humanities',
    'Computer Science': 'STEM',
    'Economics': 'Social Sciences',
    'Communication': 'Social Sciences',
    'Neuroscience': 'STEM',
    'Business': 'Social Sciences',
    'Black Studies': 'Arts and Humanities',
    'Biomedical Sciences': 'STEM',
    'Visual Arts': 'Arts and Humanities',
    'Xyzzy': 'Fake-Control',
    'None-Control': 'None-Control'
}

# Add the 'major_type' column to the DataFrame
df['major_type'] = df['Major'].map(majors_dict)
df['major_type'].value_counts()

major_type
STEM                   124800
Arts and Humanities    124800
Social Sciences        104000
Fake-Control            20800
None-Control            20800
Name: count, dtype: int64

In [7]:
df.columns

Index(['custom_id', 'model', 'content', 'run_id', 'University', 'Major',
       'Pronoun', 'Ranking', 'USNWR_Category', 'Funding', 'Region', 'HBCU',
       'query_response_parsed', 'run_type', 'major_type'],
      dtype='object')

In [8]:
df.sort_values(by=["model", "run_type","Pronoun","University","Major"], inplace=True)
df.reset_index(inplace=True)

In [9]:
df.to_csv("processed_data/umg_all_parsed_queries.csv", index=None)

In [10]:
df.describe()

Unnamed: 0,index,run_id,Ranking,HBCU,query_response_parsed
count,395200.0,395200.0,292448.0,324064.0,393572.0
mean,24699.5,24699.5,90.594595,0.146341,108788.645074
std,14260.569688,14260.569688,94.611303,0.353449,17138.816246
min,0.0,0.0,1.0,0.0,65000.0
25%,12349.75,12349.75,18.0,0.0,95000.0
50%,24699.5,24699.5,53.0,0.0,110000.0
75%,37049.25,37049.25,133.0,0.0,120000.0
max,49399.0,49399.0,376.0,1.0,195000.0


In [11]:
end = datetime.datetime.now()
print("Elapsed time:", end-start)

Elapsed time: 0:00:02.816094
