In [1]:
import pandas as pd

# Load the two datasets: human reviews first, LLM reviews second.
# Both files are JSON Lines (one JSON record per line), hence lines=True.
df_human, df_llm = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ('./data/ReviewCritique.jsonl', './data/ReviewCritique_LLM.jsonl')
)

In [2]:
# df_llm has a 'title' column and three LLM review columns:
# 'claude_opus', 'gpt4' and 'gemini_pro_1.5' (alongside 'decision' and 'body_text').
df_llm.columns

Index(['decision', 'title', 'body_text', 'claude_opus', 'gpt4',
       'gemini_pro_1.5'],
      dtype='object')

In [3]:
# df_human has five review columns ('review#1' .. 'review#5'), but only the first
# three ('review#1', 'review#2', 'review#3') are used in the cells below.
df_human.columns

Index(['decision', 'title', 'body_text', 'review#1', 'review#2', 'review#3',
       'review#4', 'review#5'],
      dtype='object')

In [4]:
# Find the papers that appear in both datasets, matching on lower-cased title.
# We expected 20 papers in common; only 15 match at the moment, and we
# proceed with those.
df_human['title'] = df_human['title'].map(str.lower)
df_llm['title'] = df_llm['title'].map(str.lower)

# Boolean mask over df_llm rows: True where the title also exists in df_human.
common_titles = df_llm['title'].isin(df_human['title'])
int(common_titles.sum())

15

In [5]:
# Keep only the papers present in both datasets, on each side.
df_llm_filtered = df_llm.loc[common_titles]
df_human_filtered = df_human.loc[df_human['title'].isin(df_llm['title'])]


In [6]:
# Create the directory structure
# We keep one folder for all the review data
# The root folder is called 'extracted'
# Each paper has its own subfolder in this root, with the title of the paper as the folder name
# Each paper folder has three subfolders inside, one for actual reviews, one for pairwise and one for summarised
# Actual reviews will have 3 human and 3 LLM original reviews.
# Summarised will have same 6 reviews summarised in bullet points by another LLM
# Pairwise will have 6C2 i.e. 15 pairwise comparisons for all the reviews

import os
FOLDER_PATH = './extracted/'

for title in df_llm_filtered.title:
    folder_name = title.replace(' ', '_')
    for subfolder in ('actual', 'summarised', 'pairwise'):
        # makedirs also creates the missing parents (root and paper folder).
        # exist_ok=True makes the cell safe to re-run and guarantees every
        # subfolder exists even when the paper folder was created earlier;
        # genuine failures (permissions, invalid names) are no longer hidden.
        os.makedirs(os.path.join(FOLDER_PATH, folder_name, subfolder), exist_ok=True)
                

In [7]:
# Write the three human reviews of every paper to
# <FOLDER_PATH>/<paper folder>/actual/review<N>.txt
for idx, paper in df_human_filtered.iterrows():
    # Same folder naming as used when the directory structure was created
    folder_name = paper['title'].replace(' ', '_')

    for column, file_name in (
        ('review#1', 'review1.txt'),
        ('review#2', 'review2.txt'),
        ('review#3', 'review3.txt'),
    ):
        # A review is a list of segments; emit one segment per line
        review_text = '\n'.join(segment['segment_text'] for segment in paper[column]['review'])

        # Explicit UTF-8 so the files do not depend on the platform's default encoding
        review_path = os.path.join(FOLDER_PATH, folder_name, 'actual', file_name)
        with open(review_path, 'w', encoding='utf-8') as review_file:
            review_file.write(review_text)


In [8]:
# Write the three LLM reviews of every paper to
# <FOLDER_PATH>/<paper folder>/actual/<model>.txt
for idx, paper in df_llm_filtered.iterrows():
    # Same folder naming as used when the directory structure was created
    folder_name = paper['title'].replace(' ', '_')

    for column, file_name in (
        ('claude_opus', 'claude_opus.txt'),
        ('gpt4', 'gpt4.txt'),
        ('gemini_pro_1.5', 'gemini_pro.txt'),
    ):
        # A review is a list of segments; emit one segment per line
        review_text = '\n'.join(segment['segment_text'] for segment in paper[column]['review'])

        # Explicit UTF-8 so the files do not depend on the platform's default encoding
        review_path = os.path.join(FOLDER_PATH, folder_name, 'actual', file_name)
        with open(review_path, 'w', encoding='utf-8') as review_file:
            review_file.write(review_text)


# Get LLM Summaries for all Reviews