## Convert OS-based questions to CB-based questions JSONL for the fine-tuned model

In [1]:
import re
import pandas as pd

In [2]:
# Relative path to the directory the source CSVs are read from.
IN_DIR = "../../datasets/original"
# Relative path to the directory the generated JSONL file is written to.
OUT_DIR = "../finetuning_datasets/eval_only"

## Generate JSONL for inference on the fine-tuned model

In [3]:
# Load the 2024 questions CSV and collect the distinct values of its 'justice' column.
input_fp = f'{IN_DIR}/2024_all_questions.csv'
questions_df = pd.read_csv(input_fp)
# NOTE: `justices` is reassigned to a hard-coded list in the following cell.
justices = list(questions_df['justice'].unique())

In [4]:
# Hard-coded justice names; this rebinds `justices`, replacing the list derived
# from the questions CSV in the previous cell.
# The names carry no "Justice " prefix — it is prepended when the prompts are built.
justices = [
    'Clarence Thomas',
    'John G. Roberts, Jr.',
    'Elena Kagan',
    'Ketanji Brown Jackson',
    'Sonia Sotomayor',
    'Samuel A. Alito, Jr.',
    'Amy Coney Barrett',
    'Neil Gorsuch',
    'Brett M. Kavanaugh'
 ]

In [5]:
# Load the 2024 full-text transcripts CSV into `df`.
# This rebinds `input_fp`, which previously held the questions CSV path.
input_fp = f'{IN_DIR}/2024_full_text_transcripts.csv'
df = pd.read_csv(input_fp)


def extract_speaker_and_text(input_string):
    '''
        Turns a tagged opening statement into a single "speaker: text" string.

        The first <speaker>...</speaker> and the first <text>...</text> element
        found in `input_string` are used; the content of either may span several
        lines. Tag contents are copied verbatim (no whitespace stripping).

        Args:
            input_string: markup containing a <speaker> and a <text> element.

        Returns:
            The string f"{speaker}: {text}".

        Raises:
            TypeError: if `input_string` is not a str (e.g. the float NaN that
                pandas produces for an empty CSV cell).
            ValueError: if the <speaker> or the <text> element is missing.
    '''
    # Fail early with a readable message instead of the opaque error the regex
    # engine would raise when handed a non-string value.
    if not isinstance(input_string, str):
        raise TypeError(
            "expected a string containing <speaker> and <text> tags, "
            f"got {type(input_string).__name__}: {input_string!r}"
        )

    # DOTALL lets '.' cross newlines so multi-line statements are captured whole.
    speaker_match = re.search(r"<speaker>(.*?)</speaker>", input_string, re.DOTALL)
    text_match = re.search(r"<text>(.*?)</text>", input_string, re.DOTALL)

    # A failed search returns None; report which tag is absent rather than
    # letting `.group(1)` blow up with an AttributeError on None.
    missing = [
        f"<{tag}>"
        for tag, match in (("speaker", speaker_match), ("text", text_match))
        if match is None
    ]
    if missing:
        raise ValueError(
            f"missing {' and '.join(missing)} element in input: {input_string[:80]!r}"
        )

    return f"{speaker_match.group(1)}: {text_match.group(1)}"

# Derive a "speaker: text" string per row from each side's tagged opening statement.
# NOTE(review): the two source columns use different suffixes (`_text` vs
# `_statement`) — presumably this mirrors the CSV header; confirm.
df['petitioner_turn'] = df['petitioner_opening_text'].apply(extract_speaker_and_text)
df['respondent_turn'] = df['respondent_opening_statement'].apply(extract_speaker_and_text)

In [6]:
def get_question_generation_prompt(justice, opening_statement):
    '''
        Builds the prompt record used to ask the model for a justice's next question.

        Args:
            justice: name inserted into the <justice> tag and the final request line.
            opening_statement: text placed inside the single <turn> of the <context> block.

        Returns:
            A dict with the keys "system_prompt", "instruction" and "justice".
    '''
    system_prompt = (
        "You are a Supreme Court Justice participating in oral arguments. "
        "Given a transcript excerpt and a Justice's name, generate the Justice's next question in response to the conversation history."
    )

    # One entry per output line; joined with newlines to form the instruction.
    instruction_lines = [
        "<context>",
        f"<turn>{opening_statement}</turn>",
        "</context>",
        f"<justice>{justice}</justice>",
        f"Generate a question that {justice} is likely to ask next.",
    ]

    return {
        "system_prompt": system_prompt,
        "instruction": "\n".join(instruction_lines),
        "justice": justice,
    }

# For every justice and every transcript row, emit two prompt records:
# one built from the petitioner's opening statement, one from the respondent's.
# Each tuple: (addressee label, column with the "speaker: text" turn,
#              column with the original tagged statement).
sides = (
    ('petitioner', 'petitioner_turn', 'petitioner_opening_text'),
    ('respondent', 'respondent_turn', 'respondent_opening_statement'),
)

data = []
for j in justices:
    justice = f'Justice {j}'
    for _, row in df.iterrows():
        # Petitioner record first, then respondent, for each transcript row.
        for addressee, turn_col, raw_col in sides:
            sample = get_question_generation_prompt(justice, row[turn_col])
            sample['question_addressee'] = addressee
            sample['opening_statement'] = row[raw_col]
            data.append(sample)

data_df = pd.DataFrame(data)

In [7]:
# Write one JSON object per line (JSONL), one line per prompt record.
# `index=False` is intentionally not passed: orient='records' never emits the
# index, and pandas releases before 2.1 raise ValueError when index=False is
# combined with any orient other than 'split' or 'table'.
out_fp = f'{OUT_DIR}/OS_to_CB_based_questions_test.jsonl'
data_df.to_json(out_fp, orient='records', lines=True)

In [8]:
# data_df.info()