# In [1]:
import json
import pandas as pd
import os

# initialize script variables

# Accumulator for the extracted questions: add_questions_from_section appends
# one dict per question, and the list is turned into a DataFrame at the end.
all_questions_list = list()

# Speaker names whose turns are kept; a turn's speaker name must match one of
# these strings exactly.
current_justices = [
    "John G. Roberts, Jr.",
    "Clarence Thomas",
    "Samuel A. Alito, Jr.",
    "Sonia Sotomayor",
    "Elena Kagan",
    "Neil Gorsuch",
    "Brett M. Kavanaugh",
    "Amy Coney Barrett",
    "Ketanji Brown Jackson",
]

def add_questions_from_section(section_json, json_file_name, question_addressee, opening_statement):
    '''
    Add all questions from transcript section into question list (global var).
    Each "question" contains metadata of the transcript_id, to whom the question is addressed,
    the Supreme Court Justice speaking, the question text and the opening statement.

    A turn is recorded as a question when its speaker's name is listed in
    current_justices and its joined text is longer than 50 characters,
    contains a "?" and does not contain "inaudible" (case-insensitive).

    @param section_json -- JSON for one transcript section; its "turns" list is read
    @param json_file_name -- transcript file name; its extension is dropped to form transcript_id
    @param question_addressee -- label stored unchanged on every question of this section
    @param opening_statement -- value stored unchanged on every question of this section
    @return -- None; the results are appended to the global all_questions_list
    '''
    # Strip the extension instead of blindly cutting the last 5 characters, so
    # a name that does not end in ".json" is not mangled.
    transcript_id = os.path.splitext(json_file_name)[0]

    for turn in section_json["turns"]:
        speaker = turn["speaker"]

        # Skip turns without a speaker, or whose speaker has no roles.
        # NOTE(review): the original comment says this also filters out
        # inaudible audio and Elizabeth Prelogar's turns -- presumably those
        # carry no roles in the data; confirm against the transcripts.
        if speaker is None or not speaker["roles"]:
            continue

        # Only turns spoken by one of the current justices are of interest.
        if speaker["name"] not in current_justices:
            continue

        all_text_in_turn = " ".join(block["text"] for block in turn["text_blocks"])

        # check for questions in text turn; use question len > 50 chars as heuristic
        if len(all_text_in_turn) <= 50:
            continue
        if "?" not in all_text_in_turn or "inaudible" in all_text_in_turn.lower():
            continue

        # Key order is kept as in the original so the CSV columns stay the same.
        all_questions_list.append({
            "transcript_id": transcript_id,
            "question_addressee": question_addressee,
            "justice": speaker["name"],
            "question_text": all_text_in_turn,
            "opening_statement": opening_statement,
        })


def get_formatted_text_of_turn(turn):
    '''
    Return all text within a turn with xml-like tags denoting speaker and text.

    The texts of the turn's text blocks are joined with single spaces.

    @param turn -- JSON representing a single speaker turn
    @return -- String of the form "<speaker>NAME</speaker><text>TEXT</text>",
               or None when the turn has no speaker
    '''
    speaker = turn["speaker"]
    # None is a singleton: compare by identity, and make the "no speaker"
    # result explicit for callers.
    if speaker is None:
        return None

    speaker_in_turn = "<speaker>" + speaker["name"] + "</speaker>"
    joined_text = " ".join(block["text"] for block in turn["text_blocks"])
    return speaker_in_turn + "<text>" + joined_text + "</text>"

def _get_opening_statement(sections, section_index, turn_index):
    '''
    Return the formatted text of one turn, or None when the transcript has no
    such section or the section has no such turn.
    '''
    if section_index >= len(sections):
        return None
    turns = sections[section_index]["turns"]
    if turn_index >= len(turns):
        return None
    return get_formatted_text_of_turn(turns[turn_index])


def add_questions_from_transcript(json_file_name):
    '''
    Given a transcript, add all justice questions that pass the filters of
    add_questions_from_section to our global question list.

    Sections at even indices are treated as addressed to the petitioner and
    sections at odd indices as addressed to the respondent. Every question is
    stored together with the opening statement of the side it is addressed to.

    @param json_file_name -- name of a transcript JSON file inside TRANSCRIPTS_DIR
    @return -- None; questions are appended to the global question list
    '''
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # encoding.
    transcript_path = os.path.join(TRANSCRIPTS_DIR, json_file_name)
    with open(transcript_path, encoding="utf-8") as json_file:
        transcript_json = json.load(json_file)

    sections = transcript_json["transcript"]["sections"]

    # The petitioner's opening is taken from the 2nd turn of the first section
    # and the respondent's from the 1st turn of the second section.
    # NOTE(review): presumably the first section starts with a turn by the
    # Court before the advocate speaks -- confirm against the data.
    # A transcript too short to contain the turn yields None instead of
    # aborting the whole run with an IndexError.
    opening_statements = {
        "petitioner": _get_opening_statement(sections, 0, 1),
        "respondent": _get_opening_statement(sections, 1, 0),
    }

    for section_index, section in enumerate(sections):
        question_addressee = "petitioner" if section_index % 2 == 0 else "respondent"
        add_questions_from_section(
            section,
            json_file_name,
            question_addressee,
            opening_statements[question_addressee],
        )

# Main script --> goes through all transcripts, adds their questions to the
# global list and saves the result as a CSV file.
TRANSCRIPTS_DIR = "../transcripts_up_to_2024/"
FILE_TO_SAVE = "../datasets/questions_from_current_justices.csv"

# os.listdir returns the entries in arbitrary order; sorting makes the row
# order (and therefore the index column written to the CSV) reproducible.
for json_file_name in sorted(os.listdir(TRANSCRIPTS_DIR)):
    # ignore hidden files
    if json_file_name.startswith('.'):
        continue
    add_questions_from_transcript(json_file_name)

# One row per question; the DataFrame index is written as the first CSV column.
questions_df = pd.DataFrame(all_questions_list)
questions_df.to_csv(FILE_TO_SAVE)