In [None]:
import json
import os
import pandas as pd

# Directory holding the oral-argument transcript JSON files to parse.
# Keep the trailing slash if paths are built by plain string concatenation.
TRANSCRIPTS_DIR = "2024_cases_json/"
# Directory in which the CSV of aggregated oral arguments is stored.
CSV_DIR = "2024_cases_csv"

def get_formatted_text_of_turn(turn):
    """Render a single transcript turn as tagged text.

    Produces ``<speaker>NAME</speaker><text>BODY</text>`` where NAME is
    ``turn["speaker"]["name"]`` and BODY is the ``"text"`` value of every
    entry in ``turn["text_blocks"]``, joined with single spaces.
    """
    speaker_tag = "".join(("<speaker>", turn["speaker"]["name"], "</speaker>"))
    block_texts = (block["text"] for block in turn["text_blocks"])
    text_tag = "".join(("<text>", " ".join(block_texts), "</text>"))
    return speaker_tag + text_tag

def get_section_full_text(section_json):
    """Return the tagged text of every turn in a section, concatenated in order.

    Each element of ``section_json["turns"]`` is formatted with
    ``get_formatted_text_of_turn`` and the pieces are joined with no
    separator. A section with no turns yields an empty string.
    """
    return "".join(
        get_formatted_text_of_turn(single_turn)
        for single_turn in section_json["turns"]
    )

def get_transcript_dict(json_file_name):
    """Parse one oral-argument transcript JSON file into a flat dict.

    Args:
        json_file_name: File name (not a full path) of a transcript located
            inside ``TRANSCRIPTS_DIR``.

    Returns:
        A dict with the keys ``transcript_id`` (the file name without its
        extension), ``petitioner_opening_text``, ``petitioner_full_text``,
        ``respondent_opening_statement`` and ``respondent_full_text``, in
        that order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError, IndexError: If the JSON lacks the expected
            ``transcript -> sections -> turns`` structure.
    """
    # Strip the actual extension instead of blindly dropping five characters.
    transcript_id, _extension = os.path.splitext(json_file_name)

    # os.path.join works whether or not TRANSCRIPTS_DIR ends with a separator.
    transcript_file_path = os.path.join(TRANSCRIPTS_DIR, json_file_name)
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(transcript_file_path, encoding="utf-8") as json_file:
        transcript_json = json.load(json_file)

    sections = transcript_json["transcript"]["sections"]
    # Section 0 is treated as the petitioner's argument and section 1 as the
    # respondent's.
    petitioner_section = sections[0]
    respondent_section = sections[1]

    # NOTE(review): the petitioner opening is read from turn index 1 while the
    # respondent opening is read from turn index 0 -- presumably the first
    # section starts with an introductory turn from the bench; confirm against
    # the data.
    petitioner_opening_turn = petitioner_section["turns"][1]
    respondent_opening_turn = respondent_section["turns"][0]

    # Key names and their order are preserved because they become the CSV
    # columns downstream.
    return {
        "transcript_id": transcript_id,
        "petitioner_opening_text": get_formatted_text_of_turn(petitioner_opening_turn),
        "petitioner_full_text": get_section_full_text(petitioner_section),
        "respondent_opening_statement": get_formatted_text_of_turn(respondent_opening_turn),
        "respondent_full_text": get_section_full_text(respondent_section),
    }
    
# Parse every transcript in TRANSCRIPTS_DIR and store them together as one CSV.
transcripts = []
# Not used below; kept in case other cells reference it.
cases_dir = os.fsencode(TRANSCRIPTS_DIR)
# os.listdir returns entries in arbitrary order; sort so the CSV row order is
# reproducible from run to run.
for json_file_name in sorted(os.listdir(TRANSCRIPTS_DIR)):
    # ignore hidden files
    if json_file_name.startswith('.'):
        continue
    # ignore anything that is not a JSON transcript (sub-directories, stray
    # files), which would otherwise crash the JSON parser
    if not json_file_name.lower().endswith(".json"):
        continue
    transcripts.append(get_transcript_dict(json_file_name))

transcripts_df = pd.DataFrame(transcripts)
# Make sure the output directory exists before writing into it.
os.makedirs(CSV_DIR, exist_ok=True)
# The output file name (without a ".csv" suffix) is kept unchanged so existing
# consumers of the file still find it.
transcripts_df.to_csv(os.path.join(CSV_DIR, "transcript_parsed"))