Case Cleaning

* First, we copied all case files from [Oyez's Github Repo](https://github.com/walkerdb/supreme_court_transcripts)
* Then, we delete all non-transcript files (files that don't end with "t01" or "t02")
* Then, we remove all files from 2024
* Next, we remove all files whose "transcript" field is missing or null
* Finally, we remove all files with only one section in the transcript field

Text Cleaning
* We consolidate all the text within a SCOTUS justice's turn and keep only turns whose text contains at least one "?".
* We omit turns of 50 characters or fewer because they will not help with contextual embeddings. (For example, a common question justices ask is "why?", but without the surrounding context it is just noise in our vector embedding. This is a heuristic approach that we might revisit.)
* Roughly 8,000 lines contained the string "Inaudible", so we remove these lines as well.


In [None]:
import json
import pandas as pd
import os

# initialize script variables
# Global accumulator of question records: add_questions_from_section appends one
# dict per kept question, and the main script below converts the list into a
# pandas DataFrame before writing it to CSV.
all_questions_list = []

def _is_scotus_justice(speaker):
    '''
    Return True when the speaker's role entry has type "scotus_justice".

    "roles" is usually a list, in which case its first entry is inspected. For some
    speakers it is a dict instead; the entry stored under the key "2" is then
    inspected (the shape seen for Justice Amy Coney Barrett). A missing speaker,
    missing/empty roles, or a role entry without a "type" all yield False instead
    of raising, so one oddly-shaped turn cannot abort the whole extraction.
    '''
    if not speaker:
        return False

    roles = speaker.get("roles")
    if not roles:
        return False

    if isinstance(roles, dict):
        # JSON object keys are always strings, so indexing a dict with 0 can never
        # succeed; only the "2" entry is meaningful here.
        role = roles.get("2")
    else:
        role = roles[0]

    return isinstance(role, dict) and role.get("type") == "scotus_justice"


def add_questions_from_section(section_json, json_file_name, question_addressee):
    '''
    Add all questions from transcript section into question list (global var).

    A turn is kept when it is spoken by a Supreme Court Justice and its combined
    text is longer than 50 characters, contains at least one "?", and does not
    contain "inaudible" (case-insensitive).

    Each "question" is a dict with the keys:
        transcript_id      -- json_file_name without its last 5 characters (".json")
        question_addressee -- the value passed in by the caller
        justice            -- the speaker's name
        question_text      -- all text blocks of the turn joined by single spaces

    Parameters:
        section_json       -- one transcript section; must contain a "turns" list
        json_file_name     -- file name of the transcript the section came from
        question_addressee -- to whom the questions in this section are addressed

    Returns None; results are appended to the global all_questions_list.
    '''
    for turn in section_json["turns"]:
        speaker = turn.get("speaker")
        if not _is_scotus_justice(speaker):
            continue

        all_text_in_turn = " ".join(block["text"] for block in turn["text_blocks"])

        # Heuristic: very short turns (e.g. "Why?") carry too little context.
        if len(all_text_in_turn) <= 50:
            continue
        if "?" not in all_text_in_turn:
            continue
        if "inaudible" in all_text_in_turn.lower():
            continue

        all_questions_list.append({
            "transcript_id": json_file_name[:-5],
            "question_addressee": question_addressee,
            "justice": speaker["name"],
            "question_text": all_text_in_turn,
        })

def add_questions_from_transcript(json_file_name):
    '''
    Given a transcript, add all justice questions satisfying the criteria in the 
    data cleaning notes above to our global question list.

    Parameters:
        json_file_name -- name of a transcript JSON file inside TRANSCRIPTS_DIR

    The file must contain a "transcript" object with a "sections" list; a missing
    or null "transcript" raises (such files are expected to have been removed
    during case cleaning). Returns None.
    '''
    transcript_path = os.path.join(TRANSCRIPTS_DIR, json_file_name)

    # JSON text is UTF-8 by specification; name the encoding explicitly so the
    # read does not depend on the platform's default locale encoding.
    with open(transcript_path, encoding="utf-8") as json_file:
        transcript_json = json.load(json_file)

    sections = transcript_json["transcript"]["sections"]
    for index, section in enumerate(sections):
        # Heuristic: sections are assumed to alternate, starting with the
        # petitioner's side (even index) followed by the respondent's (odd index).
        question_addressee = "petitioner" if index % 2 == 0 else "respondent"
        add_questions_from_section(section, json_file_name, question_addressee)

# Main script --> goes through all transcripts, adds their questions to the
# global list, and saves the result as a CSV file.
TRANSCRIPTS_DIR = "transcripts_up_to_2024/"
FILE_TO_SAVE = "all_questions.csv"

# os.listdir returns entries in arbitrary order; sort them so the row order (and
# therefore the CSV index) is the same on every run and every machine.
for json_file_name in sorted(os.listdir(TRANSCRIPTS_DIR)):
    # ignore hidden files
    if json_file_name.startswith('.'):
        continue
    add_questions_from_transcript(json_file_name)

questions_df = pd.DataFrame(all_questions_list)
questions_df.to_csv(FILE_TO_SAVE)