In [2]:
# Functions to call the oyez API, parse the output and store transcript and s3 links were provided by:
# https://github.com/walkerdb/supreme_court_transcripts
# 
# case_summaries.json was provided by:
# https://github.com/walkerdb/supreme_court_transcripts

import json
import os
import shlex
import traceback
from datetime import date

import pandas as pd
import requests

In [3]:
# @sleep_and_retry
# @limits(calls=10, period=10)  # no more than 1 call per second
def get_http_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang a bulk download loop.

    Returns:
        The object produced by ``response.json()``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        Any exception ``response.json()`` raises when the body is not JSON.
    """
    print(f"Getting {url}")
    response = requests.get(url, timeout=timeout)
    return response.json()


def get_case(term, docket):
    """Download one case record plus every oral-argument transcript it links to.

    Returns a ``(docket_data, transcripts)`` pair. ``transcripts`` is an empty
    list when the case record has no ``oral_argument_audio`` entries.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    has_audio = (
        "oral_argument_audio" in docket_data and docket_data["oral_argument_audio"]
    )
    if not has_audio:
        # Nothing to download; hand back the case record with no transcripts.
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    # Each audio entry carries an href pointing at its transcript payload.
    transcripts = [
        get_http_json(link["href"]) for link in docket_data["oral_argument_audio"]
    ]
    return docket_data, transcripts

def getTranscript(transcripts):
    """Flatten transcript payloads into four parallel, per-turn lists.

    Args:
        transcripts: Iterable of transcript payload dicts. Each payload is read
            through ``['transcript']['sections']``, each section through
            ``['turns']`` and each turn through ``['speaker']`` and
            ``['text_blocks']``. Payloads whose ``transcript`` entry is missing
            or empty are skipped.

    Returns:
        A tuple ``(transcript_list, speaker_list, speaker_type_list, time_list)``
        holding one entry per turn, in encounter order:

        * ``transcript_list`` - list of the turn's text-block strings.
        * ``speaker_list`` - speaker name, or ``'<UNK>'`` when the turn has no
          usable speaker record.
        * ``speaker_type_list`` - list of role ``type`` values, or ``['Other']``
          when the speaker has no role list.
        * ``time_list`` - list of ``(start, stop)`` tuples, one per text block.
    """
    transcript_list = []
    speaker_list = []
    speaker_type_list = []
    time_list = []

    for t in transcripts:
        transcript = t.get('transcript') if t else None
        if not transcript:
            # No transcript body for this recording; nothing to flatten.
            continue

        for section in transcript['sections']:
            for turn in section['turns']:
                # The speaker record can be absent or null. Resolve it once so
                # the name lookup and the roles lookup are guarded the same way
                # (previously only the name lookup was protected).
                speaker = turn.get('speaker')
                if not isinstance(speaker, dict):
                    speaker = {}
                speaker_list.append(speaker.get('name', '<UNK>'))

                roles = speaker.get('roles')
                if isinstance(roles, list):
                    speaker_type_list.append([role['type'] for role in roles])
                else:
                    speaker_type_list.append(['Other'])  # Other is most likely Lawyer

                texts = turn['text_blocks']
                transcript_list.append([text['text'] for text in texts])
                time_list.append([(text['start'], text['stop']) for text in texts])

    return transcript_list, speaker_list, speaker_type_list, time_list

def getAudio(transcripts):
    """Collect the audio links referenced by a list of transcript payloads.

    Args:
        transcripts: List of transcript payload dicts; each one's
            ``'media_file'`` entry is expected to be a list of dicts carrying
            an ``'href'``.

    Returns:
        A two-element list ``[num_files, audio_list]`` where ``num_files`` is
        ``len(transcripts)`` and ``audio_list`` holds every ``href`` found.
        Media entries that are ``None`` or have no ``href`` are skipped instead
        of raising, so a payload such as ``{'media_file': [None]}`` simply
        contributes no link.
    """
    audio_list = []
    for t in transcripts:
        # 'media_file' is a list (normally a single entry) and may contain None.
        for media_dict in t.get('media_file') or []:
            if media_dict and media_dict.get('href'):
                audio_list.append(media_dict['href'])
    return [len(transcripts), audio_list]

# transcript and audio EXAMPLE #

In [5]:
# Term and docket number of the example case fetched below.
term, docket = '2020', '18-540'

In [6]:
# Fetch the example case record and the transcripts it links to.
docket_data, transcripts = get_case(term, docket)

Getting https://api.oyez.org/cases/2020/18-540
Getting https://api.oyez.org/case_media/oral_argument_audio/25060


In [7]:
# Flatten the fetched transcripts into four parallel per-turn lists.
transcript_list, speaker_list, speaker_type_list, time_list = getTranscript(transcripts)

In [8]:
# Text blocks of the first three turns.
transcript_list[:3]

[["We'll hear argument first this morning in Case 18-540, Rutledge versus Pharmaceutical Care Management Association. General Bronni."],
 ['Thank you, Mr. Chief Justice, and may it please the Court: Pharmacy Benefit Managers are drug middlemen that reimburse pharmacists for the cost of prescription drugs.',
  "Those reimbursements are frequently below a pharmacist's cost.",
  'That drives pharmacists out of business, and it has left many communities without a pharmacist. Act 900 responded to that practice by regulating what PBMs pay pharmacists.',
  "That response isn't preempted for three reasons.",
  "It doesn't regulate benefits, it doesn't regulate plan administration, and it doesn't regulate -- or discriminate against ERISA entities. First, Act 900 does not regulate benefits.",
  "Instead, it regulates the price of drugs that a plan has already decided to cover. That's rate regulation, and under Travelers, that's not preempted, and that's because cost differences don't force plans

In [9]:
# Show the first three turns of each parallel list, separated by blank lines.
print('Speakers:', speaker_list[:3], end='\n\n')
print('Speaker type (Judge or other):', speaker_type_list[:3], end='\n\n')
print('Times:', time_list[:3])

Speakers: ['John G. Roberts, Jr.', 'Nicholas J. Bronni', 'John G. Roberts, Jr.']

Speaker type (Judge or other): [['scotus_justice'], ['Other'], ['scotus_justice']]

Times: [[(0.07, 10.715)], [(10.715, 20.79), (20.79, 24.88), (24.88, 35.795), (35.795, 38.885), (38.885, 50.685), (50.685, 76.48), (76.48, 83.475), (83.475, 93.265), (93.265, 99.365), (99.365, 105.255), (105.255, 110.625), (110.625, 117.275), (117.275, 120.3), (120.3, 129.79)], [(129.79, 137.26), (137.26, 139.335), (139.335, 142.26), (142.26, 158.985), (158.985, 177.03)]]


## GCP ##

In [15]:
# Get all the terms and dockets from the case_summaries.json file, which is
# expected in the current working directory.
# The file is decoded as UTF-8 explicitly so that loading it does not depend on
# the platform's default text encoding.
with open(os.path.join(os.getcwd(), 'case_summaries.json'), encoding='utf-8') as f:
    data = json.load(f)

case_summaries = pd.DataFrame(data)
# Only the two columns needed to build the API URLs are kept.
case_summaries = case_summaries[['term', 'docket_number']]

In [16]:
# Restrict the first pass to the cases whose term is '2020'.
case_summaries_2020 = case_summaries.loc[case_summaries['term'].eq('2020')]

In [17]:
# Display the cases selected for the 2020 term.
case_summaries_2020

Unnamed: 0,term,docket_number
8079,2020,18-540
8080,2020,19-71
8081,2020,19-368
8082,2020,19-309
8083,2020,18-956


In [18]:
# Download the transcripts of every selected case, keyed by docket number.
data = dict()

case_keys = zip(case_summaries_2020['term'], case_summaries_2020['docket_number'])
for term, docket_number in case_keys:
    docket_data, transcripts = get_case(term, docket_number)
    data[docket_number] = transcripts

Getting https://api.oyez.org/cases/2020/18-540
Getting https://api.oyez.org/case_media/oral_argument_audio/25060
Getting https://api.oyez.org/cases/2020/19-71
Getting https://api.oyez.org/case_media/oral_argument_audio/25058
Getting https://api.oyez.org/cases/2020/19-368
Getting https://api.oyez.org/case_media/oral_argument_audio/25061
Getting https://api.oyez.org/cases/2020/19-309
Getting https://api.oyez.org/case_media/oral_argument_audio/25055
Getting https://api.oyez.org/cases/2020/18-956
Getting https://api.oyez.org/case_media/oral_argument_audio/25068


In [19]:
# Inspect the downloaded transcript payloads, keyed by docket number.
data

{'18-540': [{'id': 25060,
   'title': 'Oral Argument - October 06, 2020',
   'media_file': [None],
   'transcript': {'title': 'LESLIE RUTLEDGE, ATTORNEY GENERAL OF ARKANSAS, Petitioner, v. PHARMACEUTICAL CARE MANAGEMENT ASSOCIATION, Respondent',
    'duration': None,
    'sections': [{'start': 0.07,
      'stop': 1321.435,
      'byte_start': 0,
      'byte_stop': 0,
      'turns': [{'start': 0.07,
        'stop': 10.715,
        'byte_start': 0,
        'byte_stop': 0,
        'speaker': {'ID': 15086,
         'name': 'John G. Roberts, Jr.',
         'href': 'https://api.oyez.org/people/john_g_roberts_jr',
         'view_count': 0,
         'last_name': 'Roberts',
         'roles': [{'id': 2730,
           'type': 'scotus_justice',
           'date_start': 1127970000,
           'date_end': 0,
           'appointing_president': 'George W. Bush',
           'role_title': 'Chief Justice of the United States',
           'institution_name': 'Supreme Court of the United States',
         

In [44]:
# Example: preview the first 500 characters of the transcript for docket '18-540'.
# getTranscript returns a 4-tuple of parallel lists, so slicing its result with
# [:500] only returns that same tuple. Build the flat " <speaker>  text" string
# first, then truncate it.
example_turns, example_speakers, example_types, example_times = getTranscript(data['18-540'])
example_text = ''.join(
    f" <{speaker}>  {' '.join(blocks)}"
    for speaker, blocks in zip(example_speakers, example_turns)
)
example_text[:500]

" <John G. Roberts, Jr.>  We'll hear argument first this morning in Case 18-540, Rutledge versus Pharmaceutical Care Management Association. General Bronni. <Nicholas J. Bronni>  Thank you, Mr. Chief Justice, and may it please the Court: Pharmacy Benefit Managers are drug middlemen that reimburse pharmacists for the cost of prescription drugs. Those reimbursements are frequently below a pharmacist's cost. That drives pharmacists out of business, and it has left many communities without a pharmaci"

In [43]:
# Example: get the first audio link for docket '19-71'.
# getAudio returns [num_files, audio_list], so [1][0] is the first link.
getAudio(data['19-71'])[1][0]

'https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-71/19-71_20201006-argument.delivery.mp3'

In [47]:
# Collect the flattened transcript of every docket in a dictionary.

transcript_data = dict()

for docket, transcript in data.items():
    # `transcript` is the list of payloads stored under this docket.
    script = getTranscript(transcript)
    transcript_data[docket] = script



In [40]:
# Save the first audio link of every docket in a dictionary.

audio_data = {}

for docket, transcript in data.items():
    try:
        s3_link = getAudio(transcript)[1][0]
    except (TypeError, KeyError, IndexError):
        # Some cases don't have a usable audio entry (e.g. 'media_file': [None]),
        # so just skip those. Only these lookup failures are caught, so
        # unrelated errors and Ctrl-C are not silently swallowed.
        continue
    audio_data[docket] = s3_link

    

In [53]:
# Only export transcripts for cases we have mp3s for
valid_mp3_cases = audio_data.keys()
transcript_data_clean = {k: v for k, v in transcript_data.items() if k in valid_mp3_cases}

# Mode 'w' creates transcripts.json when it is missing and overwrites it
# otherwise, so the file does not need to exist beforehand.
with open('transcripts.json', 'w', encoding='utf-8') as f:
    # This writes the entire output on one line;
    # pass indent=4 to json.dump to pretty-print with four spaces per indent.
    json.dump(transcript_data_clean, f)

In [60]:
# Print commands for the GCP terminal: each one streams an mp3 from its source
# URL straight into the bucket as <docket>.mp3.

# Change this if you have to
gcp_bucket = 'split-test-10-11'

for docket, s3_link in audio_data.items():
    destination = 'gs://{}/{}.mp3'.format(gcp_bucket, docket)
    # Both values originate from API data and end up in a shell command, so
    # quote them: a docket number or URL containing spaces or shell
    # metacharacters must not break the pasted command. Plain values such as
    # '19-71' are left unchanged by shlex.quote.
    print('curl -L {} | gsutil cp - {}'.format(shlex.quote(s3_link), shlex.quote(destination)))

curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-71/19-71_20201006-argument.delivery.mp3 | gsutil cp - gs://split-test-10-11/19-71.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-368/19-368_20201007-argument.delivery.mp3 | gsutil cp - gs://split-test-10-11/19-368.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-309/19-309_20201005-argument.delivery.mp3 | gsutil cp - gs://split-test-10-11/19-309.mp3
curl -L https://api.oyez.org/sites/default/files/case_data/2020/18-956/18-956_20201007-argument.delivery.mp3 | gsutil cp - gs://split-test-10-11/18-956.mp3
