# Code from the GitHub repo #
Run this if we want to obtain the data ourselves; if not, just download and open the JSON files.

In [1]:
import json
from datetime import date
import traceback

import requests

YEARS_TO_GO_BACK = 2


# @sleep_and_retry
# @limits(calls=10, period=10)  # no more than 1 call per second
def get_http_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    print(f"Getting {url}")
    # requests applies no timeout unless one is passed explicitly, so a
    # stalled server would otherwise block this call forever.
    response = requests.get(url, timeout=timeout)
    return response.json()


def get_case(term, docket):
    """Fetch one case record plus every transcript it links to.

    Returns a ``(docket_data, transcripts)`` pair; ``transcripts`` is an
    empty list when the case record has no oral argument audio entries.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    has_audio = (
        "oral_argument_audio" in docket_data
        and docket_data["oral_argument_audio"]
    )
    if not has_audio:
        # Nothing to fetch for this case (yet): report it and hand back
        # an empty transcript list so the caller can decide what to do.
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    # Each audio entry links to a separate resource; fetch them in order.
    transcripts = [
        get_http_json(entry["href"])
        for entry in docket_data["oral_argument_audio"]
    ]
    return docket_data, transcripts

def getTranscript(transcripts):
    """Flatten transcripts into one string of speaker-tagged text.

    Every turn contributes a `` <speaker name> `` tag followed by each of
    its text blocks, each prefixed with a single space. Turns whose speaker
    name cannot be read are tagged `` <UNK> ``.

    Args:
        transcripts: Iterable of dicts shaped like
            ``{'transcript': {'sections': [{'turns': [...]}]}}``.

    Returns:
        The concatenated text, or ``''`` when there are no turns.
    """
    pieces = []
    for t in transcripts:
        for section in t['transcript']['sections']:
            for turn in section['turns']:
                try:
                    speaker = ' <' + turn['speaker']['name'] + '> '
                except (KeyError, TypeError):
                    # Missing 'speaker'/'name' key, or a speaker/name that
                    # is None: fall back to the unknown-speaker tag.
                    speaker = ' <UNK> '
                pieces.append(speaker)
                pieces.extend(' ' + text['text'] for text in turn['text_blocks'])
    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(pieces)

def getAudio(transcripts):
    """Collect the audio links of every transcript.

    Returns a two-item list: the number of transcripts given, followed by
    a flat list holding the ``href`` of each entry under ``media_file``.
    """
    count = len(transcripts)
    # 'media_file' is itself a list of dicts, so flatten across all of them
    # even though a single entry per transcript is the expected case.
    hrefs = [
        entry['href']
        for transcript in transcripts
        for entry in transcript['media_file']
    ]
    return [count, hrefs]

In [35]:
# The Oyez website can't be scraped, so every docket for this run has to be
# listed by hand :(
term = '2019'
dockets = [
    '18-877',
    '17-1623',
    '17-1498',
    '18-882',
    '18-6943',
    '19-631',
    '18-725',
    '17-1618',
]

# docket number -> list of transcript payloads fetched for that docket
data = {}

for docket in dockets:
    case_result = get_case(term, docket)
    docket_data, transcripts = case_result
    data[docket] = transcripts

Getting https://api.oyez.org/cases/2019/18-877
Getting https://api.oyez.org/case_media/oral_argument_audio/24929
Getting https://api.oyez.org/cases/2019/17-1623
No oral arguments for docket 17-1623
Getting https://api.oyez.org/cases/2019/17-1498
Getting https://api.oyez.org/case_media/oral_argument_audio/24948
Getting https://api.oyez.org/cases/2019/18-882
Getting https://api.oyez.org/case_media/oral_argument_audio/24962
Getting https://api.oyez.org/cases/2019/18-6943
Getting https://api.oyez.org/case_media/oral_argument_audio/24943
Getting https://api.oyez.org/cases/2019/19-631
Getting https://api.oyez.org/case_media/oral_argument_audio/24986
Getting https://api.oyez.org/cases/2019/18-725
Getting https://api.oyez.org/case_media/oral_argument_audio/25049
Getting https://api.oyez.org/cases/2019/17-1618
Getting https://api.oyez.org/case_media/oral_argument_audio/25053


In [58]:
# Drop the dockets that cannot be used in the following cells.
for _unusable in (
    '17-1623',  # No oral arguments for docket 17-1623
    '18-725',  # weird audio issues
):
    del data[_unusable]

In [39]:
# Get the transcript for docket '18-877' and preview its first 100 characters
getTranscript(data['18-877'])[:100]

" <John G. Roberts, Jr.>  We'll hear argument next in Case 18-877, Allen versus Cooper. Mr. Shaffer. "

In [57]:
# Get the audio link: getAudio returns [num_files, audio_list], so [1][0]
# is the first link in the list
getAudio(data['17-1618'])[1][0]

'https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/17-1618/17-1618_20191008-argument.delivery.mp3'

In [59]:
# Build a docket -> audio URL lookup

audio_data = {}

for docket, transcript in data.items():
    # getAudio returns [num_files, audio_list]; keep only the first link
    audio_links = getAudio(transcript)[1]
    s3_link = audio_links[0]
    audio_data[docket] = s3_link
    

In [61]:
# Emit one shell command per docket, to be pasted into a GCP terminal

# Destination bucket and prefix; change this if you have to
gcp_bucket = 'oyez-aduio-10-11/data'

for docket, s3_link in audio_data.items():
    # Stream the download straight into the bucket without a local copy
    upload = f' | gsutil cp - gs://{gcp_bucket}/{docket}.mp3'
    print('curl -L ' + s3_link + upload)

curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/18-877/18-877_20191105-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/18-877.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/17-1498/17-1498_20191203-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/17-1498.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/18-882/18-882_20200115-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/18-882.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/18-6943/18-6943_20191204-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/18-6943.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/19-631/19-631_20200506-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/19-631.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2019/17-1618/17-1618_20191008-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/17-1618.mp3
