# Code from the GitHub #
Run this if we want to obtain the data ourselves; otherwise, just download and open the JSON files.

In [1]:
import json
from datetime import date
import traceback

import requests

import os
import pandas as pd

In [2]:
# @sleep_and_retry
# @limits(calls=10, period=10)  # no more than 1 call per second
def get_http_json(url, timeout=30):
    """GET ``url`` and return the response body parsed as JSON.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang any loop calling this helper.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
        ValueError: (or a subclass) when the body is not valid JSON.
    """
    print(f"Getting {url}")
    response = requests.get(url, timeout=timeout)
    return response.json()


def get_case(term, docket):
    """Fetch one case record and every transcript it links to.

    The case record is requested from
    ``https://api.oyez.org/cases/{term}/{docket}``; the ``href`` of each
    entry under its ``oral_argument_audio`` key is then fetched in order.

    Returns:
        A ``(docket_data, transcripts)`` pair. ``transcripts`` is an empty
        list when the record has no ``oral_argument_audio`` key or the
        value under that key is empty.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    # Guard clause: nothing to download when no oral argument is listed.
    # The case record is still returned so the caller can inspect it.
    if (
        "oral_argument_audio" not in docket_data
        or not docket_data["oral_argument_audio"]
    ):
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    transcripts = [
        get_http_json(audio_link["href"])
        for audio_link in docket_data["oral_argument_audio"]
    ]
    return docket_data, transcripts

def getTranscript(transcripts):
    """Flatten transcript objects into one speaker-tagged string.

    Args:
        transcripts: Iterable of dicts. Each one is read through
            ``['transcript']['sections']``; every section through
            ``['turns']``; every turn through ``['text_blocks']`` (dicts
            with a ``'text'`` key) and ``['speaker']['name']``.

    Returns:
        A single string in which every turn contributes ``' <name> '``
        followed by each of its text blocks prefixed with one space.
        Turns whose speaker name cannot be read are tagged ``' <UNK> '``.
        An empty input yields ``''``.

    Raises:
        KeyError, TypeError: When a transcript, section, turn or text block
            lacks the expected structure (the speaker lookup excepted).
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for t in transcripts:
        for section in t['transcript']['sections']:
            for turn in section['turns']:
                try:
                    speaker = ' <' + turn['speaker']['name'] + '> '
                except (KeyError, TypeError):
                    # Missing key, or a null/non-string speaker or name.
                    # Narrower than a bare except so that interrupts and
                    # unrelated errors are no longer swallowed.
                    speaker = ' <UNK> '
                parts.append(speaker)
                parts.extend(' ' + text['text'] for text in turn['text_blocks'])
    return ''.join(parts)

def getAudio(transcripts):
    """Collect the audio links referenced by a list of transcript objects.

    Each transcript's ``'media_file'`` entry is iterated (it is a list,
    normally with a single element) and the ``'href'`` of every item is
    gathered in order.

    Returns:
        ``[num_files, audio_list]`` where ``num_files`` is the number of
        transcripts passed in and ``audio_list`` holds every ``href`` found.
    """
    transcript_count = len(transcripts)
    hrefs = [
        media['href']
        for transcript in transcripts
        for media in transcript['media_file']
    ]
    return [transcript_count, hrefs]

In [17]:
# Get all the terms and dockets from the case_summaries.json file located in
# the current working directory.
# The encoding is given explicitly because the platform default used by
# open() is not guaranteed to be UTF-8.
with open(os.path.join(os.getcwd(), 'case_summaries.json'), encoding='utf-8') as f:
    data = json.load(f)

# Only the term and docket number columns are kept.
case_summaries = pd.DataFrame(data)[['term', 'docket_number']]

In [27]:
# Begin with the 2020 term only (the term column is compared to the string '2020')
is_2020 = case_summaries['term'] == '2020'
case_summaries_2020 = case_summaries.loc[is_2020]

In [30]:
# Display the 2020 subset (shown as this cell's output)
case_summaries_2020

Unnamed: 0,term,docket_number
8079,2020,18-540
8080,2020,19-71
8081,2020,19-368
8082,2020,19-309
8083,2020,18-956


In [34]:
# Download every 2020 case and keep its transcripts, keyed by docket number.
# NOTE: this rebinds `data`, which previously held the raw case summaries.
data = {}

for term, docket_number in case_summaries_2020.itertuples(index=False):
    # get_case returns (case record, transcripts); only the transcripts are kept
    data[docket_number] = get_case(term, docket_number)[1]

Getting https://api.oyez.org/cases/2020/18-540
Getting https://api.oyez.org/case_media/oral_argument_audio/25060
Getting https://api.oyez.org/cases/2020/19-71
Getting https://api.oyez.org/case_media/oral_argument_audio/25058
Getting https://api.oyez.org/cases/2020/19-368
Getting https://api.oyez.org/case_media/oral_argument_audio/25061
Getting https://api.oyez.org/cases/2020/19-309
Getting https://api.oyez.org/case_media/oral_argument_audio/25055
Getting https://api.oyez.org/cases/2020/18-956
Getting https://api.oyez.org/case_media/oral_argument_audio/25068


In [44]:
# Example: first 500 characters of the transcript for docket '18-540'
getTranscript(data['18-540'])[:500]

" <John G. Roberts, Jr.>  We'll hear argument first this morning in Case 18-540, Rutledge versus Pharmaceutical Care Management Association. General Bronni. <Nicholas J. Bronni>  Thank you, Mr. Chief Justice, and may it please the Court: Pharmacy Benefit Managers are drug middlemen that reimburse pharmacists for the cost of prescription drugs. Those reimbursements are frequently below a pharmacist's cost. That drives pharmacists out of business, and it has left many communities without a pharmaci"

In [43]:
# Example: first audio link for docket '19-71'
# (getAudio returns [num_files, audio_list], so [1][0] is the first link)
getAudio(data['19-71'])[1][0]

'https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-71/19-71_20201006-argument.delivery.mp3'

In [47]:
# Build a {docket: transcript text} mapping for every downloaded case
transcript_data = {
    docket: getTranscript(case_transcripts)
    for docket, case_transcripts in data.items()
}


In [40]:
# Save the first audio link of each case in a dictionary

audio_data = {}

for docket, case_transcripts in data.items():
    try:
        # getAudio returns [num_files, audio_list]; [1][0] is the first link
        s3_link = getAudio(case_transcripts)[1][0]
    except (IndexError, KeyError, TypeError):
        # Some cases don't have an audio file, so just skip those.
        # Only data-shape errors are caught (empty link list, missing key,
        # null entry); interrupts and unrelated bugs are no longer hidden.
        continue
    audio_data[docket] = s3_link
    

In [53]:
# Only export transcripts for cases we have mp3s for
valid_mp3_cases = audio_data.keys()
transcript_data_clean = {k: v for k, v in transcript_data.items() if k in valid_mp3_cases}

# Write mode creates transcripts.json when it is missing and truncates it
# otherwise, so the file does not need to exist beforehand.
with open('transcripts.json', 'w', encoding='utf-8') as f:
    # This places the entire output on one line;
    # use json.dump(transcript_data_clean, f, indent=4) to "pretty-print"
    # with four spaces per indent.
    json.dump(transcript_data_clean, f)

In [41]:
# Print commands for GCP terminal
import shlex

# Change this if you have to
gcp_bucket = 'oyez-aduio-10-11/data'

for docket, s3_link in audio_data.items():
    # The links and docket numbers come from an external API and the printed
    # line is meant to be pasted into a shell, so both arguments are quoted.
    # shlex.quote leaves strings made only of shell-safe characters unchanged.
    source = shlex.quote(s3_link)
    destination = shlex.quote('gs://{}/{}.mp3'.format(gcp_bucket, docket))
    print('curl -L ' + source + ' | gsutil cp - ' + destination)

curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-71/19-71_20201006-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/19-71.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-368/19-368_20201007-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/19-368.mp3
curl -L https://s3.amazonaws.com/oyez.case-media.mp3/case_data/2020/19-309/19-309_20201005-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/19-309.mp3
curl -L https://api.oyez.org/sites/default/files/case_data/2020/18-956/18-956_20201007-argument.delivery.mp3 | gsutil cp - gs://oyez-aduio-10-11/data/18-956.mp3
