In [4]:
import pandas as pd
import os
import re

In [85]:
# Number of directory entries currently present in ./cases.
count = len(os.listdir("./cases"))

print(count)

2540


In [86]:
# Number of directory entries currently present in ./transcripts.
count = len(os.listdir("./transcripts"))

print(count)

2565


In [87]:
# Collect the "<4 digits>.<docket>" prefix of every transcript file name that is
# followed by "-tNN.json"; names that do not fit that shape are skipped.
transcript_pattern = re.compile(r"^\d{4}\.[a-zA-Z0-9_-]+(?=-t\d{2}\.json)")
transcript_dockets = [
    hit.group()
    for hit in (transcript_pattern.search(name) for name in os.listdir("./transcripts"))
    if hit
]

In [88]:
# Collect the "<4 digits>.<docket>" prefix of every case file name that is
# followed by ".json"; names that do not fit that shape are skipped.
case_pattern = re.compile(r"^\d{4}\.[a-zA-Z0-9_-]+(?=\.json)")
cases_dockets = [
    hit.group()
    for hit in (case_pattern.search(name) for name in os.listdir("./cases"))
    if hit
]

In [89]:
import collections

# Compare the raw number of transcript docket ids with the number of distinct ones.
transcript_set = set(transcript_dockets)
print(len(transcript_dockets))
print(len(transcript_set))

# Docket ids that occur more than once in the list.
transcript_tally = collections.Counter(transcript_dockets)
print([docket for docket, seen in transcript_tally.items() if seen > 1])

2565
2540
['2014.14-556', '2011.10-9646', '2018.17-647', '2005.04-473', '2014.13-7120', '2011.10-1491', '1991.91-615', '2015.14-449', '1999.98-6322', '2017.15-1204', '2011.11-393', '1998.98-405', '2013.13-132', '2017.15-1498', '2003.02-1674', '1991.90-1038', '2012.10-930', '2008.08-205', '2005.04-1170', '1991.90-985', '2000.99-1257', '1991.90-857', '2005.04-1360']


In [90]:
import collections

# Compare the raw number of case docket ids with the number of distinct ones.
cases_set = set(cases_dockets)
print(len(cases_dockets))
print(len(cases_set))

# Docket ids that occur more than once in the list.
cases_tally = collections.Counter(cases_dockets)
print([docket for docket, seen in cases_tally.items() if seen > 1])

2540
2540
[]


In [66]:
# How many docket ids appear in both the case set and the transcript set.
len(cases_set & transcript_set)

2540

In [69]:
# Docket ids that have a transcript but no case file.
transcript_set - cases_set

set()

In [70]:
# Docket ids that have a case file but no transcript.
cases_to_grab = cases_set - transcript_set
len(cases_to_grab)

203

In [75]:
import json

# Among the cases that lack a transcript, flag those whose stored case JSON has
# no "oral_argument_audio" key or an empty value under it.
cases_to_remove = []
for case in cases_to_grab:
    case_path = f"./cases/{case}.json"
    with open(case_path) as file:
        docket_data = json.load(file)

    has_audio = "oral_argument_audio" in docket_data and docket_data["oral_argument_audio"]
    if not has_audio:
        cases_to_remove.append(case)

In [84]:
# Delete from ./cases/ every "<docket id>.json" file whose docket id is listed
# in cases_to_remove, printing each path before it is removed.
folder = "./cases/"
# A set makes each membership test O(1) instead of scanning the list per file.
removable = set(cases_to_remove)
for file in os.listdir(folder):
    stem, extension = os.path.splitext(file)
    # Require a real ".json" suffix: slicing off the last five characters would
    # also match (and delete) files with any other five-character ending.
    if extension == ".json" and stem in removable:
        target = os.path.join(folder, file)
        print(target)
        os.remove(target)


./cases/1999.98-9913.json
./cases/2000.00-1210.json
./cases/2015.14-848.json
./cases/2006.06-532.json
./cases/2015.15-833.json
./cases/2011.11-38.json
./cases/1993.92-1949.json
./cases/2007.07-110.json
./cases/2002.02-1295.json
./cases/2009.08-10537.json
./cases/2005.04-1538.json
./cases/2015.15-278.json
./cases/2005.05-101.json
./cases/1993.92-8836.json
./cases/1993.93-8312.json
./cases/2005.04-1095.json
./cases/1995.94-9323.json
./cases/2010.09-940.json
./cases/1998.98-437.json
./cases/1998.98-1071.json
./cases/1996.95-2025.json
./cases/2017.16-273.json
./cases/1998.98-9085.json
./cases/2012.12-1084.json
./cases/2011.11-1179.json
./cases/2003.02-1369.json
./cases/1993.93-70.json
./cases/1996.96-713.json
./cases/2005.05-8400.json
./cases/2012.12-694.json
./cases/1998.98-8952.json
./cases/1999.98-9933.json
./cases/1992.91-2019.json
./cases/1996.96-987.json
./cases/2011.11-74.json
./cases/1993.92-6281.json
./cases/2006.06-641.json
./cases/1996.96-8796.json
./cases/2008.08-5721.json
./ca

In [57]:
# Show the names found in the rerun folder, one per line.
folder_dir = "./rerun_transcripts/"
rerun_names = os.listdir(folder_dir)
if rerun_names:
    print("\n".join(rerun_names))

2017.16-424-t01.json
2017.16-460-t01.json
2017.16-658-t01.json
2017.16-499-t01.json
2017.16-498-t01.json
2017.16-285-t01.json
2017.16-299-t01.json
2017.16-784-t01.json
2017.15-1509-t01.json
2017.16-6855-t01.json
2017.16-969-t01.json
2017.16-6795-t01.json


In [58]:
import shutil

# Move every entry of the rerun folder into the main transcripts folder.
source_path = "./rerun_transcripts"
dest_path = "./transcripts"

# shutil.move() only moves *into* dest_path when it is an existing directory.
# If it were missing, dest_path itself would become the rename target and later
# files may overwrite earlier ones, so make sure the directory exists first.
os.makedirs(dest_path, exist_ok=True)

for file_name in os.listdir(source_path):
    file_path = os.path.join(source_path, file_name)
    shutil.move(file_path, dest_path)

In [49]:
# Grab missing transcripts, if they exist
import json
import requests
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=10, period=10)  # at most 10 calls per 10-second period
def get_http_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its body parsed as JSON.

    The call is rate limited by the ``limits`` decorator (10 calls per
    10-second period); ``sleep_and_retry`` presumably waits and retries once
    that limit is hit -- confirm against the ``ratelimit`` documentation.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        ValueError: If the response body is not valid JSON.
    """
    print(f"Getting {url}")
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

def get_case(term, docket):
    """Download one case record and every transcript it links to.

    Requests ``https://api.oyez.org/cases/{term}/{docket}`` through
    ``get_http_json``. When the record carries a non-empty
    ``"oral_argument_audio"`` entry, the ``"href"`` of each item in it is
    fetched as well, in order.

    Returns:
        A ``(docket_data, transcripts)`` pair. ``transcripts`` is an empty
        list (and a notice is printed) when the record lists no oral
        argument audio.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    if "oral_argument_audio" in docket_data:
        audio_links = docket_data["oral_argument_audio"]
    else:
        audio_links = None

    if not audio_links:
        # Nothing to fetch for this case: hand back the record with no transcripts.
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    transcripts = [get_http_json(link["href"]) for link in audio_links]
    return docket_data, transcripts


def write_case(term, docket, docket_data, transcripts, out_dir="./rerun_cases_and_transcripts"):
    """Write a case record and its transcripts as JSON files in ``out_dir``.

    Files produced:
      * ``{term}.{docket}.json`` holding ``docket_data``
      * ``{term}.{docket}-tNN.json`` for each transcript, where ``NN`` is the
        transcript's 1-based position, zero-padded to two digits

    Args:
        term: First part of the file name.
        docket: Second part of the file name.
        docket_data: JSON-serialisable case record.
        transcripts: Iterable of JSON-serialisable transcripts.
        out_dir: Destination folder; it is created if it does not exist yet.
    """
    # Without this, open() raises FileNotFoundError when the folder is missing.
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, f"{term}.{docket}.json"), "w") as docket_file:
        json.dump(docket_data, docket_file, indent=2)

    for index, transcript in enumerate(transcripts, start=1):
        t_filename = os.path.join(out_dir, f"{term}.{docket}-t{index:0>2d}.json")
        with open(t_filename, "w") as t_file:
            json.dump(transcript, t_file, indent=2)


In [50]:
import traceback

# Try to download every case in cases_to_grab. Cases that come back with
# transcripts are written to disk and recorded in `succesful`; a failure on
# one case is reported and does not stop the remaining ones.
succesful = set()
total = len(cases_to_grab)
count = 0
for count, case in enumerate(cases_to_grab, start=1):
    # Characters 0-3 are taken as the term; everything after index 4 is the docket.
    term, docket = case[:4], case[5:]
    print(f"Trying: {term}/{docket}\t\t{count}/{total}")
    try:
        docket_data, transcripts = get_case(term, docket)
        if transcripts:
            write_case(term, docket, docket_data, transcripts)
            succesful.add((term, docket))
        # With no transcripts there is nothing to write for this case.
    except Exception:
        traceback.print_exc()
        print(f"Failed for {term}/{docket}, continuing anyways")

Trying: 1994/93-1462		1/215
Getting https://api.oyez.org/cases/1994/93-1462
No oral arguments for docket 93-1462
Trying: 2014/14-618		2/215
Getting https://api.oyez.org/cases/2014/14-618
No oral arguments for docket 14-618
Trying: 1992/92-6846		3/215
Getting https://api.oyez.org/cases/1992/92-6846
No oral arguments for docket 92-6846
Trying: 1992/92-409		4/215
Getting https://api.oyez.org/cases/1992/92-409
No oral arguments for docket 92-409
Trying: 1996/96-858		5/215
Getting https://api.oyez.org/cases/1996/96-858
No oral arguments for docket 96-858
Trying: 1994/93-1577		6/215
Getting https://api.oyez.org/cases/1994/93-1577
No oral arguments for docket 93-1577
Trying: 2015/15-648		7/215
Getting https://api.oyez.org/cases/2015/15-648
No oral arguments for docket 15-648
Trying: 2005/05-552		8/215
Getting https://api.oyez.org/cases/2005/05-552
No oral arguments for docket 05-552
Trying: 2013/13-5967		9/215
Getting https://api.oyez.org/cases/2013/13-5967
No oral arguments for docket 13-596