In [None]:
!pip install requests bs4 pandas numpy datetime urllib3 tqdm reachability rpy2


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Collecting datetime
  Downloading DateTime-5.4-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting reachability
  Downloading reachability-0.1.4-py3-none-any.whl (5.5 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (247 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.3/247.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zope.interface, reachability, datetime, bs4
Successfully installed bs4-0.0.2 datetime-5.4 reachability-0.1.4 zope.interface-6.2


In [None]:
import json
import os
import pickle
import sys
import time
from datetime import datetime, timedelta
from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as beautifulsoup
from reachability import reachability
from tqdm import tqdm

# Base URL of the CourtListener REST API (v3); endpoint names are appended to it.
url = "https://www.courtlistener.com/api/rest/v3/"

# The API token is a credential and must not live in the notebook source.
# It is read from the environment so the notebook can be shared or committed safely.
# NOTE(review): any token that was ever committed in this file should be treated as
# compromised and rotated.
api_key = os.environ.get("COURTLISTENER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the COURTLISTENER_API_KEY environment variable to your "
        "CourtListener API token before running this notebook."
    )

def _opinion_id_from_url(opinion_url):
    """Return the integer ID that ends a resource URL, with or without a trailing slash."""
    return int(opinion_url.rstrip("/").rsplit("/", 1)[-1])


def get_data_from_clusters(api_key, url, request_timeout=60, retry_delay=180,
                           offline_delay=120, page_delay=1):
    """Download all matching cluster records from the ``clusters/`` endpoint.

    The query asks for SCOTUS clusters filed after 2019-01-01 that have at least
    one sub-opinion with an author, ordered by ``id``. Paging is done by
    re-issuing the request with ``id__gt`` set to the last ID received, so a
    retried request always resumes exactly where the previous page ended.

    Each cluster's ``sub_opinions`` list of URLs is replaced in place by the list
    of integer opinion IDs taken from the end of each URL.

    Args:
        api_key: API token sent as ``Authorization: Token <api_key>``.
        url: Base API URL ending in ``/``; ``"clusters/"`` is appended to it.
        request_timeout: Seconds passed to ``requests.get(timeout=...)`` so a
            stalled connection cannot hang the cell forever.
        retry_delay: Seconds to wait before retrying after a retryable HTTP
            error or a request timeout.
        offline_delay: Seconds between connectivity checks while offline.
        page_delay: Seconds to pause between successful pages (politeness delay).

    Returns:
        A list of cluster dicts as returned by the API (with ``sub_opinions``
        converted to integer IDs). If an unexpected error occurs, the error is
        printed and the clusters collected so far are returned, so the result
        may be partial.

    Raises:
        requests.exceptions.HTTPError: for a 4xx response other than 408/429.
            Such errors (bad token, bad query, ...) cannot succeed on retry.
    """
    url = url + "clusters/"
    headers = {"Authorization": f"Token {api_key}"}
    params = {
        "dockets__court": "scotus",
        "id__gt": 0,  # Start from the first ID
        "date_filed__gt": "2019-01-01",
        "sub_opinions__isnull": False,
        "sub_opinions__author__isnull": False,
        "order_by": "id",
        "fields": "id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions"
    }

    clusters_data = []
    with tqdm(desc="Fetching clusters data", unit="page") as pbar:
        while True:
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    # Pre-encode so the commas in "fields" are sent literally.
                    params=urlencode(params, safe=","),
                    timeout=request_timeout,
                )
                response.raise_for_status()
                data = response.json()
                results = data["results"]
                for cluster in results:
                    cluster["sub_opinions"] = [
                        _opinion_id_from_url(sub_opinion)
                        for sub_opinion in cluster["sub_opinions"]
                    ]
                clusters_data.extend(results)
                pbar.update(1)  # count every fetched page, including the last
                # An empty page cannot advance the cursor, so stop there as well.
                if not data["next"] or not results:
                    break
                params["id__gt"] = results[-1]["id"]
                time.sleep(page_delay)  # Add a delay to avoid being banned
            except requests.exceptions.HTTPError as err:
                status = getattr(err.response, "status_code", None)
                # Client errors other than timeout/throttling will fail again
                # identically; surface them instead of retrying forever.
                if status is not None and 400 <= status < 500 and status not in (408, 429):
                    raise
                print(err)
                time.sleep(retry_delay)
                continue
            except requests.exceptions.Timeout as err:
                # Listed before ConnectionError: a connect timeout is both.
                print(err)
                time.sleep(retry_delay)
                continue
            except requests.exceptions.ConnectionError as err:
                reachable = reachability(timeout=None)
                while not reachable.is_online():
                    print("Waiting for internet connection...")
                    time.sleep(offline_delay)
                continue
            except Exception as err:
                # Best effort: keep what was collected so far.
                print("Other error: ", err)
                break
    return clusters_data

# Download every matching cluster from the API (slow: one throttled request per page).
clusters_data = get_data_from_clusters(api_key=api_key, url=url)

# Persist the downloaded list to disk in binary pickle format.
with open('clusters_data.pickle', 'wb') as pickle_file:
    pickle.dump(clusters_data, pickle_file)

# Confirm where the data was written.
print("Clusters data has been pickled and saved to 'clusters_data.pickle'.")

Fetching clusters data: 121page [09:29,  4.33s/page]

502 Server Error: Bad Gateway for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=4618637&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 889page [1:05:19,  3.72s/page]

504 Server Error: Gateway Time-out for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=5287107&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 963page [1:13:06,  3.75s/page]

504 Server Error: Gateway Time-out for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=5793813&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 1106page [1:24:50,  3.47s/page]

504 Server Error: Gateway Time-out for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=6481434&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 1305page [1:35:08,  1.74s/page]

502 Server Error: Bad Gateway for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=9381892&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 1327page [1:38:55,  1.84s/page]

504 Server Error: Gateway Time-out for url: https://www.courtlistener.com/api/rest/v3/clusters/?dockets__court=scotus&id__gt=9388714&date_filed__gt=2019-01-01&sub_opinions__isnull=False&sub_opinions__author__isnull=False&order_by=id&fields=id,case_name,date_filed,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,sub_opinions


Fetching clusters data: 1544page [1:48:20,  4.21s/page]

Clusters data has been pickled and saved to 'clusters_data.pickle'.



