In [None]:
!pip install requests bs4 pandas numpy datetime urllib3 tqdm reachability rpy2


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Collecting datetime
  Downloading DateTime-5.4-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting reachability
  Downloading reachability-0.1.4-py3-none-any.whl (5.5 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (247 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.3/247.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zope.interface, reachability, datetime, bs4
Successfully installed bs4-0.0.2 datetime-5.4 reachability-0.1.4 zope.interface-6.2


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper). Later cells read
# and write pickle files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import sys
from getpass import getpass

# Base URL of the CourtListener REST API (v3); endpoint names such as
# "people/" are appended to it by the fetch functions.
url = "https://www.courtlistener.com/api/rest/v3/"


def load_courtlistener_api_key(env_var="COURTLISTENER_API_KEY"):
    """Return the CourtListener API token without storing it in the notebook.

    The token is looked up in the environment variable ``env_var`` first. If
    it is missing and the session is interactive (Jupyter/Colab kernel or a
    terminal), the user is prompted for it with hidden input.

    Args:
        env_var: Name of the environment variable that holds the token.

    Returns:
        The token with surrounding whitespace removed, or an empty string
        (after printing a warning) when no token could be obtained.
    """
    token = os.environ.get(env_var, "").strip()
    if token:
        return token

    # Only prompt when somebody can actually answer; a batch run must not block.
    interactive = "ipykernel" in sys.modules or (
        sys.stdin is not None and sys.stdin.isatty()
    )
    if interactive:
        try:
            return getpass(f"CourtListener API token ({env_var} is not set): ").strip()
        except EOFError:
            pass

    print(f"WARNING: no CourtListener API token available; set {env_var} before making API requests.")
    return ""


# SECURITY: a literal token used to be assigned here. Anything that was ever
# saved in a shared notebook should be treated as leaked and revoked.
api_key = load_courtlistener_api_key()

In [None]:
import pickle
import requests
from urllib.parse import urlencode
from tqdm import tqdm
import time
from bs4 import BeautifulSoup as beautifulsoup
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
from reachability import Reachability
import os
import multiprocessing

# Retry policy for CourtListener requests.
_MAX_ATTEMPTS = 20                    # with the 3-minute wait below: up to roughly an hour of retries
_HTTP_RETRY_WAIT_SECONDS = 180        # pause after a transient HTTP error (rate limit / gateway error)
_CONNECTION_RETRY_WAIT_SECONDS = 30   # pause after a connection error or timeout
_OFFLINE_POLL_SECONDS = 120           # how often to re-check connectivity while offline
_REQUEST_TIMEOUT_SECONDS = 60         # never let a worker hang forever on a dead socket
_TRANSIENT_STATUS_CODES = frozenset({408, 429, 500, 502, 503, 504})


def _is_transient_http_error(err):
    """Return True when an HTTPError is worth retrying (throttling or a server-side failure)."""
    response = getattr(err, "response", None)
    status_code = getattr(response, "status_code", None)
    # An error without a usable status code is retried, as the original code did.
    return status_code is None or status_code in _TRANSIENT_STATUS_CODES


def _wait_until_online():
    """Block until the Reachability probe reports that the machine is online."""
    reachable = Reachability(timeout=None)
    while not reachable.is_online():
        print("Waiting for internet connection...")
        time.sleep(_OFFLINE_POLL_SECONDS)


def _holds_scotus_position(author, headers):
    """Return True if any position record linked from ``author`` has court id "scotus"."""
    for position_url in author.get("positions") or []:
        position_response = requests.get(
            position_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        position_response.raise_for_status()
        position_data = position_response.json()
        try:
            if position_data["court"]["id"] == "scotus":
                return True
        except (KeyError, TypeError):
            # Position without an expanded court object: ignore it.
            continue
    return False


def _lookup_scotus_author(request_url, headers):
    """Run the people query once; return the first SCOTUS justice summary or None."""
    response = requests.get(request_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    for author in response.json()["results"]:
        if not isinstance(author, dict):
            continue
        if not _holds_scotus_position(author, headers):
            continue
        print("SCOTUS")
        political_affiliations = author.get("political_affiliations", [])
        last_affiliation = political_affiliations[-1] if political_affiliations else {}
        return {
            "id": author.get("id"),
            "name_first": author.get("name_first"),
            "name_middle": author.get("name_middle"),
            "name_last": author.get("name_last"),
            "name_suffix": author.get("name_suffix"),
            "date_dob": author.get("date_dob"),
            "political_affiliation": last_affiliation,
        }
    return None


def fetch_author_data(args):
    """Fetch one author and return a summary dict if they held a SCOTUS position.

    Designed as a ``multiprocessing`` worker, hence the single tuple argument.

    Args:
        args: Tuple ``(author_id, api_key, url, headers)``. ``url`` is the
            people endpoint and ``headers`` is sent with every request.
            ``api_key`` is part of the tuple but is not read here.

    Returns:
        A dict with the keys ``id``, ``name_first``, ``name_middle``,
        ``name_last``, ``name_suffix``, ``date_dob`` and
        ``political_affiliation`` (the last listed affiliation, or ``{}``)
        for the first returned author with a position whose court id is
        "scotus". ``None`` if there is no such author, if the request fails
        permanently, or if the retry budget is exhausted.

    Failure handling:
        * Transient HTTP errors (429, 5xx, 408) are retried after a 3-minute
          pause, at most ``_MAX_ATTEMPTS`` times in total. Other HTTP errors
          (e.g. 401/403/404) can never succeed on retry, so they return
          ``None`` immediately instead of looping forever.
        * Connection errors and timeouts wait until the machine is online
          again, pause briefly (so an unreachable host is not hammered in a
          tight loop) and retry within the same attempt budget.
        * Any other exception is reported and yields ``None``.
    """
    author_id, api_key, url, headers = args
    params = {
        "id": author_id,
        "fields": "political_affiliations,name_first,name_middle,name_last,name_suffix,date_dob,id,positions",
    }
    request_url = f"{url}?{urlencode(params, safe=',')}"

    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            return _lookup_scotus_author(request_url, headers)
        except requests.exceptions.HTTPError as err:
            print(err)
            if not _is_transient_http_error(err):
                return None
            wait_seconds = _HTTP_RETRY_WAIT_SECONDS
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as err:
            print(err)
            _wait_until_online()
            wait_seconds = _CONNECTION_RETRY_WAIT_SECONDS
        except Exception as err:
            print("Other error: ", err)
            return None

        if attempt < _MAX_ATTEMPTS:
            time.sleep(wait_seconds)

    print(f"Giving up on author {author_id} after {_MAX_ATTEMPTS} attempts")
    return None

def get_data_from_authors(api_key, url, author_ids, max_queries_per_hour=2000):
    """Fetch SCOTUS-justice summaries for the given authors in parallel.

    Each author ID is handed to ``fetch_author_data`` in a
    ``multiprocessing.Pool`` worker; results come back in input order and
    authors for which the worker returned ``None`` are dropped.

    Args:
        api_key: API token, sent as ``Authorization: Token <api_key>``.
        url: Base API URL; ``"people/"`` is appended to it.
        author_ids: Iterable of author IDs to look up.
        max_queries_per_hour: Accepted for backward compatibility only; no
            rate limiting is applied here (failed requests are retried
            inside ``fetch_author_data``).

    Returns:
        List of the non-``None`` dicts returned by ``fetch_author_data``.
    """
    author_ids = list(author_ids)
    if not author_ids:
        # Nothing to fetch: do not pay for spawning a process pool.
        return []

    people_url = url + "people/"
    headers = {"Authorization": f"Token {api_key}"}
    tasks = [(author_id, api_key, people_url, headers) for author_id in author_ids]

    with multiprocessing.Pool() as pool:
        # imap (not imap_unordered) keeps results aligned with author_ids.
        results = list(
            tqdm(
                pool.imap(fetch_author_data, tasks),
                total=len(tasks),
                desc="Fetching authors data",
                unit="author",
            )
        )

    return [result for result in results if result is not None]

# Read back the opinions produced by an earlier step.
# NOTE: unpickling runs arbitrary code, so only load files you created yourself.
with open('/content/drive/MyDrive/opinions_data.pickle', 'rb') as opinions_file:
    opinions_data = pickle.load(opinions_file)

# Distinct author IDs in sorted order, skipping opinions that have no
# "author_id" key or whose value is None / the string "None".
author_ids = sorted({
    opinion["author_id"]
    for opinion in opinions_data
    if "author_id" in opinion and opinion["author_id"] not in (None, "None")
})

# Look every author up through the API (network-bound, takes minutes).
authors_data = get_data_from_authors(
    api_key,
    url,
    author_ids,
    max_queries_per_hour=2000,
)

# Persist the result to Drive next to the opinions pickle.
with open('/content/drive/MyDrive/authors_data.pickle', 'wb') as authors_file:
    pickle.dump(authors_data, authors_file)

print("Fetched authors data saved as 'authors_data.pickle'")

Fetching authors data:   0%|          | 0/273 [00:00<?, ?author/s]

SCOTUS


Fetching authors data:   2%|▏         | 5/273 [00:09<09:20,  2.09s/author]

SCOTUS


Fetching authors data:   5%|▌         | 15/273 [00:26<06:22,  1.48s/author]

SCOTUS


Fetching authors data:   7%|▋         | 18/273 [00:33<08:14,  1.94s/author]

SCOTUS


Fetching authors data:   9%|▉         | 24/273 [00:41<05:45,  1.39s/author]

SCOTUS


Fetching authors data:   9%|▉         | 25/273 [00:44<07:33,  1.83s/author]

SCOTUS


Fetching authors data:  10%|▉         | 27/273 [00:48<07:02,  1.72s/author]

SCOTUS


Fetching authors data:  14%|█▎        | 37/273 [01:05<05:27,  1.39s/author]

SCOTUS


Fetching authors data:  15%|█▍        | 40/273 [01:11<05:49,  1.50s/author]

SCOTUS


Fetching authors data:  16%|█▌        | 43/273 [01:15<05:40,  1.48s/author]

SCOTUS


Fetching authors data:  48%|████▊     | 130/273 [02:15<01:19,  1.79author/s]

502 Server Error: Bad Gateway for url: https://www.courtlistener.com/api/rest/v3/people/?id=7881&fields=political_affiliations,name_first,name_middle,name_last,name_suffix,date_dob,id,positions


Fetching authors data:  48%|████▊     | 132/273 [02:16<01:09,  2.02author/s]

502 Server Error: Bad Gateway for url: https://www.courtlistener.com/api/rest/v3/people/?id=8041&fields=political_affiliations,name_first,name_middle,name_last,name_suffix,date_dob,id,positions


Fetching authors data:  80%|████████  | 219/273 [05:57<00:54,  1.02s/author]

SCOTUS


Fetching authors data:  82%|████████▏ | 225/273 [06:10<01:29,  1.87s/author]

SCOTUS


Fetching authors data: 100%|██████████| 273/273 [06:52<00:00,  1.51s/author]


Fetched authors data saved as 'authors_data.pickle'
