In [None]:
# Scrape case details from https://supreme.justia.com

In [1]:
import logging

In [2]:
import re
import requests
from bs4 import BeautifulSoup

In [3]:
import logging
logging.basicConfig(level=logging.INFO)

In [4]:
# Scraper for data from https://supreme.justia.com
# -------------

In [5]:
def extract_case_urls_for_year(year):
    base_url = 'https://supreme.justia.com/cases/federal/us/year/{}.html'

    # Updated regex pattern to match specific case URLs
    pattern = re.compile(r"/cases/federal/us/\d+/\d+-\d+/")

    all_case_urls = []

    url = base_url.format(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for a in soup.find_all('a', href=True):
        if pattern.match(a['href']):
            full_url = "https://supreme.justia.com" + a['href']
            all_case_urls.append(full_url)

    return all_case_urls


In [13]:
#extract_case_urls_for_year('2020')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): supreme.justia.com:443
DEBUG:urllib3.connectionpool:https://supreme.justia.com:443 "GET /cases/federal/us/year/2020.html HTTP/1.1" 200 None


['https://supreme.justia.com/cases/federal/us/592/20-366/',
 'https://supreme.justia.com/cases/federal/us/592/20-366/',
 'https://supreme.justia.com/cases/federal/us/592/19-1302/',
 'https://supreme.justia.com/cases/federal/us/592/19-1302/',
 'https://supreme.justia.com/cases/federal/us/592/19-108/',
 'https://supreme.justia.com/cases/federal/us/592/19-108/',
 'https://supreme.justia.com/cases/federal/us/592/19-71/',
 'https://supreme.justia.com/cases/federal/us/592/19-71/',
 'https://supreme.justia.com/cases/federal/us/592/19-309/',
 'https://supreme.justia.com/cases/federal/us/592/19-309/',
 'https://supreme.justia.com/cases/federal/us/592/18-540/',
 'https://supreme.justia.com/cases/federal/us/592/18-540/',
 'https://supreme.justia.com/cases/federal/us/592/19-1108/',
 'https://supreme.justia.com/cases/federal/us/592/19-1108/',
 'https://supreme.justia.com/cases/federal/us/592/19-1261/',
 'https://supreme.justia.com/cases/federal/us/592/19-1261/',
 'https://supreme.justia.com/cases/f

In [6]:
def extract_case_urls(years):
    urls = []
    for year in years:
        logging.debug(f"year: {year}")
        _urls = extract_case_urls_for_year(year)
        logging.debug(_urls)
        urls.extend(_urls)
    return urls

In [9]:
#u = extract_case_urls([2020, 2021, 2022])

In [None]:
def extract_case_url_for_year(year):
    base_url = 'https://supreme.justia.com/cases/federal/us/year/{}.html'

    # Updated regex pattern to match specific case URLs
    pattern = re.compile(r"/cases/federal/us/\d+/\d+-\d+/")

    all_case_urls = []

    url = base_url.format(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for a in soup.find_all('a', href=True):
        if pattern.match(a['href']):
            full_url = "https://supreme.justia.com" + a['href']
            all_case_urls.append(full_url)

    return all_case_urls


In [10]:
def scrape_contestants(soup):
    # Get the webpage content

    # Find the <h1> tag by its specific attributes
    h1_tag = soup.find('h1', attrs={"id": "text-a", "class": "heading-1 has-margin-top-10 has-margin-bottom-10"})

    if not h1_tag:
        return "Case title not found"

    # Use regular expression to match the pattern "Word v. Word"
    match = re.search(r'([\w\s]+)\sv\.\s([\w\s]+)', h1_tag.text)
    if match:
        return match.group(1).strip() + " v. " + match.group(2).strip()
    else:
        return "Pattern not found in title!"

In [11]:
def scrape_syllabus(soup):
    # Find the syllabus section by div id (assuming id="diminished-text-3" based on the provided example)
    syllabus_div = soup.find('div', {'id': 'diminished-text-3'})

    if not syllabus_div:
        return "Syllabus not found"

    # Find the <em>Held</em> tag
    held_tag = syllabus_div.find('em', string=lambda x: x and 'Held' in x)

    if held_tag:
        # Create an empty list to store paragraphs before the "Held" tag
        paragraphs_before_held = []

        # Iterate over previous siblings of the "Held" tag to get content before it
        for sib in held_tag.find_all_previous():
            if sib.name == 'p':
                paragraphs_before_held.insert(0, sib.get_text(strip=True))  # Prepend to keep order
            elif sib == held_tag:  # Stop once we reach the 'Held' tag itself
                break

        syllabus_content = " ".join(paragraphs_before_held[:-1])  # Excluding the last paragraph which contains 'Held:'
    else:
        # If no held tag, extract entire syllabus
        syllabus_content = syllabus_div.get_text(strip=True)

    return syllabus_content

In [12]:
def get_case_details(url):
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request was unsuccessful

    # Parse the webpage using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    contestants = scrape_contestants(soup)
    syllabus = scrape_syllabus(soup)
    
    return {'contestants':contestants, 'syllabus':syllabus}

In [14]:
#get_case_details('https://supreme.justia.com/cases/federal/us/592/20-366/')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): supreme.justia.com:443
DEBUG:urllib3.connectionpool:https://supreme.justia.com:443 "GET /cases/federal/us/592/20-366/ HTTP/1.1" 200 None


In [16]:
#get_case_details('https://supreme.justia.com/cases/federal/us/591/19-631/')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): supreme.justia.com:443
DEBUG:urllib3.connectionpool:https://supreme.justia.com:443 "GET /cases/federal/us/591/19-631/ HTTP/1.1" 200 None


{'contestants': 'Barr v. American Association of Political Consultants',
 'syllabus': 'The Telephone Consumer Protection Act of 1991 prohibits almost all robocalls to cell phones, 47 U.S.C. 227(b)(1)(A)(iii). A 2015 amendment created an exception that allows robocalls made solely to collect a debt owed to or guaranteed by the United States, 129 Stat. 588. The Fourth Circuit concluded that the government-debt exception was a content-based speech restriction that could not withstand strict scrutiny and was severable from the robocall restriction. The Supreme Court affirmed. Under the Free Speech Clause, the government generally has no power to restrict expression because of its message, its ideas, its subject matter, or its content. Content-based laws are subject to strict scrutiny. The government-debt exception is content-based because it favors speech made for the purpose of collecting government debt over political and other speech. The exception does not draw distinctions based on sp