<a href="https://colab.research.google.com/github/shantanudeshp/llm-scotus/blob/20230904-wip/llm_scotus_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai

In [None]:
import os
import openai
from getpass import getpass

In [None]:
# Ask for the OpenAI API key interactively (getpass does not echo the input)
# rather than hard-coding it in the notebook. Read later when configuring `openai`.
openai_key = getpass('Enter the openai key')

In [None]:

# Configure the module-level OpenAI client with an organization and the key
# entered via getpass.
# NOTE(review): the organization ID is hard-coded here; consider reading it
# from the environment or a prompt, like the key.
openai.organization = "org-YjWOHW4W2uUvobjJfFBOBkU8"
openai.api_key = openai_key  # os.getenv("OPENAI_API_KEY")
# As the last expression in the cell, the model listing is displayed by the
# notebook. Presumably this is a check that the credentials work; confirm.
openai.Model.list()

In [None]:
# Task description for the model. get_verdict passes this as the
# {instructions} field when it fills template1 for each case.
instructions = '''The following is a brief outline of a case. Predict the verdict of the Supreme Court by responding with the name of the Contestant they would most likely side with.
For example:
If the contestants are Reagan v. Alabama, respond with either Reagan or Alabama.
If the contestants are Lubkowitz v. Smith, respond with either Lubkowitz or Smith.
If the contestants are Department of Defense v. Higgins, respond with either Department of Defense or Higgins.'''

In [None]:
# Prompt skeleton filled with str.format by get_verdict. The three named
# placeholders are {instructions}, {case_name} and {case_summary}.
# The text ends on "Verdict:", presumably so the completion model continues
# with the predicted party name; confirm.
template1 = """
{instructions}

Case: {case_name}

Summary: {case_summary}

Verdict:
"""

# Show the raw template (placeholders still unfilled) for inspection.
print(template1)

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_contestants(url, timeout=30):
    """Fetch a case page and return its caption as ``"<party> v. <party>"``.

    Args:
        url: Address of the case page to download.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The caption string. Two sentinel strings are kept for backward
        compatibility: ``"Case title not found"`` when the expected ``<h1>``
        is absent, and ``"Pattern not found in title!"`` when the heading
        contains no ``" v. "`` separator.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error if the request was unsuccessful

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the heading by the id/class combination used on the case pages.
    h1_tag = soup.find('h1', attrs={"id": "text-a", "class": "heading-1 has-margin-top-10 has-margin-bottom-10"})

    if h1_tag is None:
        return "Case title not found"

    # Collapse newlines and repeated spaces so the caption is a single line.
    title = " ".join(h1_tag.get_text().split())

    # Strip a trailing reporter citation such as ", 597 U.S. ___ (2022)".
    # NOTE(review): assumes the heading may end with a "<volume> U.S." citation;
    # confirm against live pages. Headings without one are left untouched.
    title = re.sub(r",\s*\d+\s+U\.\s*S\..*$", "", title)

    # Split on the first " v. ". Unlike a [\w\s]+ character class, this keeps
    # party names containing apostrophes, ampersands, periods or commas
    # (e.g. "Jackson Women's Health Organization", "Assn., Inc.").
    match = re.search(r"^(.+?)\s+v\.\s+(.+)$", title)
    if match:
        return match.group(1).strip() + " v. " + match.group(2).strip()
    return "Pattern not found in title!"

In [None]:
def scrape_syllabus(url, timeout=30):
    """Fetch a case page and return the syllabus text that precedes "Held".

    Args:
        url: Address of the case page to download.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The text of the syllabus paragraphs that come before the ``<em>``
        element containing "Held", joined with single spaces. The paragraph
        that encloses the "Held" marker is excluded. If no such marker
        exists, the whole syllabus text is returned. If the syllabus
        container is missing, the sentinel string ``"Syllabus not found"``
        is returned (kept for backward compatibility).

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error if the request was unsuccessful

    soup = BeautifulSoup(response.content, 'html.parser')

    # The syllabus is looked up by div id="diminished-text-3".
    syllabus_div = soup.find('div', {'id': 'diminished-text-3'})
    if syllabus_div is None:
        return "Syllabus not found"

    # Find the <em> element whose text contains "Held".
    held_tag = syllabus_div.find('em', string=lambda x: x and 'Held' in x)
    if held_tag is None:
        # No "Held" marker: fall back to the entire syllabus.
        return syllabus_div.get_text(strip=True)

    # find_all_previous() walks backwards through the WHOLE document, not just
    # the syllabus, so restrict the result to paragraphs inside syllabus_div.
    # Identity (id) is used because bs4 tags compare equal by content, which
    # would conflate distinct paragraphs that happen to hold the same text.
    syllabus_paragraph_ids = {id(p) for p in syllabus_div.find_all('p')}

    # Paragraphs enclosing the marker hold the holding itself; skip exactly
    # those instead of blindly dropping the last collected paragraph.
    held_ancestor_ids = {id(parent) for parent in held_tag.parents}

    # find_all_previous() yields nearest-first; reverse to document order.
    paragraphs_before_held = [
        p.get_text(strip=True)
        for p in reversed(held_tag.find_all_previous('p'))
        if id(p) in syllabus_paragraph_ids and id(p) not in held_ancestor_ids
    ]

    return " ".join(paragraphs_before_held)

In [None]:
import openai
import requests
from bs4 import BeautifulSoup

# Assuming you've already defined your scrape_syllabus and scrape_contestants functions...
def generate(prompt, engine="text-davinci-003", max_tokens=100):
    """Request a completion for *prompt* and report its token cost.

    Args:
        prompt: Text sent to the completion endpoint.
        engine: Name of the completion engine to use.
        max_tokens: Upper bound on the number of tokens to generate.

    Returns:
        A ``(text, total_tokens)`` tuple: the first choice's text with
        surrounding whitespace removed, and the ``total_tokens`` value from
        the response's usage record.
    """
    request_args = {
        "engine": engine,
        "prompt": prompt,
        "max_tokens": max_tokens,
    }
    completion = openai.Completion.create(**request_args)

    total_tokens = completion.usage['total_tokens']
    first_choice_text = completion.choices[0].text
    return first_choice_text.strip(), total_tokens

def get_verdict(urls):
    """Predict a verdict for every case page in *urls*.

    For each URL, the case caption and syllabus are scraped, inserted into
    ``template1`` together with ``instructions``, and sent to ``generate``.

    Args:
        urls: Iterable of case page URLs.

    Returns:
        A list with one dict per URL, in input order, holding the keys
        ``"url"``, ``"predicted_verdict"`` and ``"tokens_used"``.
    """

    def predict_one(case_url):
        # Scrape the two prompt ingredients from the same page.
        title = scrape_contestants(case_url)
        summary = scrape_syllabus(case_url)

        # Fill the prompt template and ask the model.
        filled_prompt = template1.format(
            instructions=instructions,
            case_name=title,
            case_summary=summary,
        )
        answer, cost = generate(filled_prompt)

        return {"url": case_url, "predicted_verdict": answer, "tokens_used": cost}

    return [predict_one(case_url) for case_url in urls]

# Example usage: run the full scrape -> prompt -> predict pipeline on a single
# case page. Executing this cell performs live HTTP requests and an OpenAI call.
urls = ['https://supreme.justia.com/cases/federal/us/597/21-954/']  # Add more URLs as needed
verdicts = get_verdict(urls)

# Print one summary line per case: URL, model answer and token usage.
for verdict in verdicts:
    print(f"URL: {verdict['url']}, Predicted Verdict: {verdict['predicted_verdict']}, Tokens used: {verdict['tokens_used']}")

In [None]:
import requests
from bs4 import BeautifulSoup

def get_case_links(base_url, timeout=30):
    """Collect the case links listed on an index page.

    Args:
        base_url: Address of the listing page to download.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list of ``href`` values, one per search-result block that contains
        a link, in page order. The values are returned exactly as they appear
        in the markup (they may be relative).

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for div in soup.select('.has-padding-content-block-30.-zb.search-result'):
        # A result block without an <a href=...> would make div.a['href']
        # raise; skip such blocks instead of aborting the whole listing.
        anchor = div.find('a', href=True)
        if anchor is not None:
            links.append(anchor['href'])

    return links

def main():
    """Predict verdicts for every case listed on the 2022 index page.

    Returns:
        The list of result dicts produced by ``get_verdict``. Each result is
        also printed, so running the pipeline has visible output instead of
        silently discarding the predictions.
    """
    from urllib.parse import urljoin

    base_url = 'https://supreme.justia.com/cases/federal/us/year/2022.html'

    # Get all individual case links
    case_links = get_case_links(base_url)

    # Resolve each link against the listing page. urljoin gives the same
    # result as prefixing the site root for root-relative links ("/cases/..."),
    # and leaves already-absolute links intact instead of corrupting them.
    full_urls = [urljoin(base_url, link) for link in case_links]

    results = get_verdict(full_urls)

    for result in results:
        print(f"URL: {result['url']}, Predicted Verdict: {result['predicted_verdict']}, Tokens used: {result['tokens_used']}")

    return results

# Run the pipeline when executed as a script. In a notebook cell __name__ is
# also "__main__", so executing this cell triggers the live scrape and API calls.
if __name__ == "__main__":
    main()

In [None]:
import re
import requests
from bs4 import BeautifulSoup

def extract_case_urls(years, timeout=30):
    """Collect the case page URLs listed on the per-year index pages.

    Args:
        years: Iterable of years; each is substituted into the index page URL.
        timeout: Seconds to wait for each index page. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list of absolute case URLs in first-seen order, without duplicates.

    Raises:
        requests.HTTPError: If an index page responds with an error status.
        requests.Timeout: If a page does not answer within ``timeout``.
    """
    site_root = "https://supreme.justia.com"
    base_url = 'https://supreme.justia.com/cases/federal/us/year/{}.html'

    # Matches hrefs of the form /cases/federal/us/<volume>/<docket>-<number>/
    pattern = re.compile(r"/cases/federal/us/\d+/\d+-\d+/")

    # A dict keeps insertion order and gives O(1) duplicate detection; a page
    # may link the same case more than once, and duplicates downstream would
    # mean repeated scraping and repeated model calls.
    unique_urls = {}

    for year in years:
        response = requests.get(base_url.format(year), timeout=timeout)
        # Fail loudly on an error page (as the other fetchers in this file do)
        # rather than silently contributing zero cases for that year.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for a in soup.find_all('a', href=True):
            found = pattern.match(a['href'])
            if found:
                # Keep only the matched prefix so an href with a trailing
                # fragment or sub-path still maps to the case page itself.
                unique_urls.setdefault(site_root + found.group(0), None)

    return list(unique_urls)

# Years whose index pages are scanned for case links. Executing this cell
# performs one live HTTP request per year.
years_list = [2022, 2023, 2020]
all_case_urls = extract_case_urls(years_list)

# Now, `all_case_urls` contains the case URLs for the provided years.
# Print them one per line for inspection.
for url in all_case_urls:
    print(url)
#print(all_case_urls)

In [None]:
# Display the collected case URLs. This cell previously referenced `case_urls`,
# a name no cell defines (NameError); `all_case_urls` is the list built by
# extract_case_urls. NOTE(review): confirm this is the variable that was meant.
all_case_urls

In [None]:
# NOTE(review): a bare name only makes the notebook display the function
# object; no predictions are run. Presumably a call such as
# get_verdict(<list of case URLs>) was intended here; confirm.
get_verdict