In [5]:
from bs4 import BeautifulSoup
import bs4
from playwright.async_api import async_playwright
import requests

In [18]:
# Agreement page on assist.org that this notebook loads; the query string is
# split per group of parameters purely for readability.
url = (
    'https://assist.org/transfer/results'
    '?year=75&institution=79&agreement=113&agreementType=from'
    '&viewAgreementsOptions=true&view=agreement&viewBy=major'
    '&viewSendingAgreements=false'
    '&viewByKey=75%2F113%2Fto%2F79%2FMajor%2F607b828c-8ba3-411b-7de1-08dcb87d5deb'
)

In [19]:
# Start Playwright, launch a headless Chromium browser and open a blank page.
# NOTE(review): no visible cell closes `browser` or stops `playwright` —
# confirm cleanup happens elsewhere.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=True)
page = await browser.new_page()
# Navigate to `url` and wait for the network to go idle, then additionally wait
# until an element matching '.articRow' exists before the page is read.
await page.goto(url, wait_until='networkidle')
await page.wait_for_selector('.articRow')

<JSHandle preview=JSHandle@node>

In [20]:
# Snapshot the page's current HTML and parse it with the lxml parser.
content = await page.content()
soup = BeautifulSoup(content, 'lxml')
# Collect every <div> whose class list includes 'articRow'.
articRows = soup.find_all('div', class_='articRow')
print(f"Found {len(articRows)} receiving course blocks.")
# Cell output: the type of the first match (raises IndexError if nothing matched).
type(articRows[0])

Found 23 receiving course blocks.


bs4.element.Tag

In [None]:
example_articRow = articRows[9] # simplest case: one course on each of the receiving and sending sides

In [27]:
def clean_units(units: str) -> float:
    """
    ### Description:
        Remove the 'unit'/'units' label (in any letter case) and convert the
        remaining text to a float.
    ### Args:
        units (str): The units string to clean, e.g., "4.00units", "3.0 unit" or "4.00 Units".
    ### Returns:
        float: The cleaned units as a float, e.g., 4.0 or 3.0.
    ### Raises:
        ValueError: If the text left after removing the label is not a valid float.
            The underlying conversion error is attached as the exception's cause.
    """
    # Lower-case first so 'Units' / 'UNITS' are stripped as well. float() parses
    # its special spellings (e.g. 'inf', '1e3') case-insensitively, so this does
    # not change the result for numeric text.
    removed_suffix = units.lower().replace('units', '').replace('unit', '').strip()
    try:
        return float(removed_suffix)
    except ValueError as e:
        print(f"Error converting units to float: {e}")
        # Chain the original error so the traceback shows what float() rejected.
        raise ValueError(f"Invalid units format: {units}") from e

def clean_full_course(course: str) -> dict[str, str]:
    """
    ### Description:
        Split a full course string into its subject and its number.
    ### Args:
        course (str): The full course string, e.g., "CS 101".
    ### Returns:
        dict: 'subject' holds the first whitespace-separated token; 'number'
            holds all remaining tokens joined by single spaces.
    ### Raises:
        ValueError: If the string has fewer than two whitespace-separated tokens.
    """
    tokens = course.split()
    if len(tokens) >= 2:
        subject, *number_tokens = tokens
        return {'subject': subject, 'number': ' '.join(number_tokens)}
    raise ValueError(f"Invalid course format: {course}")
def process_courseLine(courseLine: bs4.element.Tag):
    """
    ### Description:
        Extract the course code, title and units from one course line element.
    ### Args:
        courseLine (bs4.element.Tag): Element searched for 'div' children with the
            classes 'prefixCourseNumber', 'courseTitle' and 'courseUnits'.
    ### Returns:
        dict | None: A dictionary with 'type' (always 'course'), 'subject', 'number',
            'title' and 'units' (float). None if an AttributeError occurs, e.g.
            because one of the expected child elements is missing.
    ### Raises:
        ValueError: Propagated from clean_full_course / clean_units when the
            extracted text cannot be parsed.
    """
    try:
        # find() returns None when nothing matches, so `.text` raises the
        # AttributeError handled below.
        course = courseLine.find('div', class_='prefixCourseNumber').text.strip() # type: ignore
        title = courseLine.find('div', class_='courseTitle').text.strip() # type: ignore
        units = courseLine.find('div', class_='courseUnits').text.strip() # type: ignore
        # Parse the course string once and reuse both parts.
        parsed_course = clean_full_course(course)
        return {
            'type': 'course',
            'subject': parsed_course["subject"],
            'number': parsed_course["number"],
            'title': title,
            'units': clean_units(units)
        }
    except AttributeError as e:
        print(f"Error processing course line: {e}")
        return None

In [28]:
# handle a rowReceiving
def handle_rowReceiving(rowReceiving: bs4.element.Tag):
    """
    ### Description:
        Parse the first course line found inside a 'rowReceiving' element.
    ### Args:
        rowReceiving (bs4.element.Tag): A 'div' whose classes include 'rowReceiving'.
    ### Returns:
        Whatever process_courseLine returns for the first 'courseLine' div.
    ### Raises:
        AssertionError: If the element is not a 'div' with class 'rowReceiving'.
        ValueError: If no 'courseLine' div is found inside the element.
    """
    assert rowReceiving.name == 'div' and 'rowReceiving' in rowReceiving.get('class', []), (
        f"Expected a 'div' with class 'rowReceiving', got {rowReceiving.name} with classes {rowReceiving.get('class', [])}"
    )

    # Only the first courseLine is handled for now.
    first_course_line = rowReceiving.find('div', class_='courseLine')
    if not first_course_line:
        print("No courseLine found in rowReceiving.")
        raise ValueError("No courseLine found in rowReceiving.")
    return process_courseLine(first_course_line)
    

In [29]:
def handle_rowSending(rowSending: bs4.element.Tag):
    """
    ### Description:
        Parse the first course line found inside the 'view_sending__content'
        container of a 'rowSending' element.
    ### Args:
        rowSending (bs4.element.Tag): A 'div' whose classes include 'rowSending'.
    ### Returns:
        Whatever process_courseLine returns for the first 'courseLine' div.
    ### Raises:
        AssertionError: If the element is not a 'div' with class 'rowSending'.
        ValueError: If the 'view_sending__content' container or a 'courseLine'
            div inside it is not found.
    """
    assert rowSending.name == 'div' and 'rowSending' in rowSending.get('class', []), f"Expected a 'div' with class 'rowSending', got {rowSending.name} with classes {rowSending.get('class', [])}"

    mainContent = rowSending.find('div', class_='view_sending__content')
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the lookup below.
    if mainContent is None:
        print("No view_sending__content found in rowSending.")
        raise ValueError("No view_sending__content found in rowSending.")
    # for now only support one courseLine
    courseLine = mainContent.find('div', class_='courseLine')
    if courseLine:
        course_data = process_courseLine(courseLine)
        return course_data
    else:
        print("No courseLine found in rowSending.")
        raise ValueError("No courseLine found in rowSending.")

In [32]:
def process_artic_row(row: bs4.element.Tag):
    """
    ### Description:
        Parse the receiving and sending sides of one articRow element.
    ### Args:
        row (bs4.element.Tag): Element searched for a 'rowReceiving' div and a
            'rowSending' div.
    ### Returns:
        dict: 'receiving' holds the result of handle_rowReceiving and 'sending'
            holds the result of handle_rowSending.
    ### Raises:
        ValueError: If anything fails while parsing either side. The original
            exception is attached as the cause.
    """
    try:
        receiving_html = row.find('div', class_='rowReceiving')  # type: ignore
        receiving_course = handle_rowReceiving(receiving_html) # type: ignore
        sending_html = row.find('div', class_='rowSending')  # type: ignore
        sending_course = handle_rowSending(sending_html) # type: ignore
    except Exception as e:
        # The try block covers both sides, so the message does not name one of them.
        print(f"Error processing articRow: {e}")
        raise ValueError("Failed to process articRow.") from e

    return {
        "receiving": receiving_course,
        "sending": sending_course
    }

In [33]:
# Run the parser on the single-course example row and display the result.
process_artic_row(example_articRow)

{'receiving': {'type': 'course',
  'subject': 'ASTRON',
  'number': '10',
  'title': 'Introduction to General Astronomy',
  'units': 4.0},
 'sending': {'type': 'course',
  'subject': 'ASTR',
  'number': '10',
  'title': 'Stellar Astronomy',
  'units': 5.0}}