In [None]:
from bs4 import BeautifulSoup
import bs4
from playwright.async_api import async_playwright
import requests
import json

In [None]:
# ASSIST.org "transfer results" page that the cells below load and scrape.
# The query string pins one agreement (year / institution / agreement ids plus a
# per-major key). NOTE(review): which schools and major these ids stand for is not
# shown in this notebook — confirm before reusing the URL.
url = 'https://assist.org/transfer/results?year=75&institution=79&agreement=113&agreementType=from&viewAgreementsOptions=true&view=agreement&viewBy=major&viewSendingAgreements=false&viewByKey=75%2F113%2Fto%2F79%2FMajor%2F607b828c-8ba3-411b-7de1-08dcb87d5deb'

In [None]:
# Start Playwright, launch headless Chromium and open a fresh page.
# Top-level `await` is used here, so this cell is meant to run inside the notebook.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=True)
page = await browser.new_page()
# Load the agreement page, waiting for the network to go idle, then block until at
# least one `.articRow` element exists (presumably rendered client-side — confirm).
await page.goto(url, wait_until='networkidle')
await page.wait_for_selector('.articRow')
# NOTE(review): `browser` and `playwright` are never closed/stopped in the cells
# visible here — confirm that cleanup happens elsewhere.

In [None]:
# Snapshot the rendered DOM of the page and parse it with BeautifulSoup (lxml parser).
content = await page.content()
soup = BeautifulSoup(content, 'lxml')
# Every `div.articRow` is one articulation row; process_artic_row later splits it
# into its `rowReceiving` and `rowSending` parts.
articRows = soup.find_all('div', class_='articRow')
print(f"Found {len(articRows)} receiving course blocks.")
# Quick look at the element type; raises IndexError when no rows were found.
type(articRows[0])

In [None]:
def clean_units(units: str) -> float:
    """
    ### Description:
        Remove the 'unit' / 'units' label (in any letter case) and convert the rest to float.
    ### Args:
        units (str): The units string to clean, e.g., "4.00units", "3.0 unit" or "4.00 Units".
    ### Returns:
        float: The cleaned units as a float, e.g., 4.0 or 3.0.
    ### Raises:
        ValueError: If what remains after removing the label is not a valid number.
            The original float() error is attached as the cause.
    """
    # Lower-casing first makes the label removal case-insensitive; float() does not
    # care about letter case, so the numeric part is parsed exactly as before.
    # 'units' has to be removed before 'unit', otherwise a stray 's' would be left.
    number_text = units.lower().replace('units', '').replace('unit', '').strip()
    try:
        return float(number_text)
    except ValueError as e:
        # Chain explicitly so the traceback shows why float() rejected the text.
        raise ValueError(f"Invalid units format: {units}") from e

def clean_full_course(course: str) -> dict[str, str]:
    """
    ### Description:
        Split a full course label into its subject prefix and its course number.
    ### Args:
        course (str): The full course string, e.g., "CS 101".
    ### Returns:
        dict: 'subject' is the first whitespace-separated token; 'number' is every
        remaining token joined by single spaces.
    ### Raises:
        ValueError: If the string has fewer than two whitespace-separated tokens.
    """
    tokens = course.split()
    if len(tokens) >= 2:
        subject, number_tokens = tokens[0], tokens[1:]
        return {'subject': subject, 'number': ' '.join(number_tokens)}
    # Zero or one token: there is no way to tell subject and number apart.
    raise ValueError(f"Invalid course format: {course}")
    
def process_course_line(courseLine: bs4.element.Tag):
    """
    ### Description:
        Build a course dict from one 'courseLine' element by reading its
        'prefixCourseNumber', 'courseTitle' and 'courseUnits' child divs.
    ### Args:
        courseLine (bs4.element.Tag): The course line element to process.
    ### Returns:
        dict | None: {'type': 'course', 'subject', 'number', 'title', 'units'}, or
        None (after printing the error) when one of the child divs is missing.
    ### Raises:
        ValueError: Propagated from clean_full_course / clean_units on malformed text.
    """
    def _div_text(css_class: str) -> str:
        # A missing div makes find() return None, so `.text` raises AttributeError,
        # which the handler below turns into a None result.
        return courseLine.find('div', class_=css_class).text.strip()  # type: ignore

    try:
        course = _div_text('prefixCourseNumber')
        title = _div_text('courseTitle')
        units = _div_text('courseUnits')
        parsed_course = clean_full_course(course)
        return {
            'type': 'course',
            'subject': parsed_course["subject"],
            'number': parsed_course["number"],
            'title': title,
            'units': clean_units(units),
        }
    except AttributeError as err:
        print(f"Error processing course line: {err}")
        return None

In [None]:
def handle_bracket(bracket: bs4.element.Tag):
    """
    ### Description:
        Turn one bracketed group of course lines into an "And" course group.
    ### Args:
        bracket (bs4.element.Tag): A div with class 'bracketWrapper'.
    ### Returns:
        dict: {"type": "CourseGroup", "courseConjunction": "And", "courses": [...]},
        where "courses" holds one course dict per successfully parsed course line.
    ### Raises:
        AssertionError: If `bracket` is not a div Tag carrying the 'bracketWrapper' class.
        ValueError: If the bracket has no 'bracketContent' div, no conjunctions,
            a conjunction other than 'And', or no course lines.
    """
    assert isinstance(bracket, bs4.element.Tag), "Expected a BeautifulSoup Tag"
    assert bracket.name == 'div', "Expected a div element"
    assert 'bracketWrapper' in bracket.get('class', []), "Expected a bracket class" #type: ignore : again [] saves us

    bracket_content = bracket.find('div', class_='bracketContent')
    if bracket_content is None:
        # Without this guard the next line would fail with an opaque AttributeError
        # on None; the callers in this notebook match on ValueError.
        raise ValueError("No bracketContent found in bracket")

    # A bracket is only understood when every conjunction inside it is an 'And'.
    conjunctions = bracket_content.find_all('awc-view-conjunction')
    if len(conjunctions) == 0:
        raise ValueError("No conjunctions found in bracket content")
    for conjunction in conjunctions:
        conjunction_text = conjunction.text.strip()
        if conjunction_text != 'And':
            raise ValueError(f"Unexpected conjunction: {conjunction_text} in bracket")

    course_lines = bracket_content.find_all('div', class_='courseLine')
    if not course_lines:
        raise ValueError("No course lines found in bracket content")

    # process_course_line returns None (after printing the error) for a line it
    # cannot read; such lines are left out of the group, as before.
    courses = []
    for course_line in course_lines:
        course = process_course_line(course_line)
        if course:
            courses.append(course)

    return {
        "type": "CourseGroup",
        "courseConjunction": "And",
        "courses": courses
    }

In [141]:
def count_ands_in_bracket(bracket: bs4.element.Tag) -> int:
    """
    ### Description:
        Tell how many 'awc-view-conjunction' elements inside a bracket read 'And'.
    ### Args:
        bracket (bs4.element.Tag): A div with class 'bracketWrapper'.
    ### Returns:
        int: Number of conjunction elements whose stripped text is exactly 'And'.
    ### Raises:
        AssertionError: If `bracket` is not a div Tag carrying the 'bracketWrapper' class.
    """
    assert isinstance(bracket, bs4.element.Tag), "Expected a BeautifulSoup Tag"
    assert bracket.name == 'div', "Expected a div element"
    assert 'bracketWrapper' in bracket.get('class', []), "Expected a bracket class" #type: ignore : again [] saves us

    def _is_and_conjunction(tag) -> bool:
        # Match on the visible text rather than on CSS classes.
        return tag.name == 'awc-view-conjunction' and tag.text.strip() == 'And'

    and_tags = bracket.find_all(_is_and_conjunction)
    return len(and_tags)
def handle_main_block(mainBlock: bs4.element.Tag):
    """
    ### Description:
        Convert one side of an articulation row into a course dict or a course-group dict.

        Without any 'bracketWrapper' div, only the first 'courseLine' div is parsed.
        With brackets, each bracket becomes an "And" group (see handle_bracket); the
        groups are wrapped in an "Or" group when at least one 'Or' conjunction exists.
    ### Args:
        mainBlock (bs4.element.Tag): A div containing at least one element with class 'courseLine'.
    ### Returns:
        dict | None: A course dict (None if the single course line could not be read),
        a single "And" CourseGroup, or an "Or" CourseGroup of "And" CourseGroups.
    ### Raises:
        AssertionError: If `mainBlock` is not a div Tag with a 'courseLine' descendant, or if
            several brackets are present without any 'Or' conjunction.
        ValueError: If the conjunction structure does not match the supported cases
            (an 'And' outside a bracket, a conjunction that is neither 'And' nor 'Or'),
            or with the exact message "No courseLine found." when no course line div exists.
    """
    assert isinstance(mainBlock, bs4.element.Tag), "Expected a BeautifulSoup Tag"
    assert mainBlock.name == 'div', "Expected a div element"
    assert mainBlock.find(class_="courseLine") is not None, "Expected a courseLine class in mainBlock" # type: ignore

    if (mainBlock.find('div', class_='bracketWrapper') is not None):
        # handle bracket
        brackets = mainBlock.find_all('div', class_='bracketWrapper')
        and_conjuncted_courses = []

        for bracket in brackets:
            courses_in_bracket = handle_bracket(bracket)
            and_conjuncted_courses.append(courses_in_bracket)

        # now check that and count in brackets is equal to total and count
        total_and_conjunctions = len(mainBlock.find_all(
            lambda tag: tag.name == 'awc-view-conjunction' and tag.text.strip() == 'And'
        ))
        bracket_and_conjunctions = sum(count_ands_in_bracket(bracket) for bracket in brackets)
        if total_and_conjunctions != bracket_and_conjunctions: # our assumption that all ands are in a bracket is wrong
            # The message must use the two counters computed above; it previously
            # referenced undefined names, so this check ended in a NameError.
            raise ValueError(
                f"Mismatch in 'And' conjunction counts: {total_and_conjunctions} found, but only {bracket_and_conjunctions} in brackets."
            )

        # now check the other `awc-view-conjunction` tags are "Or" otherwise we have an undocumented case and need manual inspection
        non_and_or_conjunctions = mainBlock.find_all(
            lambda tag: tag.name == 'awc-view-conjunction' and (tag.text.strip() != 'And' and tag.text.strip() != 'Or')
        )

        if non_and_or_conjunctions:
            raise ValueError(f"Unexpected conjunctions found: {[conj.text.strip() for conj in non_and_or_conjunctions]}")

        # lets add one more protective check
        all_conjunctions = mainBlock.find_all('awc-view-conjunction')
        or_conjunctions = mainBlock.find_all(
            lambda tag: tag.name == 'awc-view-conjunction' and tag.text.strip() == 'Or'
        ) # use text to filter instead of classes since we won't know if classes stay consistent outside our observed data of "and" and "or"

        # this works because we know every conjunction in the brackets is an "and" since the function checks for that, so now we only have to check our assumption that all other conjunctions are "or"
        if len(all_conjunctions) != len(or_conjunctions) + total_and_conjunctions:
            print(f"All conjunctions: {len(all_conjunctions)}, Or conjunctions: {len(or_conjunctions)}, Brackets: {len(brackets)}")
            raise ValueError("Mismatch in conjunction counts: some brackets may not be handled correctly.")

        if len(or_conjunctions) == 0:
            # if no or conjunctions, we can return the and-group directly
            assert len(and_conjuncted_courses) == 1, "Expected exactly one course group when no 'Or' conjunctions are present"
            return and_conjuncted_courses[0]
        else:
            # if there are or conjunctions, we need to return a group of courses
            # NOTE(review): only bracketed courses end up in this group; a course line
            # sitting outside every bracket is not included — confirm that case cannot occur.
            return {
                "type": "CourseGroup",
                "courseConjunction": "Or",
                "courses": and_conjuncted_courses
            }

    else:
        # for now only support one courseLine
        courseLine = mainBlock.find('div', class_='courseLine')
        if courseLine:
            course_data = process_course_line(courseLine) # type: ignore
            return course_data
        else:
            # Reached when the 'courseLine' element is not a div, or when asserts are
            # disabled. Callers compare against this exact message, so keep it stable.
            raise ValueError("No courseLine found.")
    

In [None]:
# handle a rowReceiving
def handle_rowReceiving(rowReceiving: bs4.element.Tag):
    """
    ### Description:
        Parse the receiving side of an articulation row via handle_main_block.
    ### Args:
        rowReceiving (bs4.element.Tag): A div with class 'rowReceiving'.
    ### Returns:
        Whatever handle_main_block returns for this element.
    ### Raises:
        AssertionError: If the element is not a div with class 'rowReceiving'.
        ValueError: Re-raised from handle_main_block; its "No courseLine found."
            error is replaced by one that names the receiving row.
    """
    assert rowReceiving.name == 'div' and 'rowReceiving' in rowReceiving.get('class', []), f"Expected a 'div' with class 'rowReceiving', got {rowReceiving.name} with classes {rowReceiving.get('class', [])}" # type: ignore
    try:
        parsed = handle_main_block(rowReceiving)
    except ValueError as err:
        # Anything other than the generic "missing course line" error passes through as is.
        if str(err) != "No courseLine found.":
            raise err
        raise ValueError("No courseLine found in rowReceiving.")
    return parsed

In [106]:
def handle_rowSending(rowSending: bs4.element.Tag):
    """
    ### Description:
        Parse the sending side of an articulation row.
    ### Args:
        rowSending (bs4.element.Tag): A div with class 'rowSending'.
    ### Returns:
        dict | None: The result of handle_main_block on the 'view_sending__content' div, or
        {"type": "NotArticulated"} when that div is missing and the row text contains
        "No Course Articulated".
    ### Raises:
        AssertionError: If the element is not a div with class 'rowSending'.
        ValueError: If there is no content div and no "No Course Articulated" text, or
            re-raised from handle_main_block (its "No courseLine found." error is
            replaced by one that names the sending row).
    """
    assert rowSending.name == 'div' and 'rowSending' in rowSending.get('class', []), f"Expected a 'div' with class 'rowSending', got {rowSending.name} with classes {rowSending.get('class', [])}" # type: ignore

    mainContent = rowSending.find('div', class_='view_sending__content')
    if mainContent:
        try:
            return handle_main_block(mainContent)
        except ValueError as err:
            # Anything other than the generic "missing course line" error passes through as is.
            if str(err) != "No courseLine found.":
                raise err
            raise ValueError("No courseLine found in rowSending.")

    # No content div: the only recognised case is an explicit "not articulated" row.
    if "No Course Articulated" not in rowSending.text:
        raise ValueError("No main content found in rowSending.")
    return {
        "type": "NotArticulated",
        # TODO ADD SCHOOL NAME HERE
    }

In [None]:
def process_artic_row(row: bs4.element.Tag):
    """
    ### Description:
        Parse one 'articRow' into its receiving and sending parts.
    ### Args:
        row (bs4.element.Tag): The articulation row element to process.
    ### Returns:
        dict: {"receiving": <handle_rowReceiving result>, "sending": <handle_rowSending result>}.
    ### Raises:
        ValueError: "Failed to process articRow." when either side raises any exception;
            the underlying error and the prettified row are printed first.
    """
    # Receiving side is handled first; a failure here means the sending side is never looked at.
    try:
        receiving_course = handle_rowReceiving(
            row.find('div', class_='rowReceiving')  # type: ignore
        )
    except Exception as err:
        print(f"Error processing receiving course: {err}")
        print(f"Row content: {row.prettify()}")
        raise ValueError("Failed to process articRow.")

    try:
        sending_course = handle_rowSending(
            row.find('div', class_='rowSending')  # type: ignore
        )
    except Exception as err:
        print(f"Error processing sending course: {err}")
        print(f"Row content: {row.prettify()}")
        raise ValueError("Failed to process articRow.")

    result = {
        "receiving": receiving_course,
        "sending": sending_course
    }
    return result

In [102]:
def process_page(soup: BeautifulSoup):
    """
    # NOT COMPLETED: need to adapt to spec
    ### Description:
        Run process_artic_row over every 'articRow' div of the page and collect the results.
    ### Args:
        soup (BeautifulSoup): The BeautifulSoup object containing the page content.
    ### Returns:
        dict: {"type": "Articulation Agreement", "articulations": [...]} with one
        receiving/sending dict per row, in document order.
    ### Raises:
        ValueError: Re-raised from process_artic_row for the first row that fails.
    """
    articulations = []

    for artic_row in soup.find_all('div', class_='articRow'):
        try:
            articulations.append(process_artic_row(artic_row))
        except ValueError as err:
            # NOTE(review): the message says "Skipping", but the error is re-raised,
            # so processing stops at the first bad row — confirm which is intended.
            print(f"Skipping row due to error: {err}")
            raise

    return {
        "type": "Articulation Agreement", # only thing supported for now
        "articulations": articulations
    }

In [144]:
# Row picked for manual inspection.
# NOTE(review): the earlier note on this line called index 12 the simplest case (one
# course on both the receiving and the sending side) and said index 5 has a single
# "And". The recorded output of the next cell shows nested course groups, so that
# note looks stale — confirm which index is which.
example_articRow = articRows[12]

In [145]:
# Parse the example row and pretty-print the resulting nested dict as JSON.
scraped_course = process_artic_row(example_articRow)
print(json.dumps(scraped_course, indent=2))

{
  "receiving": {
    "type": "CourseGroup",
    "courseConjunction": "And",
    "courses": [
      {
        "type": "course",
        "subject": "CHEM",
        "number": "1A",
        "title": "General Chemistry",
        "units": 3.0
      },
      {
        "type": "course",
        "subject": "CHEM",
        "number": "1AL",
        "title": "General Chemistry Laboratory",
        "units": 2.0
      },
      {
        "type": "course",
        "subject": "CHEM",
        "number": "1B",
        "title": "General Chemistry",
        "units": 4.0
      }
    ]
  },
  "sending": {
    "type": "CourseGroup",
    "courseConjunction": "Or",
    "courses": [
      {
        "type": "CourseGroup",
        "courseConjunction": "And",
        "courses": [
          {
            "type": "course",
            "subject": "CHEM",
            "number": "1A",
            "title": "General Chemistry I",
            "units": 5.0
          },
          {
            "type": "course",
            "

In [143]:
# Parse every articulation row of the page and pretty-print the result as JSON.
print(json.dumps(process_page(soup), indent=2))

{
  "type": "Articulation Agreement",
  "articulations": [
    {
      "receiving": {
        "type": "course",
        "subject": "MATH",
        "number": "1A",
        "title": "Calculus",
        "units": 4.0
      },
      "sending": {
        "type": "CourseGroup",
        "courseConjunction": "Or",
        "courses": [
          {
            "type": "CourseGroup",
            "courseConjunction": "And",
            "courses": [
              {
                "type": "course",
                "subject": "MATH",
                "number": "1A",
                "title": "Calculus I",
                "units": 5.0
              },
              {
                "type": "course",
                "subject": "MATH",
                "number": "1B",
                "title": "Calculus II",
                "units": 5.0
              }
            ]
          },
          {
            "type": "CourseGroup",
            "courseConjunction": "And",
            "courses": [
              {
 