In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def _element_text(element):
    """Return an element's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each text node separately and joins the pieces
    # with an empty separator, which glues words together across inline tags
    # (for example around <a> or <em>). Collapsing whitespace on the raw text
    # keeps the spacing the page actually has.
    return ' '.join(element.get_text().split())


def parse_course_data(html_content):
    """Extract course records from the HTML of a course-catalog page.

    Each ``<p class="course-name">`` element is treated as one course.

    Args:
        html_content: The page's HTML markup as a string.

    Returns:
        A list of dicts, one per course, in document order, with the keys:

        * ``'id'``: the ``id`` attribute of the closest preceding
          ``<a class="anchor">`` tag, or ``''`` if there is none.
        * ``'name'``: the text of the course-name paragraph.
        * ``'unit'``: the first parenthesised integer in the name (e.g. ``4``
          for ``"... (4)"``) as an ``int``; ``None`` when there is no such
          group, which includes unit ranges such as ``"(1–16)"``.
        * ``'description'``: text of the course's description paragraph up to
          the ``'Prerequisites:'`` marker, or ``''`` when the course has no
          description paragraph.
        * ``'prerequisites'``: text following ``'Prerequisites:'``, or ``''``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    courses = []

    for course_element in soup.find_all('p', class_='course-name'):
        # The course id lives on the anchor tag that PRECEDES the course name.
        anchor_element = course_element.find_previous('a', class_='anchor')
        course_id = anchor_element.get('id', '') if anchor_element else ''

        course_name = _element_text(course_element)

        # Only a plain integer in parentheses counts as the unit value.
        unit_match = re.search(r'\((\d+)\)', course_name)
        unit = int(unit_match.group(1)) if unit_match else None

        # Look at the first following paragraph that is either a description or
        # another course heading. If the heading comes first, this course has no
        # description of its own and must not borrow the next course's text.
        following = course_element.find_next(
            'p', class_=['course-name', 'course-descriptions']
        )
        if following is not None and 'course-descriptions' in (following.get('class') or []):
            description_text = _element_text(following)
        else:
            description_text = ""

        # Split 'Prerequisites' from 'Description'
        prerequisites_start = description_text.find('Prerequisites:')
        if prerequisites_start != -1:
            prerequisites_text = description_text[prerequisites_start + len('Prerequisites:'):].strip()
            description_text = description_text[:prerequisites_start].strip()
        else:
            prerequisites_text = ""

        courses.append({
            'id': course_id,
            'name': course_name,
            'unit': unit,
            'description': description_text,
            'prerequisites': prerequisites_text
        })

    return courses




def main(url, timeout=30):
    """Download a course-catalog page and return its courses as a DataFrame.

    Args:
        url: Address of the catalog page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        A ``pandas.DataFrame`` built from the dicts returned by
        ``parse_course_data`` when the response status is 200. For any other
        status code a message is printed and ``None`` is returned.

    Connection failures and timeouts are not caught here; they propagate as
    the exceptions raised by ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: report the failure and return early on a non-200 response.
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    # Parse the HTML into a list of course dicts and tabulate them.
    courses = parse_course_data(response.text)
    return pd.DataFrame(courses)




In [33]:
# Page to scrape: the CSE course listing on catalog.ucsd.edu
url = "https://catalog.ucsd.edu/courses/CSE.html"

# Fetch and parse the page. main() prints the status code and returns None
# when the response is not HTTP 200, so result_df may be None.
result_df = main(url)

# Last expression in the cell, so the notebook renders the DataFrame
result_df

Unnamed: 0,id,name,unit,description,prerequisites
0,cse3,CSE 3. Fluency in Information Technology (4),4.0,Introduces the concepts and skills necessary t...,none.
1,cse4gs,CSE 4GS. Mathematical Beauty in Rome (4),4.0,Exploration of topics in mathematics and engin...,"MATH 10A or MATH 20A; department approval, and..."
2,cse6gs,CSE 6GS. Mathematical Beauty in Rome Lab (4),4.0,Companion course to CSE 4GS where theory is ap...,"MATH 10A or MATH 20A; department approval, and..."
3,cse6r,CSE 6R. Introduction to Computer Science and O...,4.0,An introduction to computer science and progra...,
4,cse8a,CSE 8A. Introduction to Programming and Comput...,4.0,Introductory course for students interested in...,restricted to undergraduates. Graduate student...
...,...,...,...,...,...
170,cse294,CSE 294. Research Meeting in CSE (2),2.0,Advanced study and analysis of active research...,consent of instructor.
171,cse298,CSE 298. Independent Study (1–16),,Open to properly qualified graduate students w...,consent of instructor.
172,cse299,CSE 299. Research (1–16),,Research.,consent of faculty.
173,cse500,CSE 500. Teaching Assistantship (2–4),,A course in which teaching assistants are aide...,graduate standing and consent of instructor.


In [34]:
# Destination file for the exported course table
output_file_path = 'output.json'

# main() returns None when the page could not be fetched. Fail with a clear
# message here rather than an AttributeError from calling .to_json on None.
if result_df is None:
    raise RuntimeError("No course data to export: the fetch did not return a DataFrame.")

# orient='records' with lines=True writes JSON Lines: one JSON object per
# course, one per line. The file as a whole is therefore not a single JSON
# document; read it back with pd.read_json(output_file_path, lines=True).
result_df.to_json(output_file_path, orient='records', lines=True)

# Display a message indicating successful export
print(f"DataFrame has been exported to {output_file_path}")



DataFrame has been exported to output.json
