In [1]:
import asyncio
import re

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy



In [12]:
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext",
        word_count_threshold=10,
        keep_data_attributes=False,
        exclude_external_links=True,    # Remove external links
        remove_overlay_elements=True,   # Remove popups/modals
        process_iframes=True 
    )
    print(result.markdown)  # Print clean markdown content
    # metadata = result.metadata
    # print(f"Title: {metadata['title']}")
    # print(f"Description: {metadata['description']}")
    # print(f"Keywords: {metadata['keywords']}")

[LOG] 🚀 Crawl4AI 0.3.731
[LOG] 🚀 Content extracted for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, success: True, time taken: 0.18 seconds
[LOG] 🚀 Extraction done for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, time taken: 0.18 seconds.
  * [Skip to Content](#content)
  * [AZ Index](/azindex/)
  * [Catalog Home](/)



#  [Guide](/)

2024-2025

Search this site Submit search

Menu 

  * [Undergraduate](/undergraduate/)
  * Graduate/Professional
    * [Graduate](/graduate)
    * [Law](/law)
    * [Medicine & Public Health](/medicine)
    * [Pharmacy](/pharmacy)
    * [Veterinary Medicine](/veterinary)
  * [Nondegree](/nondegree/)
  * [Courses](/courses/)
  * [Faculty](/faculty/)
  * [Archive](/archive/)



  * [Home](/)/
  * [Undergraduate Guide](/undergraduate/)/
  * [College of Letters & Science](/undergraduate/letters-science/)/
  * [Computer Sciences](

In [9]:
from crawl4ai.extraction_strategy import LLMExtractionStrategy
async with AsyncWebCrawler() as crawler:
    strategy = LLMExtractionStrategy(
        provider="ollama/nemotron",  # or "huggingface/...", "ollama/..."
        instruction="Clean up the markdown of any html related stuff and get raw text"
    )
    result = await crawler.arun(
        url="https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext",
        extraction_strategy=strategy
    )
    print(result.extracted_content)

[LOG] 🚀 Crawl4AI 0.3.731
[LOG] 🚀 Content extracted for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, success: True, time taken: 0.13 seconds
[LOG] 🚀 Extraction done for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, time taken: 0.13 seconds.
[
    {
        "index": 0,
        "tags": [],
        "content": "  * [Skip to Content](#content)\n  * [AZ Index](/azindex/)\n  * [Catalog Home](/)"
    },
    {
        "index": 1,
        "tags": [],
        "content": ""
    },
    {
        "index": 2,
        "tags": [],
        "content": "[University of Wisconsin–Madison](http://www.wisc.edu)"
    },
    {
        "index": 3,
        "tags": [],
        "content": "[![Link to University of Wisconsin-Madison home page](/images/uw-crest.svg)](http://www.wisc.edu)"
    },
    {
        "index": 4,
        "tags": [],
        "content": "#  [Guide](/)"
    },
    {
 

In [16]:
async with AsyncWebCrawler() as crawler:
    html2text_options = {
        "ignore_links": True,
        "body_width": 0
    }
    result = await crawler.arun(
        url="https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext",
        html2text=html2text_options
    )
    
    print(result.markdown)  # Print clean markdown content

[LOG] 🚀 Crawl4AI 0.3.731
[LOG] 🚀 Content extracted for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, success: True, time taken: 0.13 seconds
[LOG] 🚀 Extraction done for https://guide.wisc.edu/undergraduate/letters-science/computer-sciences/computer-sciences-bs/#requirementstext, time taken: 0.13 seconds.
  * Skip to Content
  * AZ Index
  * Catalog Home



University of Wisconsin–Madison

![Link to University of Wisconsin-Madison home page](/images/uw-crest.svg)

#  Guide

2024-2025

Search this site Submit search

Menu 

  * Undergraduate
  * Graduate/Professional
    * Graduate
    * Law
    * Medicine & Public Health
    * Pharmacy
    * Veterinary Medicine
  * Nondegree
  * Courses
  * Faculty
  * Archive
  * Apply Now  More



  * Home/
  * Undergraduate Guide/
  * College of Letters & Science/
  * Computer Sciences/
  * Computer Sciences, BS



# Computer Sciences, BS

![""](/gallery/computer-sciences.jpg)

Our grad

In [19]:
# Split the crawled markdown on blank lines into paragraphs, trimming each one
# and discarding those that are empty after trimming.
stripped_chunks = (chunk.strip() for chunk in result.markdown.split('\n\n'))
paragraphs = [chunk for chunk in stripped_chunks if chunk]

# A run of exactly three digits (e.g. the "300" in "COMP SCI 300").
# Lookarounds are used instead of [^\d] on each side so that a number at the
# very start or very end of a paragraph still matches; [^\d] would require an
# actual character there. Decimals such as "2.250" match as well.
pattern = r'(?<!\d)\d{3}(?!\d)'

# Keep only the paragraphs that contain such a number.
filtered_paragraphs = [p for p in paragraphs if re.search(pattern, p)]

# Print the filtered paragraphs with a 1-based label.
for i, paragraph in enumerate(filtered_paragraphs, start=1):
    print(f"Filtered Paragraph {i}:\n{paragraph}\n")

Filtered Paragraph 1:
* Completion of COMP SCI 300 and MATH 222
  * Grade of BC or higher in one of these introductory programming courses, taken at UW-Madison: COMP SCI 300, COMP SCI/​E C E 354 or COMP SCI 400
  * 2.250 GPA or higher among the first completed attempts of these courses: COMP SCI 300 and MATH 222

Filtered Paragraph 2:
If a student needs additional coursework to meet the 2.250 GPA requirement, COMP SCI/​MATH 240, COMP SCI/​E C E 354, and/or COMP SCI 400 Programming III may also be used.

Filtered Paragraph 3:
University General Education Requirements Requirements | Detail  
---|---  
Mathematics | Complete two courses of 3+ credits at the Intermediate or Advanced level in MATH, COMP SCI, or STAT subjects. A maximum of one course in each of COMP SCI and STAT subjects counts toward this requirement.  
Language | Complete the third unit of a language other than English.  
LS Breadth | Complete:• 12 credits of Humanities, which must include at least 6 credits of Literature;

In [20]:
async def clean_data(urls: str) -> list:
    """Crawl one page and return its paragraphs that mention a three-digit number.

    Args:
        urls: URL of the page to crawl. Despite the plural name, this is a
            single URL string; it is passed directly as ``url=`` to
            ``crawler.arun``.

    Returns:
        A list of strings: the page markdown split on blank lines, each piece
        stripped, keeping only non-empty pieces that contain a run of exactly
        three digits (for example a course number such as ``COMP SCI 300``;
        decimals such as ``2.250`` match too). Order follows the page. An
        empty list is returned when the crawl yields no markdown.
    """
    # html2text settings forwarded to the crawler: omit link targets and use
    # a body width of 0.
    html2text_options = {
        "ignore_links": True,
        "body_width": 0
    }
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=urls,
            html2text=html2text_options
        )

    # Guard against a crawl that produced no markdown (None or empty) so the
    # split below does not fail with AttributeError.
    markdown = result.markdown or ""
    stripped_chunks = (chunk.strip() for chunk in markdown.split('\n\n'))
    paragraphs = [chunk for chunk in stripped_chunks if chunk]

    # Exactly three consecutive digits. Lookarounds (rather than [^\d] on
    # both sides) let the number sit at the very start or end of a paragraph.
    pattern = r'(?<!\d)\d{3}(?!\d)'

    # Filter paragraphs containing the pattern
    filtered_paragraphs = [p for p in paragraphs if re.search(pattern, p)]
    return filtered_paragraphs

In [23]:
url = 'https://guide.wisc.edu/undergraduate/letters-science/statistics/data-science-bs/#requirementstext'
fp = await clean_data(url)
for i, paragraph in enumerate(fp, start=1):
    print(f"Filtered Paragraph {i}:\n{paragraph}\n")

[LOG] 🚀 Crawl4AI 0.3.731
[LOG] 🚀 Crawling done for https://guide.wisc.edu/undergraduate/letters-science/statistics/data-science-bs/#requirementstext, success: True, time taken: 0.80 seconds
[LOG] 🚀 Content extracted for https://guide.wisc.edu/undergraduate/letters-science/statistics/data-science-bs/#requirementstext, success: True, time taken: 0.13 seconds
[LOG] 🔥 Extracting semantic blocks for https://guide.wisc.edu/undergraduate/letters-science/statistics/data-science-bs/#requirementstext, Strategy: AsyncWebCrawler
[LOG] 🚀 Extraction done for https://guide.wisc.edu/undergraduate/letters-science/statistics/data-science-bs/#requirementstext, time taken: 0.13 seconds.
Filtered Paragraph 1:
Students must have a 2.000 GPA on coursework counting in the major, and a 2.000 GPA on any upper-level work in the major completed prior to declaration. No specific coursework must be completed to declare.

Filtered Paragraph 2:
University General Education Requirements Requirements | Detail  
---|---

  fp = await clean_data(url)


In [None]:
\