In [None]:
!pip install -q requests beautifulsoup4 trafilatura

import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
import trafilatura
from bs4 import BeautifulSoup



# Seed pages for the crawl, keyed by a short label used in progress output.
# Each seed is crawled independently and restricted to its own host.
UNIVERSITY_URLS = {
    "MIT-OCW": "https://ocw.mit.edu/courses/",
    "MIT-AI": "https://www.eecs.mit.edu/category/news/",
    "Stanford-AI": "https://ai.stanford.edu/",
    "Stanford-Online": "https://online.stanford.edu/courses",
    "Berkeley-EECS": "https://eecs.berkeley.edu/",
    "Berkeley-BAIR": "https://bair.berkeley.edu/blog/",
    "CMU-CS": "https://www.cs.cmu.edu/",
    "CMU-ML": "https://www.ml.cmu.edu/research/index.html",
}

# Maximum number of pages with extractable text to keep per seed.
PAGES_PER_URL = 300
# Destination for the cleaned paragraphs (overwritten on every run).
OUTPUT_FILE = "usa_edu_corpus_cleaned.txt"
# Destination for the list of URLs that contributed text (overwritten too).
CRAWLED_URLS_FILE = "crawled_urls.txt"

# Lower-case path suffixes that never contain extractable HTML text.
# Besides the obvious media/archive/office types, this covers the image,
# feed, calendar and PostScript resources that otherwise get fetched only
# to fail in the extractor.
BLOCK_EXTENSIONS = [
    ".mp4", ".pdf", ".zip", ".jpg", ".png", ".ppt", ".doc", ".exe",
    ".jpeg", ".gif", ".pptx", ".docx", ".ps", ".ics", ".rss", ".xml",
]

#UTILITY FUNCTIONS

def is_valid_url(url, blocked_extensions=None):
    """Return True when ``url`` does not point at a blocked file type.

    Only the path component is inspected, so a query string or fragment
    (``paper.pdf?download=1``, ``img.jpg#top``) cannot hide the extension.

    Args:
        url: Absolute or relative URL to test.
        blocked_extensions: Iterable of suffixes such as ``".pdf"``.
            Defaults to the module-level ``BLOCK_EXTENSIONS``.

    Returns:
        False if the URL path ends with a blocked suffix (case-insensitive)
        or the URL cannot be parsed at all; True otherwise.
    """
    if blocked_extensions is None:
        blocked_extensions = BLOCK_EXTENSIONS
    try:
        path = urlparse(url).path.lower()
    except ValueError:
        # Unparseable URLs (e.g. malformed IPv6 hosts) cannot be crawled.
        return False
    suffixes = tuple(ext.lower() for ext in blocked_extensions)
    return not path.endswith(suffixes)

def get_all_links(page_url, base_domain):
    """Extract the unique same-host links from one HTML page.

    Every ``<a href>`` is resolved against the final (post-redirect) page
    URL, so relative and protocol-relative (``//host/path``) references are
    handled correctly. Fragments are stripped, and only http(s) URLs whose
    host equals that of ``base_domain`` and that pass ``is_valid_url`` are
    kept.

    Args:
        page_url: URL of the page to download and scan.
        base_domain: Scheme plus host of the site being crawled, e.g.
            ``"https://ocw.mit.edu"``. A bare host is accepted as well.

    Returns:
        A list of absolute URLs in document order without duplicates.
        Empty when the page cannot be fetched, is not HTML, or holds no
        matching links.
    """
    try:
        parsed_base = urlparse(base_domain)
        # Tolerate a bare host ("example.edu") in addition to scheme://host.
        base_host = (parsed_base.netloc or parsed_base.path).strip("/").lower()

        response = requests.get(page_url, timeout=10)
        # Error pages (403/404/...) are not a useful source of links.
        response.raise_for_status()

        # Feeds, calendars and binaries are not HTML; skip them rather than
        # feeding them to the HTML parser.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        # A dict is used as an ordered set so the crawl order is reproducible.
        links = {}
        for tag in soup.find_all("a", href=True):
            try:
                absolute = urldefrag(urljoin(response.url, tag["href"].strip())).url
                target = urlparse(absolute)
            except ValueError:
                # A single malformed href must not discard the whole page.
                continue
            if target.scheme not in ("http", "https"):
                continue
            if target.netloc.lower() != base_host:
                continue
            if is_valid_url(absolute):
                links[absolute] = None
        return list(links)
    except Exception as e:
        # Best effort: one bad page must never abort the whole crawl.
        print(f"[Error] Failed to get links from {page_url}: {e}")
        return []

def extract_clean_text(url):
    """Download ``url`` with trafilatura and return its extracted main text.

    Returns whatever ``trafilatura.extract`` produces for the downloaded
    page, or None when nothing was downloaded or an exception was raised
    (the exception is reported on stdout).
    """
    try:
        html = trafilatura.fetch_url(url)
        return trafilatura.extract(html) if html else None
    except Exception as err:
        print(f"[Error] Trafilatura failed for {url}: {err}")
    return None

#MAIN CRAWLER

def crawl_from_seed(name, seed_url, limit, out_f, log_f):
    """Breadth-first crawl of one site, writing cleaned text as it goes.

    Starting at ``seed_url``, pages are visited in discovery order. For each
    page that yields text, every line longer than 40 characters (after
    stripping) is written to ``out_f`` and the URL is written to ``log_f``.
    Only such pages count towards ``limit``; pages that fail extraction are
    still scanned for further links.

    Args:
        name: Label used in progress messages.
        seed_url: Absolute URL (scheme and host required) to start from.
        limit: Maximum number of pages with extracted text to keep.
        out_f: Writable text file receiving one paragraph per line.
        log_f: Writable text file receiving one crawled URL per line.

    Raises:
        ValueError: If ``seed_url`` lacks a scheme or host.
    """
    seed_parts = urlparse(seed_url)
    if not seed_parts.scheme or not seed_parts.netloc:
        raise ValueError(f"Seed URL must be absolute: {seed_url!r}")
    base_domain = f"{seed_parts.scheme}://{seed_parts.netloc}"

    # `discovered` holds every URL ever queued, so a link is enqueued at most
    # once and the frontier cannot fill up with duplicates.
    frontier = deque([seed_url])
    discovered = {seed_url}
    count = 0

    print(f"\n====== Crawling {name} ======")
    while frontier and count < limit:
        url = frontier.popleft()
        print(f"[{name} {count+1}] {url}")
        text = extract_clean_text(url)
        if text:
            stripped = (line.strip() for line in text.split("\n"))
            out_f.writelines(p + "\n" for p in stripped if len(p) > 40)
            log_f.write(url + "\n")
            count += 1
        if count >= limit:
            # Quota reached: skip the pause and the now-pointless link fetch.
            break
        # Be polite to the server before requesting the page again for links.
        time.sleep(1)
        for link in get_all_links(url, base_domain):
            if link not in discovered:
                discovered.add(link)
                frontier.append(link)

    print(f"{name} done. {count} pages scraped.\n")

def crawl_all(university_urls, pages_per_url, output_file, crawled_log):
    """Crawl every seed in ``university_urls`` into one shared pair of files.

    Both ``output_file`` and ``crawled_log`` are opened for writing as UTF-8
    (truncating any existing content) and handed to ``crawl_from_seed`` for
    each ``name -> url`` entry in turn, with ``pages_per_url`` as the limit.
    """
    with open(output_file, "w", encoding="utf-8") as out_f:
        with open(crawled_log, "w", encoding="utf-8") as log_f:
            for label in university_urls:
                seed = university_urls[label]
                crawl_from_seed(label, seed, pages_per_url, out_f, log_f)

#RUN SCRIPT

# Start the crawl over every configured seed as soon as this cell/script runs;
# the corpus and the crawled-URL log are written to the configured file names.
crawl_all(UNIVERSITY_URLS, PAGES_PER_URL, OUTPUT_FILE, CRAWLED_URLS_FILE)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.9/837.9 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.7/295.7 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.8/263.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h
[MIT-OCW 1] https://ocw.mit.edu/courses/
[MIT-OCW 2] https://ocw.mit.edu/about
[MIT-OCW 3] https://ocw.mit.edu/
[MIT-OCW 4] https://ocw.mit.edu/contact
[MIT-OCW 5] https://ocw.mit.edu/collections/mit-open-learning-library/
[MIT-OCW 6] https://ocw.mit.edu/search/?f=Online%20Textbook&s=-runs.best_start_date
[MIT-OCW 7] https://ocw.mit.edu/search/?f=Instructor%20Insights&s=-runs.best_start_date
[M



[MIT-OCW 84] https://ocw.mit.edu/search/?t=Mechanical+Engineering
[MIT-OCW 85] https://ocw.mit.edu/collections/ocw-scholar/
[MIT-OCW 86] https://ocw.mit.edu/search/?q=Prof.+Yufei+Zhao
[MIT-OCW 87] https://ocw.mit.edu/courses/2-79j-biomaterials-tissue-interactions-fall-2022/
[MIT-OCW 88] https://ocw.mit.edu/stories/ana-tri%C5%A1ovi%C4%87/
[MIT-OCW 89] https://ocw.mit.edu/stories/
[MIT-OCW 90] https://ocw.mit.edu/courses/2-785j-cell-matrix-mechanics-fall-2016/
[MIT-OCW 91] https://ocw.mit.edu/search/?q=Joe+Diaz
[MIT-OCW 92] https://ocw.mit.edu/search/?l=Graduate
[MIT-OCW 93] https://ocw.mit.edu/search/?t=Discrete+Mathematics
[MIT-OCW 94] https://ocw.mit.edu/courses/6-s087-foundation-models-and-generative-ai-january-iap-2024/
[MIT-OCW 95] https://ocw.mit.edu/search/?t=Microeconomics
[MIT-OCW 96] https://ocw.mit.edu/search/?q=Prof.+Steven+G.+Johnson
[MIT-OCW 97] https://ocw.mit.edu/search/?t=American+Politics
[MIT-OCW 98] https://ocw.mit.edu/courses/17-251-congress-and-the-american-politic

ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://ocw.mit.edu/help/


[MIT-OCW 133] https://ocw.mit.edu/help/
[MIT-OCW 133] https://ocw.mit.edu/search/?q=Elizabeth+Huttner-Loan
[MIT-OCW 134] https://ocw.mit.edu/search/?q=Prof.+Justin+Reich
[MIT-OCW 135] https://ocw.mit.edu/search/?t=Finance
[MIT-OCW 136] https://ocw.mit.edu/courses/res-15-005-healthcare-finance-15-482x-spring-2019/
[MIT-OCW 137] https://ocw.mit.edu/search/?q=Prof.+Silvija+Gradecak
[MIT-OCW 138] https://ocw.mit.edu/search/?q=Prof.+Peter+Shor
[MIT-OCW 139] https://ocw.mit.edu/courses/8-370x-quantum-information-science-i-spring-2018/
[MIT-OCW 140] https://ocw.mit.edu/search/?t=Health+Care+Management
[MIT-OCW 141] https://ocw.mit.edu/search/?t=Materials+Science+and+Engineering
[MIT-OCW 142] https://ocw.mit.edu/search/?t=Algorithms+and+Data+Structures
[MIT-OCW 143] https://ocw.mit.edu/search/?t=Educational+Technology
[MIT-OCW 144] https://ocw.mit.edu/search/?q=Alyssa+Napier
[MIT-OCW 145] https://ocw.mit.edu/courses/res-cms-155-design-thinking-for-leading-and-learning-spring-2019/
[MIT-OCW 146

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/dharmendra-modha/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/david-sontag/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/david-sontag/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-subramanian-ramamoorthy


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-subramanian-ramamoorthy


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/leslie-kaebling/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/leslie-kaebling/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-pietro-perona


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-pietro-perona


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-michael-black/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-michael-black/


[Stanford-AI 43] https://ai.stanford.edu/?post_type=portfolio&p=2731&preview=true


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/?post_type=portfolio&p=2731&preview=true


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-raymond-mooney


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-raymond-mooney


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-andreas-krause


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-andreas-krause


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/matt-ginsberg/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/matt-ginsberg/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-kevin-leyton-brown/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-kevin-leyton-brown/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/sham-kakade/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/sham-kakade/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-craig-boutilier


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-craig-boutilier


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-pedro-domingos


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-pedro-domingos


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/kevin-knight/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/kevin-knight/


[Stanford-AI 43] https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-regina-barzilay/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ai.stanford.edu/portfolio-view/distinguished-speaker-series-regina-barzilay/


[Stanford-AI 43] https://ai.stanford.edu/ai-salon-event-archive/
[Stanford-AI 44] https://ai.stanford.edu/events/ai-salon/
[Stanford-AI 45] https://ai.stanford.edu/blog/rl/page/2/index.html
[Stanford-AI 46] https://ai.stanford.edu/blog/self-improving-robots/
[Stanford-AI 47] https://ai.stanford.edu/blog/meta-exploration/
[Stanford-AI 48] https://ai.stanford.edu/blog/igibson/
[Stanford-AI 49] https://ai.stanford.edu/blog/selfsupervised-multimodal/
[Stanford-AI 50] https://ai.stanford.edu/blog/cavin/
[Stanford-AI 51] https://ai.stanford.edu/blog/lili/
[Stanford-AI 52] https://ai.stanford.edu/blog/robomimic/
[Stanford-AI 53] https://ai.stanford.edu/blog/robonet/
[Stanford-AI 54] https://ai.stanford.edu/blog/gti/
[Stanford-AI 55] https://ai.stanford.edu/blog/black-box-safety-validation/
[Stanford-AI 56] https://ai.stanford.edu/blog/acl-2022/
[Stanford-AI 57] https://ai.stanford.edu/blog/emnlp-2024/
[Stanford-AI 58] https://ai.stanford.edu/blog/acl-2023/
[Stanford-AI 59] https://ai.stanford

ERROR:trafilatura.downloads:download error: https://ai.stanford.edu/blog/assets/img/posts/2021-06-21-agqa/agqaexamples.gif MAX_FILE_SIZE exceeded


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2021-06-21-agqa/introvideo.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2019-02-26-beyond_local_pattern_matching/img11.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2019-02-26-beyond_local_pattern_matching/img12.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2019-02-26-beyond_local_pattern_matching/img9.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2018-12-10-batch-active-preference-learning/image2.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2018-12-10-batch-active-preference-learning/image4.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2018-12-10-batch-active-preference-learning/image7.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2018-12-10-batch-active-preference-learning/image6.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2018-12-10-batch-active-preference-learning/image5.gif


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/blog/assets/img/posts/2021-06-20-cvpr-2021/img5


ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None


[Stanford-AI 206] https://ai.stanford.edu/~rhgao/see_hear_feel/
[Stanford-AI 207] https://ai.stanford.edu/blog/page/5/index.html
[Stanford-AI 208] https://ai.stanford.edu/~sttruong/villm/leaderboard/fairness-aware/toxicity-detection
[Stanford-AI 209] https://ai.stanford.edu/~sttruong/villm/leaderboard/fairness-aware/text-classification
[Stanford-AI 210] https://ai.stanford.edu/~sttruong/villm/leaderboard/weaker-prompt/summarization
[Stanford-AI 211] https://ai.stanford.edu/~sttruong/villm/leaderboard/robustness-aware/knowledge
[Stanford-AI 212] https://ai.stanford.edu/~sttruong/villm/leaderboard/robustness-aware/translation
[Stanford-AI 213] https://ai.stanford.edu/~sttruong/villm/leaderboard/fairness-aware/question-answering
[Stanford-AI 214] https://ai.stanford.edu/~sttruong/villm/leaderboard/few-shot/toxicity-detection
[Stanford-AI 215] https://ai.stanford.edu/~sttruong/villm/leaderboard/weaker-prompt/question-answering
[Stanford-AI 216] https://ai.stanford.edu/~sttruong/villm/leade

ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://online.stanford.edu/courses


✅ Stanford-Online done. 0 pages scraped.


[Berkeley-EECS 1] https://eecs.berkeley.edu/
✅ Berkeley-EECS done. 1 pages scraped.


[Berkeley-BAIR 1] https://bair.berkeley.edu/blog/
[Berkeley-BAIR 2] https://bair.berkeley.edu/blog/2025/03/25/rl-av-smoothing/
[Berkeley-BAIR 3] https://bair.berkeley.edu/blog/2025/04/11/prompt-injection-defense/
[Berkeley-BAIR 4] https://bair.berkeley.edu/blog/2024/08/28/strong-reject/
[Berkeley-BAIR 5] https://bair.berkeley.edu/blog/2024/05/29/tiny-agent/
[Berkeley-BAIR 6] https://bair.berkeley.edu/blog/2024/09/20/linguistic-bias/
[Berkeley-BAIR 7] https://bair.berkeley.edu/blog/page2/
[Berkeley-BAIR 8] https://bair.berkeley.edu/blog/2025/04/08/plaid/
[Berkeley-BAIR 9] https://bair.berkeley.edu/blog/2024/03/11/grads-2024/
[Berkeley-BAIR 10] https://bair.berkeley.edu/blog/2024/03/21/xt/
[Berkeley-BAIR 11] https://bair.berkeley.edu/blog/2024/11/12/virutal-persona-llm/
[Berkeley-BAIR 12] https://bair.berkeley.edu/blog/subscribe/
[Berkeley-BAIR 13] https://bair

ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, "html.parser")


[Berkeley-BAIR 16] https://bair.berkeley.edu/blog/2023/07/10/stepwise-ssl/
[Berkeley-BAIR 17] https://bair.berkeley.edu/blog/2023/10/17/grif/
[Berkeley-BAIR 18] https://bair.berkeley.edu/blog/2023/01/20/relmm/
[Berkeley-BAIR 19] https://bair.berkeley.edu/blog/2020/04/27/ingredients/
[Berkeley-BAIR 20] https://bair.berkeley.edu/blog/2023/05/23/lmd/
[Berkeley-BAIR 21] https://bair.berkeley.edu/blog/page3/
[Berkeley-BAIR 22] https://bair.berkeley.edu/blog/2023/10/16/p3o/
[Berkeley-BAIR 23] https://bair.berkeley.edu/blog/2023/07/14/ddpo/
[Berkeley-BAIR 24] https://bair.berkeley.edu/blog/2023/06/29/coarsenconf/
[Berkeley-BAIR 25] https://bair.berkeley.edu/blog/2023/04/06/ifl/
[Berkeley-BAIR 26] https://bair.berkeley.edu/blog/2023/04/03/koala/
[Berkeley-BAIR 27] https://bair.berkeley.edu/blog/2023/11/14/fcnn/
[Berkeley-BAIR 28] https://bair.berkeley.edu/blog/2020/11/05/arm/
[Berkeley-BAIR 29] https://bair.berkeley.edu/blog/2021/10/22/mural/
[Berkeley-BAIR 30] https://bair.berkeley.edu/blog/2



[Berkeley-BAIR 167] https://bair.berkeley.edu/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://bair.berkeley.edu/faculty.html


[Berkeley-BAIR 167] https://bair.berkeley.edu/faculty.html




[Berkeley-BAIR 167] https://bair.berkeley.edu/students.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://bair.berkeley.edu//people.eecs.berkeley.edu/~pcmoritz


[Berkeley-BAIR 167] https://bair.berkeley.edu//people.eecs.berkeley.edu/~pcmoritz
[Berkeley-BAIR 167] https://bair.berkeley.edu/blog/page5/
[Berkeley-BAIR 168] https://bair.berkeley.edu/blog/page6/
[Berkeley-BAIR 169] https://bair.berkeley.edu/blog/page7/
[Berkeley-BAIR 170] https://bair.berkeley.edu/blog/page8/
[Berkeley-BAIR 171] https://bair.berkeley.edu/blog/page9/
[Berkeley-BAIR 172] https://bair.berkeley.edu/blog/page10/
[Berkeley-BAIR 173] https://bair.berkeley.edu/blog/page11/
[Berkeley-BAIR 174] https://bair.berkeley.edu/blog/page12/
✅ Berkeley-BAIR done. 174 pages scraped.


[CMU-CS 1] https://www.cs.cmu.edu/
[CMU-CS 2] https://www.cs.cmu.edu/directory
[CMU-CS 3] https://www.cs.cmu.edu/outreach/summer-research-opportunities


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 4] https://www.cs.cmu.edu/news/feed.rss
[CMU-CS 4] https://www.cs.cmu.edu/giving
[CMU-CS 5] https://www.cs.cmu.edu/overview-programs
[CMU-CS 6] https://www.cs.cmu.edu/scs-distinguished-lecture-series
[CMU-CS 7] https://www.cs.cmu.edu/research
[CMU-CS 8] https://www.cs.cmu.edu/calendar
[CMU-CS 9] https://www.cs.cmu.edu//www.qatar.cmu.edu


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu//www.qatar.cmu.edu


[CMU-CS 9] https://www.cs.cmu.edu/people/achievements/newell
[CMU-CS 10] https://www.cs.cmu.edu/publications
[CMU-CS 11] https://www.cs.cmu.edu/academics/course-listings-department
[CMU-CS 12] https://www.cs.cmu.edu/directory/
[CMU-CS 13] https://www.cs.cmu.edu/directory/hcii
[CMU-CS 14] https://www.cs.cmu.edu/directory/s3d
[CMU-CS 15] https://www.cs.cmu.edu/directory/all
[CMU-CS 16] https://www.cs.cmu.edu/directory/mld
[CMU-CS 17] https://www.cs.cmu.edu/directory/ri
[CMU-CS 18] https://www.cs.cmu.edu/key-contacts
[CMU-CS 19] https://www.cs.cmu.edu/directory/lti
[CMU-CS 20] https://www.cs.cmu.edu/directory/computing
[CMU-CS 21] https://www.cs.cmu.edu/directory/dean
[CMU-CS 22] https://www.cs.cmu.edu/directory/cbd
[CMU-CS 23] https://www.cs.cmu.edu/directory/csd
[CMU-CS 24] https://www.cs.cmu.edu/funds/scs-outreach-fund
[CMU-CS 25] https://www.cs.cmu.edu/funds/dean-s-innovation-fund-scs
[CMU-CS 26] https://www.cs.cmu.edu/funds/scs-general-scholarship-fund
[CMU-CS 27] https://www.cs.cmu.

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/~music/mat/


[CMU-CS 30] https://www.cs.cmu.edu/~music/mat/
[CMU-CS 30] https://www.cs.cmu.edu/bsai


ERROR:trafilatura.downloads:download error: https://www.cs.cmu.edu/bsai HTTPSConnectionPool(host='www.cs.cmu.edu', port=443): Max retries exceeded with url: https://www.cs.cmu.edu/bs-in-artificial-intelligence (Caused by ResponseError('too many redirects'))
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/~music/mat/master.html


[CMU-CS 30] https://www.cs.cmu.edu/~music/mat/master.html
[CMU-CS 30] https://www.cs.cmu.edu/doctoral-programs
[CMU-CS 31] https://www.cs.cmu.edu/masters-programs
[CMU-CS 32] https://www.cs.cmu.edu/news
[CMU-CS 33] https://www.cs.cmu.edu/scs-tour-policy
[CMU-CS 34] https://www.cs.cmu.edu/bs-in-artificial-intelligence/
[CMU-CS 35] https://www.cs.cmu.edu/calendar/export.ics


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 35] https://www.cs.cmu.edu/calendar/feed/rss.xml


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 35] https://www.cs.cmu.edu/partnerships?utm_source=internal&utm_medium=web&utm_campaign=pships-in-menu


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 36] https://www.cs.cmu.edu/feed/rss.xml
[CMU-CS 36] https://www.cs.cmu.edu/people/achievements/newell/newellmedal_winners
[CMU-CS 37] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/intel95.html
[CMU-CS 38] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/hci.html
[CMU-CS 39] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/hopper94.html
[CMU-CS 40] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/inventing.html
[CMU-CS 41] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/ibm.html
[CMU-CS 42] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/general.html
[CMU-CS 43] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/rtmach.html
[CMU-CS 44] https://www.cs.cmu.edu/~fox/publications.html
[CMU-CS 45] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/acm.html
[CMU-CS 46] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/copetas/www/public/video/simon96.

ERROR:trafilatura.downloads:download error: https://www.cs.cmu.edu/scs-ece-career-center/ HTTPSConnectionPool(host='www.cs.cmu.edu', port=443): Max retries exceeded with url: https://www.cs.cmu.edu/scs-career-center (Caused by ResponseError('too many redirects'))


[CMU-CS 139] https://www.cs.cmu.edu/index
[CMU-CS 140] https://www.cs.cmu.edu/partnerships/meet-our-team
[CMU-CS 141] https://www.cs.cmu.edu/partnerships/forms/contact
[CMU-CS 142] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/RTtimer.html
[CMU-CS 143] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach.html
[CMU-CS 144] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/RTM.html
[CMU-CS 145] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/RTSync.html
[CMU-CS 146] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/documents_top.html


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.cs.cmu.edu/afs/cs/usr/grm/www/home.html


[CMU-CS 147] https://www.cs.cmu.edu/afs/cs/usr/grm/www/home.html
[CMU-CS 147] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/publications.html
[CMU-CS 148] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/osf.html
[CMU-CS 149] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/tutorials.html
[CMU-CS 150] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/books.html
[CMU-CS 151] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/install.html
[CMU-CS 152] https://www.cs.cmu.edu/bs-in-artificial-intelligence
[CMU-CS 153] https://www.cs.cmu.edu/~ref/Universal-Translator.html
[CMU-CS 154] https://www.cs.cmu.edu/cih/
[CMU-CS 155] https://www.cs.cmu.edu/cmlh/
[CMU-CS 156] https://www.cs.cmu.edu/wrc/
[CMU-CS 157] https://www.cs.cmu.edu/initiatives/ai-maker-space/
[CMU-CS 158] https://www.cs.cmu.edu/~wasm/wasm-research-day-2024.html
[CMU-CS 159] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/mach/public/www/FAQ.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs.cmu.edu/project/art-6/www/rtmach.html


[CMU-CS 160] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/art-6/www/rtmach.html
[CMU-CS 160] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/mach/public/www/overview.html
[CMU-CS 161] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/people-cur.html
[CMU-CS 162] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/people-former.html
[CMU-CS 163] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/mach/public/www/status.html
[CMU-CS 164] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/sources/sources_top.html
[CMU-CS 165] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/mach/public/www/projects/mach_us.html


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.cs.cmu.edu/afs/cs.cmu.edu/local/mosaic/common/omega/Web/SCS-HOME.html


[CMU-CS 166] https://www.cs.cmu.edu/afs/cs.cmu.edu/local/mosaic/common/omega/Web/SCS-HOME.html
[CMU-CS 166] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/bchen.thesis.html
[CMU-CS 167] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/asplosVM.html
[CMU-CS 168] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/mach3_intro.html
[CMU-CS 169] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/ipc.html
[CMU-CS 170] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/multiserver_interface.html
[CMU-CS 171] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/benchmark.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/system.address.tracing.html File:


[CMU-CS 172] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/system.address.tracing.html File:
[CMU-CS 172] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/machsys.html
[CMU-CS 173] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/cpuserver.html
[CMU-CS 174] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/netperf.html
[CMU-CS 175] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/netmemorysrv.html
[CMU-CS 176] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/pagereplace.html
[CMU-CS 177] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/user.level.protocols.html
[CMU-CS 178] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/midwaytr.html
[CMU-CS 179] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/threads87.html
[CMU-CS 180] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/newpktfilter.html
[CMU-CS

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/mach/public/doc/unpublished/abstracts/mach-in-x11.html


[CMU-CS 183] https://www.cs.cmu.edu/afs/cs/project/mach/public/doc/unpublished/abstracts/mach-in-x11.html
[CMU-CS 183] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/cont_threads.html
[CMU-CS 184] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/mig.html
[CMU-CS 185] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/interpossys.html
[CMU-CS 186] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/MIvmm.html
[CMU-CS 187] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/debugger.html
[CMU-CS 188] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/dospaper.html
[CMU-CS 189] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/atm.html
[CMU-CS 190] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/mach_us-multiserver.html
[CMU-CS 191] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/monmanual.html
[CMU-CS 192] https://www.cs.cmu.e

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/mach/public/doc/unpublished/abstracts/datamovement.html


[CMU-CS 196] https://www.cs.cmu.edu/afs/cs/project/mach/public/doc/unpublished/abstracts/datamovement.html
[CMU-CS 196] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/IPCperf.html
[CMU-CS 197] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/defaultmm.html
[CMU-CS 198] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/os-memorysys.html
[CMU-CS 199] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/envmgr.html
[CMU-CS 200] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/cmultithread.html
[CMU-CS 201] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/dos-fs.html
[CMU-CS 202] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/non-blocking.html
[CMU-CS 203] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/mdos.html
[CMU-CS 204] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/abstracts/manual.html
[CMU-CS 205] https://www.cs.cmu.edu/

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/ftp.release


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/doc.info


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/i386_announce


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/HURD.info


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/linux.info


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/rs6k_announce


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/mailing-lists.info


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/platforms


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/MacMach.demise


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/license.info


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/mach.books


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/NeXT.release


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/mach3_supinfo


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 221] https://www.cs.cmu.edu/afs/cs/project/mach/public/FAQ/distribution.info
[CMU-CS 221] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/danner/www/danner.html


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.cs.cmu.edu/afs/cs.cmu.edu/user/grm/www/home.html


[CMU-CS 222] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/grm/www/home.html
[CMU-CS 222] https://www.cs.cmu.edu/afs/cs.cmu.edu/user/mrt/www/home.html
[CMU-CS 223] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/us-install.html
[CMU-CS 224] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/libus-ref-1192.html
[CMU-CS 225] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/mach_us_whatis.html
[CMU-CS 226] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/usenix-cpp-92.html
[CMU-CS 227] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/overview-0791.html
[CMU-CS 228] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/naming-0891.html
[CMU-CS 229] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/us_analyze.html
[CMU-CS 230] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/mach_us/abstracts/i-wooos-91.html
[CMU-CS 231] https://www.cs.cmu.edu/afs

ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 233] https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/SOSP15.ps


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/WWW/MultiC/WWW/top.html


[CMU-CS 233] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/WWW/MultiC/WWW/top.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/ParityLogging/tr94-170.abstract


[CMU-CS 233] https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/ParityLogging/tr94-170.abstract
[CMU-CS 233] https://www.cs.cmu.edu/afs/cs.cmu.edu/project/mach/public/www/mach.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/ParityLogging/tr94-170.ps


[CMU-CS 234] https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/ParityLogging/tr94-170.ps


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 234] https://www.cs.cmu.edu/afs/cs/project/nectar-io/ftp/ParityLogging/TR93-200.ps


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/afs/cs/project/pdl/WWW/HTML-Papers/Compcon95/final.fm.html


[CMU-CS 234] https://www.cs.cmu.edu/afs/cs/project/pdl/WWW/HTML-Papers/Compcon95/final.fm.html
[CMU-CS 234] https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/SOSP15_abstract.html


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


[CMU-CS 235] https://www.cs.cmu.edu/afs/cs/project/pdl/ftp/SPFS/Compcon95.ps
[CMU-CS 235] https://www.cs.cmu.edu/afs/cs/project/nectar-io/ftp/ParityLogging/ISCA93.abstract
[CMU-CS 236] https://www.cs.cmu.edu/afs/cs/project/mach/public/www/projects/mach_us.html
[CMU-CS 237] https://www.cs.cmu.edu/index.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/index.html


[CMU-CS 237] https://www.cs.cmu.edu/Publications/publications.html


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.cs.cmu.edu/Publications/publications.html


✅ CMU-CS done. 236 pages scraped.


[CMU-ML 1] https://www.ml.cmu.edu/research/index.html
[CMU-ML 2] https://www.ml.cmu.edu/academics/minor-senior-projects.html
[CMU-ML 3] https://www.ml.cmu.edu//www.cmu.edu/legal/
[CMU-ML 4] https://www.ml.cmu.edu/research/data-analysis-projects.html
[CMU-ML 5] https://www.ml.cmu.edu/people/core-faculty.html
[CMU-ML 6] https://www.ml.cmu.edu/research/phd-dissertations.html
[CMU-ML 7] https://www.ml.cmu.edu/research/technical-reports.html
[CMU-ML 8] https://www.ml.cmu.edu//www.cmu.edu/
[CMU-ML 9] https://www.ml.cmu.edu/index
[CMU-ML 10] https://www.ml.cmu.edu/Faculty_Hiring.html
[CMU-ML 11] https://www.ml.cmu.edu
[CMU-ML 12] https://www.ml.cmu.edu/current-students/student-orientation/


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML


[CMU-ML 13] https://www.ml.cmu.edu/cmsint/mldcmu.rss


ERROR:trafilatura.core:empty HTML tree: None


[CMU-ML 13] https://www.ml.cmu.edu/directions
[CMU-ML 14] https://www.ml.cmu.edu/about/contact-us.html
[CMU-ML 15] https://www.ml.cmu.edu/resources/
[CMU-ML 16] https://www.ml.cmu.edu/current-students/
[CMU-ML 17] https://www.ml.cmu.edu/current-students/frequently-asked-questions.html
✅ CMU-ML done. 17 pages scraped.

