# Extracting Documents from Wikipedia and HTML Files

In [1]:
import os

import requests
import wikipedia
from bs4 import BeautifulSoup

In [None]:
topic = "Alan Turing"
print(f"Fetching topic: {topic}")

## NOTE: auto_suggest must be set to False. With auto-suggestion enabled the
## library prioritizes its suggested title over the exact title, which was
## observed to fail with "PageError: Page id <topic> does not match any pages.
## Try another id!" regardless of the topic selected.
## Despite the variable name, `.content` is the full plain-text article,
## not just the summary section.
summary = wikipedia.page(topic, auto_suggest=False).content
print("Fetched summary length:", len(summary))

## Make sure the output directory exists; open() will not create it.
os.makedirs("./documents", exist_ok=True)

## Save the Wikipedia article to a .txt file
with open("./documents/alan_turing.txt", "w", encoding="utf-8") as f:
    f.write(summary)

Fetching topic: Alan Turing
Fetched summary length: 52740


In [None]:
topic = "Nikola Tesla"
print(f"Fetching topic: {topic}")

## NOTE: auto_suggest must be set to False. With auto-suggestion enabled the
## library prioritizes its suggested title over the exact title, which was
## observed to fail with "PageError: Page id <topic> does not match any pages.
## Try another id!" regardless of the topic selected.
## Despite the variable name, `.content` is the full plain-text article,
## not just the summary section.
summary = wikipedia.page(topic, auto_suggest=False).content
print("Fetched summary length:", len(summary))

## Make sure the output directory exists; open() will not create it.
os.makedirs("./documents", exist_ok=True)

## Save the Wikipedia article to a .txt file
with open("./documents/nikola_tesla.txt", "w", encoding="utf-8") as f:
    f.write(summary)

Fetching topic: Nikola Tesla
Fetched summary length: 53700


In [None]:
url = "https://www.climaterealityproject.org/blog/congress-living-fantasy-world-budget-bill"
# Send a browser-like User-Agent instead of the default one used by requests.
headers = {"User-Agent": "Mozilla/5.0"}
# Always pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server and the cell would hang.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the main article container: prefer an <article> element and fall
    # back to the <div class="node__content"> wrapper when none is present.
    article = soup.find("article") or soup.find("div", class_="node__content")

    if article:
        # Make sure the output directory exists; open() will not create it.
        os.makedirs("./documents", exist_ok=True)

        # Save just the article HTML (with tags) to a file
        with open("./documents/climate_change.html", "w", encoding="utf-8") as f:
            f.write(str(article))
        print("Article HTML saved as 'climate_change.html'")
    else:
        print("Could not find the main article content.")
else:
    print('Failed to retrieve the document')



Article HTML saved as 'climate_change.html'
