In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'

In [2]:
# URL of the page containing the top 100 songs
url = "https://en.wikipedia.org/wiki/Python"

# Send a GET request to the URL
response = requests.get(url)
response

<Response [200]>

In [3]:
#parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [4]:
# Select all <a> elements with href attribute
links = soup.select('a[href]')

# Extract and print the URLs
for link in links:
    # Extract the URL from the href attribute of the <a> tag
    link_url = link['href']
    # Check if the link starts with "http" or "https" to filter out internal links
    if link_url.startswith("http"):
        # Print the extracted URL
        print(link_url)

https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
https://af.wikipedia.org/wiki/Python
https://als.wikipedia.org/wiki/Python
https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D9%8A%D8%AB%D9%88%D9%86_(%D8%AA%D9%88%D8%B6%D9%8A%D8%AD)
https://az.wikipedia.org/wiki/Python_(d%C9%99qiql%C9%99%C5%9Fdirm%C9%99)
https://bn.wikipedia.org/wiki/%E0%A6%AA%E0%A6%BE%E0%A6%87%E0%A6%A5%E0%A6%A8_(%E0%A6%A6%E0%A7%8D%E0%A6%AC%E0%A7%8D%E0%A6%AF%E0%A6%B0%E0%A7%8D%E0%A6%A5%E0%A6%A4%E0%A6%BE_%E0%A6%A8%E0%A6%BF%E0%A6%B0%E0%A6%B8%E0%A6%A8)
https://be.wikipedia.org/wiki/Python
https://bg.wikipedia.org/wiki/%D0%9F%D0%B8%D1%82%D0%BE%D0%BD_(%D0%BF%D0%BE%D1%8F%D1%81%D0%BD%D0%B5%D0%BD%D0%B8%D0%B5)
https://cs.wikipedia.org/wiki/Python_(rozcestn%C3%ADk)
https://da.wikipedia.org/wiki/Python
https://de.wikipedia.org/wiki/Python
https://eo.wikipedia.org/wiki/Pitono_(apartigilo)
https://eu.wikipedia.org/wiki/Python_(argipena)
https://fa

# Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'

In [5]:
# URL of the United States Code download page
url = 'http://uscode.house.gov/download/download.shtml'

# Send a GET request to the URL
response = requests.get(url)

In [6]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Select all <div> elements with class 'usctitlechanged'
    titles_changed_divs = soup.select('div.usctitlechanged')
    
    # Extract the text from the selected <div> elements
    changed_titles = [div.get_text(strip=True) for div in titles_changed_divs]
    
    print("Titles that have been changed since the last release point:")
    for title in changed_titles:
        print(title)
else:
    print("Failed to retrieve page:", response.status_code)


Titles that have been changed since the last release point:
Title 8 - Aliens and Nationality
Title 10 - Armed Forces٭
Title 15 - Commerce and Trade
Title 16 - Conservation
Title 21 - Food and Drugs
Title 22 - Foreign Relations and Intercourse
Title 50 - War and National Defense


# List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'

In [7]:
# URL of the Wikipedia homepage
url = 'https://www.wikipedia.org/'

# Send a GET request to the URL
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')


language_links = soup.select('a.link-box')
language_links

# Extract language names and number of articles
language_info = []

for link in language_links:
    language_name = link.find('strong').get_text()
    num_articles = link.find('small').get_text().replace('\xa0', ' ')
    language_info.append((language_name, num_articles))
    
# Print language names and number of articles
for language, articles in language_info:
    print(f"{language}: {articles}")

English: 6,796,000+ articles
EspaÃ±ol: 1.938.000+ artÃ­culos
Ð ÑÑÑÐºÐ¸Ð¹: 1Â 969Â 000+ ÑÑÐ°ÑÐµÐ¹
æ¥æ¬èª: 1,407,000+ è¨äº
Deutsch: 2.891.000+ Artikel
FranÃ§ais: 2â¯598â¯000+ articles
Italiano: 1.853.000+ voci
ä¸­æ: 1,409,000+ æ¡ç® / æ¢ç®
ÙØ§Ø±Ø³Û: Û¹Û¹ÛµÙ¬Û°Û°Û°+ ÙÙØ§ÙÙ
PortuguÃªs: 1.120.000+ artigos


# A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'

In [8]:
# URL of the data.gov.uk search page for Crime and Justice
url = 'https://www.data.gov.uk/'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find all links within the 'div' with class 'govuk-grid-column-full'
links = soup.select('div.govuk-grid-column-full a')
links


[<a class="govuk-link" href="/search?filters%5Btopic%5D=Business+and+economy">Business and economy</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Crime+and+justice">Crime and justice</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Defence">Defence</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Education">Education</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Environment">Environment</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Government">Government</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Government+spending">Government spending</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Health">Health</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Mapping">Mapping</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Society">Society</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Towns+and+cities">Towns and cities</a>,
 <a class="govuk-link" href="/search?f

In [11]:
# Extract the href attribute (link) from each link
topic_links = [link['href'] for link in links]
topic_links    
     


['/search?filters%5Btopic%5D=Business+and+economy',
 '/search?filters%5Btopic%5D=Crime+and+justice',
 '/search?filters%5Btopic%5D=Defence',
 '/search?filters%5Btopic%5D=Education',
 '/search?filters%5Btopic%5D=Environment',
 '/search?filters%5Btopic%5D=Government',
 '/search?filters%5Btopic%5D=Government+spending',
 '/search?filters%5Btopic%5D=Health',
 '/search?filters%5Btopic%5D=Mapping',
 '/search?filters%5Btopic%5D=Society',
 '/search?filters%5Btopic%5D=Towns+and+cities',
 '/search?filters%5Btopic%5D=Transport',
 '/search?filters%5Btopic%5D=Digital+service+performance',
 '/search?filters%5Btopic%5D=Government+reference+data']

In [13]:
# We parse and extract the topic names from the URLs. This involves splitting the URLs and replacing + characters with spaces.
topics = [link.split('=')[-1].replace('+', ' ') for link in topic_links]    

# Print the list of topic names
print("Different topics or categories of datasets:")
for topic in topics:
 print(topic)   

Different topics or categories of datasets:
Business and economy
Crime and justice
Defence
Education
Environment
Government
Government spending
Health
Mapping
Society
Towns and cities
Transport
Digital service performance
Government reference data
