In [296]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import unidecode

- Retrieve the Wikipedia page for "Python" and create a list of the links on that page: `url = 'https://en.wikipedia.org/wiki/Python'`

#### Python Page

In [7]:
url = "https://en.wikipedia.org/wiki/Python"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [9]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [35]:
anchors = soup.find_all('a')
# Keep only hrefs that start with "http"; anchors without an href yield None,
# which str() turns into "None" and is therefore filtered out.
links = [
    href
    for href in (anchor.get('href') for anchor in anchors)
    if str(href).startswith('http')
]

In [39]:
print(links[:5])  # preview the first five collected links

['https://en.wiktionary.org/wiki/Python', 'https://en.wiktionary.org/wiki/python', 'https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/Python&namespace=0', 'https://en.wikipedia.org/w/index.php?title=Python&oldid=1077300299', 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en']


- Find the number of titles that have changed in the United States Code since its last release point: `url = 'http://uscode.house.gov/download/download.shtml'`

In [40]:
url = "http://uscode.house.gov/download/download.shtml"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [41]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [75]:
# The element id contains slashes ("us/usc/t16"), which have to be
# backslash-escaped inside a CSS selector. A raw string keeps those backslashes
# literal; the plain string "\/" is an unrecognised Python escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on current interpreters.
# NOTE(review): the task stated above asks for the number of changed titles;
# this cell only extracts the last word of the Title 16 entry — confirm intent.
name = soup.select(r"#us\/usc\/t16")
name[0].text.strip().split(" ")[-1]

'Conservation'

- Create a Python list with the names on the FBI's Ten Most Wanted list: `url = 'https://www.fbi.gov/wanted/topten'`

In [76]:
url = "https://www.fbi.gov/wanted/topten"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [77]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [99]:
# Select the <ul> that is a direct child of the query-results container, then
# collect its <li> entries.
# NOTE(review): the container id looks auto-generated and may change between
# page versions — verify before re-running.
result_lists = soup.select("#query-results-0f737222c5054a81a120bce207b0446a > ul ")
criminals = result_lists[0].find_all('li')

In [102]:
# The text of the second <a> inside each list item is used as the name.
[entry.find_all('a')[1].text for entry in criminals]

['ALEXIS FLORES',
 'JOSE RODOLFO VILLARREAL-HERNANDEZ',
 'RAFAEL CARO-QUINTERO',
 'YULAN ADONAY ARCHAGA CARIAS',
 'EUGENE PALMER',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'ALEJANDRO ROSALES CASTILLO',
 'ARNOLDO JIMENEZ',
 'JASON DEREK BROWN',
 'OCTAVIANO JUAREZ-CORRO']

- Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: `url = 'https://www.emsc-csem.org/Earthquake/'`

In [126]:
url = "https://www.emsc-csem.org/Earthquake/"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [127]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [217]:
# Take the first element whose id is "tbody" and list every <tr> inside it.
tbody_matches = soup.select('#tbody')
body = tbody_matches[0]
rows = body.find_all('tr')

In [239]:
# One accumulator list per output column.
date, time, latitude, longitude, magnitude, region_name = ([] for _ in range(6))

# Only the first 20 table rows are used. Each value is cut out of a cell's text
# by fixed cell index / character offset, so this depends on the table layout
# at scrape time.
for tr in rows[:20]:
    cells = [td.text for td in tr.find_all('td')]
    stamp = cells[3]  # one cell holds both the date and the time

    date.append(stamp[10:20])
    time.append(stamp[23:33])
    # Value and hemisphere letter live in neighbouring cells; the slices drop
    # trailing characters from each before joining them with a space.
    latitude.append(f"{cells[4][:-1]} {cells[5][:-2]}")
    longitude.append(f"{cells[6][:-1]} {cells[7][:-2]}")
    magnitude.append(float(cells[10]))
    region_name.append(cells[11][1:])  # skip the first character of the cell text

earthquakes = pd.DataFrame(
    {
        "date": date,
        "time": time,
        "latitude": latitude,
        "longitude": longitude,
        "magnitude": magnitude,
        "region_name": region_name,
    }
)

In [241]:
# Display the assembled earthquake table.
earthquakes

Unnamed: 0,date,time,latitude,longitude,magnitude,region_name
0,2022-04-25,13:20:05.0,33.82 S,71.65 W,2.9,"VALPARAISO, CHILE"
1,2022-04-25,13:07:59.0,3.83 S,127.60 E,3.7,"SERAM, INDONESIA"
2,2022-04-25,13:06:17.2,41.86 N,22.89 E,2.7,REPUBLIC OF NORTH MACEDONIA
3,2022-04-25,12:50:22.1,43.71 N,7.63 W,1.8,SPAIN
4,2022-04-25,12:36:33.0,3.18 S,140.24 E,3.3,"PAPUA, INDONESIA"
5,2022-04-25,12:35:06.2,19.23 N,155.40 W,2.1,"ISLAND OF HAWAII, HAWAII"
6,2022-04-25,12:28:39.0,20.90 S,67.81 W,2.9,"POTOSI, BOLIVIA"
7,2022-04-25,12:13:30.0,2.42 S,139.46 E,3.5,"NEAR N COAST OF PAPUA, INDONESIA"
8,2022-04-25,11:45:48.0,23.99 S,66.93 W,2.6,"JUJUY, ARGENTINA"
9,2022-04-25,11:37:23.0,18.07 N,67.15 W,2.2,PUERTO RICO


- List all language names and the number of related articles in the order they appear on [wikipedia.org](https://www.wikipedia.org/): `url = 'https://www.wikipedia.org/'`

In [250]:
url = "https://www.wikipedia.org/"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
# A bare `response.status_code` in the middle of a cell is evaluated and then
# discarded, so it checks nothing. Fail loudly on 4xx/5xx responses instead of
# silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [273]:
# First "central-featured" div directly under the #www-wikipedia-org element;
# every <div> found inside it is treated as one language entry.
featured = soup.select("#www-wikipedia-org > div.central-featured")[0]
languages = featured.find_all('div')

In [304]:
name = []
numbers = []

for entry in languages:
    name.append(entry.find('strong').text)

    # Take the first space-separated token of the <small> text and drop its
    # last character.
    # NOTE(review): presumably the token looks like "6 458 000+" with non-ASCII
    # spaces between digit groups — confirm against the live page.
    count_token = entry.find('small').text.split(" ")[:-1][0]
    # unidecode maps the token to plain ASCII so the digit groups can be split
    # on ordinary spaces and glued back together.
    ascii_count = unidecode.unidecode(count_token[:-1])
    numbers.append(int("".join(ascii_count.split(" "))))

pd.DataFrame({"language": name, "number_of_articles": numbers})

Unnamed: 0,language,number_of_articles
0,English,6458000
1,Русский,1798000
2,日本語,1314000
3,Deutsch,2667000
4,Español,1755000
5,Français,2400000
6,中文,1256000
7,Italiano,1742000
8,Português,1085000
9,Polski,1512000


- Create a list of the different kinds of datasets available on [data.gov.uk](https://data.gov.uk/): `url = 'https://data.gov.uk/'`

In [305]:
url = "https://data.gov.uk/"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
# A bare `response.status_code` in the middle of a cell is evaluated and then
# discarded, so it checks nothing. Fail loudly on 4xx/5xx responses instead of
# silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [316]:
# The text of every <h3> heading on the page is taken as a dataset category.
[heading.text for heading in soup.find_all("h3")]

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

- Display the top 10 languages by number of native speakers stored in a pandas dataframe: `url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'`

In [317]:
url = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would stall the whole notebook run.
response = requests.get(url, timeout=10)
# A bare `response.status_code` in the middle of a cell is evaluated and then
# discarded, so it checks nothing. Fail loudly on 4xx/5xx responses instead of
# silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [326]:
# Use the second <tbody> on the page; skip its first row and keep the next ten.
ranking_body = soup.find_all('tbody')[1]
top_10 = ranking_body.find_all('tr')[1:11]

In [340]:
language = []
n_speakers = []

for tr in top_10:
    cells = tr.find_all('td')
    # Second cell: language name; third cell: speaker count.
    language.append(cells[1].text.strip())
    # Parse as float first, then truncate to a whole number.
    n_speakers.append(int(float(cells[2].text.strip())))

pd.DataFrame(
    {
        "language": language,
        "number of speakers in millions so you can easily read it ": n_speakers,
    }
)

Unnamed: 0,language,number of speakers in millions so you can easily read it
0,Mandarin Chinese,918
1,Spanish,480
2,English,379
3,Hindi (sanskritised Hindustani)[11],341
4,Bengali,300
5,Portuguese,221
6,Russian,154
7,Japanese,128
8,Western Punjabi[12],92
9,Marathi,83
