In [None]:
# colab has an older version of beautifulsoup by default
# here we upgrade it
# if you are working on your own computer, you can probably comment this step out and skip it
!pip install --upgrade beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 6.5 MB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed beautifulsoup4-4.11.1 soupsieve-2.3.2.post1


In [None]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [None]:
# 2. find url and store it in a variable
# (this url is what gets downloaded and parsed in the next steps)
url = "https://en.wikipedia.org/wiki/Berlin"

In [None]:
# 3. download html with a get request
# the timeout (in seconds) stops the cell from hanging forever if the server never answers
response = requests.get(url, timeout=30)
response.status_code # 200 status code means OK!

200

In [None]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
# only the beginning of the document is printed: displaying the whole page floods the notebook output
print(str(soup)[:1000])

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Berlin - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"b47f6881-44d6-44b8-9783-ecb14eadf7eb","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Berlin","wgTitle":"Berlin","wgCurRevisionId":1119451206,"wgRevisionId":1119451206,"wgArticleId":3354,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 German-language sources (de)","CS1 maint: multiple names: authors list","CS1 maint: bot: original URL status unknown","Webarchive template wayback links","All articles with

In [None]:
# 5. retrieve/extract the desired info (here, you'll paste the "Selector" you copied before to get the element that holds the information you want)

# let's first try to get the name of the city
# by copying the selector we can see that it has the id firstHeading (it also has a class by the same name!)
# select() gives back a list of every element that matches the selector
soup.select("#firstHeading")

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Berlin</span></h1>]

In [None]:
# take the first element of the list and keep only its text, without the html tags
soup.select("#firstHeading")[0].get_text()

'Berlin'

In [None]:
# Let's use the class infobox-data to target the country
# the first element with this class holds the country name
# note: the text can come back with a leading non-breaking space (\xa0) that needs stripping
soup.select(".infobox-data")[0].get_text()

'\xa0Germany'

In [None]:
#soup.select(".infobox-data")[0].get_text()

Now we just carry on exploring the HTML, finding classes, ids, and selectors to target the information we need. Hopefully these classes and selectors will be the same across all city pages on Wikipedia, but it is likely that they will differ in a few places, so we will have to make our code robust to this.

In [None]:
# column order of the DataFrame returned by recreate_wiki
# fixing it here keeps the layout the same even when a field is missing for every city
_WIKI_COLUMNS = ['city', 'country', 'latitude', 'longitude',
                 'elevation_in_meters', 'elevation', 'website', 'population']

# degrees are mandatory, minutes and seconds are optional (e.g. 52°31′12″N or 41°23′N)
_DMS_PATTERN = re.compile(
    r"(?P<deg>\d+(?:\.\d+)?)\s*°\s*"
    r"(?:(?P<min>\d+(?:\.\d+)?)\s*[′']\s*)?"
    r'(?:(?P<sec>\d+(?:\.\d+)?)\s*[″"]\s*)?'
    r"(?P<hemi>[NSEW])"
)

# a number directly followed by the unit m, wherever it sits: "34 m (112 ft)" or "36 ft (11 m)"
# a minus sign only counts when it is not glued to a previous digit (so "0-233 m" gives 233)
_METERS_PATTERN = re.compile(r'((?:(?<!\d)[-−])?\d[\d,]*(?:\.\d+)?)\s*m\b')


def _text_or_none(tag):
  """Return the stripped text of a tag, or None when the tag was not found."""
  return tag.get_text().strip() if tag is not None else None


def _infobox_value(soup, label):
  """Return the text of the infobox-data cell following the infobox label, or None."""
  label_cell = soup.select_one(f'.infobox-label:-soup-contains("{label}")')
  if label_cell is None:
    return None
  return _text_or_none(label_cell.find_next(class_='infobox-data'))


def _population(soup):
  """Return the first text containing digits in the row after the Population header, or None."""
  header = soup.select_one('th.infobox-header:-soup-contains("Population")')
  if header is None or header.parent is None:
    return None
  next_row = header.parent.find_next_sibling()
  if next_row is None:
    return None
  # string= is the current name of the argument formerly called text=
  figure = next_row.find(string=re.compile(r'\d+'))
  return str(figure).strip() if figure is not None else None


def _dms_to_decimal(coordinate):
  """Convert a degrees/minutes/seconds string to signed decimal degrees, or None if unparseable."""
  if not coordinate:
    return None
  match = _DMS_PATTERN.fullmatch(coordinate.strip())
  if match is None:
    return None
  value = (float(match['deg'])
           + float(match['min'] or 0) / 60
           + float(match['sec'] or 0) / 3600)
  # south and west are the negative half of each axis
  if match['hemi'] in 'SW':
    value = -value
  return round(value, 6)


def _elevation_in_meters(elevation):
  """Pull the value expressed in meters out of an elevation text, or None if there is none."""
  if not elevation:
    return None
  match = _METERS_PATTERN.search(elevation)
  if match is None:
    return None
  return float(match.group(1).replace(',', '').replace('−', '-'))


def recreate_wiki(cities):
  """Scrape basic facts about each city from its English Wikipedia page.

  Parameters
  ----------
  cities : iterable of str
    city names, used as the last part of https://en.wikipedia.org/wiki/<city>

  Returns
  -------
  pandas.DataFrame
    one row per city with the columns city, country, latitude, longitude,
    elevation_in_meters, elevation, website and population (always in this order).
    latitude and longitude are decimal degrees (negative for south / west),
    elevation_in_meters is a number, elevation keeps the original text.
    A field that cannot be found on the page is left empty instead of raising,
    and an empty list of cities gives an empty DataFrame with the same columns.

  Raises
  ------
  requests.exceptions.RequestException
    if a page cannot be downloaded (for example when the request times out)
  """
  # empty list that will be filled with one dictionary of information per city
  list_for_df = []

  # begin a for loop to create a dictionary of information for each city
  for city in cities:
    # we can use the universal nature of wikipedias urls to our advantage here
    # all of the urls are the same besides the city name
    url = f'https://en.wikipedia.org/wiki/{city}'

    # here we make our soup for the city
    # the timeout stops the loop from hanging forever on an unresponsive server
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    # not all of the wikipedia pages contain elevation, look at Hamburg
    # the helper gives back None in that case, so our code can continue
    elevation = _infobox_value(soup, 'Elevation')

    # here we fill the dictionary with information using the ids, classes, and selectors that we found in the html
    # select_one gives back None (instead of raising) when nothing matches
    list_for_df.append({
      'city': _text_or_none(soup.select_one('.firstHeading')),
      'country': _text_or_none(soup.select_one('.infobox-data')),
      'latitude': _dms_to_decimal(_text_or_none(soup.select_one('.latitude'))),
      'longitude': _dms_to_decimal(_text_or_none(soup.select_one('.longitude'))),
      'elevation_in_meters': _elevation_in_meters(elevation),
      'elevation': elevation,
      'website': _infobox_value(soup, 'Website'),
      'population': _population(soup),
    })

  # make the DataFrame and return it
  # passing the columns explicitly keeps every column present, even for an empty list
  return pd.DataFrame(list_for_df, columns=_WIKI_COLUMNS)

In [None]:
# try the function on a handful of cities
# the call is the last line of the cell, so the returned DataFrame is shown as the output
list_of_cities = ['Berlin', 'Hamburg', 'London', 'Manchester', 'Barcelona']
recreate_wiki(list_of_cities)

Unnamed: 0,city,country,latitude,longitude,elevation_in_meters,elevation,website,population
0,Berlin,Germany,52.3112,13.2418,34,34 m (112 ft),www.berlin.de/en/,3769495
1,Hamburg,Germany,53.3300,10.0000,,,www.hamburg.de/stadt-staat/,1845229
2,London,United Kingdom,51.3026,0.739,36 ft (11,36 ft (11 m),www.london.gov.uk,8799800
3,Manchester,United Kingdom,53.2846,2.1443,38,38 m (125 ft),manchester.gov.uk,552858
4,Barcelona,Spain,41.23N,2.11E,12,12 m (39 ft),www.barcelona.cat,1620343
