# Scrape Cities from Wikipedia

In [186]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Automated Scraping

In [187]:
# Cities to scrape; each entry is used verbatim as the Wikipedia article title.
cities = [
    "Berlin",
    "Paris",
    "London",
    "Madrid",
    "Rome",
    "Amsterdam",
    "Barcelona",
    "Lisbon",
    "Prague",
    "Vienna",
]

In [188]:
# Build one English-Wikipedia article URL per city, in the same order as `cities`.
url = ["https://en.wikipedia.org/wiki/" + city for city in cities]

In [189]:
# Empty result lists, one per scraped attribute (five independent list objects).
name, country, latitude, longitude, population = ([] for _ in range(5))

In [190]:
# Scrape name, country, coordinates and population from every city page.
#
# The result lists are (re)created here so that re-running this cell starts
# from empty lists instead of appending a second copy of every city.
name = []
country = []
latitude = []
longitude = []
population = []

# Link texts that mark the population entry to read from the page.
POPULATION_LABELS = ['Urban', 'Metropolitan City']
# Seconds to wait for a response before giving up on a page.
REQUEST_TIMEOUT = 10

for page_url in url:
  response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
  # Fail loudly on 4xx/5xx instead of parsing an error page as if it were an article.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, "html.parser")

  # The first <h1> holds the article title.
  name.append(soup.select("h1")[0].get_text())
  # The first `.infobox-data` element is taken as the country.
  country.append(soup.select_one(".infobox-data").get_text())
  latitude.append(soup.select_one(".latitude").get_text())
  longitude.append(soup.select_one(".longitude").get_text())

  # Look the label link up once; from it, go two levels up (presumably the
  # table row - TODO confirm) and read the first <td> found there.
  population_link = soup.find('a', string=POPULATION_LABELS)
  if population_link is not None:
    population.append(population_link.parent.parent.select('td')[0].get_text())
  else:
    # Placeholder for pages that have neither label.
    population.append("---")

In [191]:
# Print each scraped list on its own line, in a fixed order.
for scraped_values in (name, country, latitude, longitude, population):
  print(scraped_values)

['Berlin', 'Paris', 'London', 'Madrid', 'Rome', 'Amsterdam', 'Barcelona', 'Lisbon', 'Prague', 'Vienna']
['Germany', 'France', 'United Kingdom', '\xa0Spain', ' Italy[a]', 'Netherlands', '\xa0Spain', '\xa0Portugal', '\xa0Czech Republic', 'Austria']
['52°31′12″N', '48°51′24″N', '51°30′26″N', '40°25′00″N', '41°53′36″N', '52°22′N', '41°23′N', '38°43′31″N', '50°05′15″N', '48°12′N']
['13°24′18″E', '2°21′08″E', '0°7′39″W', '03°42′09″W', '12°28′58″E', '4°54′E', '2°11′E', '9°09′00″W', '14°25′17″E', '16°22′E']
['4,473,101', '10,858,852', '9,950,000', '6,211,000[2]', '4,342,212[2]', '1,558,755', '4,840,000[3]', '2,719,000[4]', '---', '1,951,354']


## Storing information in pandas DataFrames

In [192]:
# Combine the scraped lists into one table with one row per city.
# Column order is kept as-is: longitude comes before latitude.
cities_df = pd.DataFrame(
    data=dict(
        name=name,
        country=country,
        longitude=longitude,
        latitude=latitude,
        population=population,
    )
)

In [193]:
# Display up to the first 20 rows of the table (it has 10, so all are shown).
cities_df.head(20)

Unnamed: 0,name,country,longitude,latitude,population
0,Berlin,Germany,13°24′18″E,52°31′12″N,4473101
1,Paris,France,2°21′08″E,48°51′24″N,10858852
2,London,United Kingdom,0°7′39″W,51°30′26″N,9950000
3,Madrid,Spain,03°42′09″W,40°25′00″N,"6,211,000[2]"
4,Rome,Italy[a],12°28′58″E,41°53′36″N,"4,342,212[2]"
5,Amsterdam,Netherlands,4°54′E,52°22′N,1558755
6,Barcelona,Spain,2°11′E,41°23′N,"4,840,000[3]"
7,Lisbon,Portugal,9°09′00″W,38°43′31″N,"2,719,000[4]"
8,Prague,Czech Republic,14°25′17″E,50°05′15″N,---
9,Vienna,Austria,16°22′E,48°12′N,1951354


In [194]:
# Print the column summary (non-null counts and dtypes); every column is
# still `object`, i.e. coordinates and population are unparsed strings.
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        10 non-null     object
 1   country     10 non-null     object
 2   longitude   10 non-null     object
 3   latitude    10 non-null     object
 4   population  10 non-null     object
dtypes: object(5)
memory usage: 528.0+ bytes
