In [29]:
 !pip install beautifulsoup4



In [30]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sqlalchemy
import datetime as dt
import re
import keys

In [31]:
def get_city_data(cities_list):
  cities_data = []
  for city in cities_list:
    wikipedia_url = "https://www.wikipedia.org/wiki/"
    response = requests.get(wikipedia_url + city)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    city_data = {
        'City': city,
        'Country': city_soup.find("th", string="Country").find_next().get_text(),
        'Longitude': city_soup.find(class_="longitude").get_text(),
        'Latitude': city_soup.find(class_="latitude").get_text()
    }
    cities_data.append(city_data)
  return pd.DataFrame(cities_data)

In [32]:
def get_population_data(cities_list):
  population_data = []
  for city in cities_list:
    wikipedia_url = "https://www.wikipedia.org/wiki/"
    response = requests.get(wikipedia_url + city)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    pop = city_soup.find(string="Population").find_next('td').get_text()
    int_pop = int(pop.replace(',',''))

    pop_data = {
        'City': city,
        'Population': int_pop,
        'Timestamp_': dt.datetime.now().strftime('%d/%m/%y')
    }
    population_data.append(pop_data)
  return pd.DataFrame(population_data)

In [37]:
schema = "sql_cities"
host = "127.0.0.1"
user = "root"
password = keys.connection_password
port = 3306

connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [39]:
cities_list = ['Berlin','Munich','Madrid']

In [41]:
cities_df = get_city_data(cities_list)
cities_df

Unnamed: 0,City,Country,Longitude,Latitude
0,Berlin,Germany,13°24′18″E,52°31′12″N
1,Munich,Germany,11°34′30″E,48°08′15″N
2,Madrid,Country,03°42′12″W,40°25′01″N


In [42]:
cities_df.to_sql('cities',
                  if_exists='append',
                  con=connection_string,
                  index=False)

3

In [45]:
cities_from_sql = pd.read_sql("cities", con=connection_string)
cities_from_sql

Unnamed: 0,city_id,City,Country,Longitude,Latitude
0,1,Berlin,Germany,13°24′18″E,52°31′12″N
1,2,Munich,Germany,11°34′30″E,48°08′15″N
2,3,Madrid,Country,03°42′12″W,40°25′01″N


In [47]:
pop_data_df = get_population_data(cities_from_sql['City'])
pop_data_df

Unnamed: 0,City,Population,Timestamp_
0,Berlin,3596999,28/01/25
1,Munich,1510378,28/01/25
2,Madrid,3223334,28/01/25


In [48]:
population_df = (
    pop_data_df
    .merge(cities_from_sql,
           on = "City",
           how = "inner")
    [['city_id','Population','Timestamp_']]
)
population_df

Unnamed: 0,city_id,Population,Timestamp_
0,1,3596999,28/01/25
1,2,1510378,28/01/25
2,3,3223334,28/01/25


In [51]:
population_df.to_sql('population',
                  if_exists='append',
                  con=connection_string,
                  index=False)

3

In [53]:
population_df = pd.read_sql("population", con=connection_string)
population_df

Unnamed: 0,population_id,city_id,Population,Timestamp_
0,1,1,3596999,2028-01-25
1,2,2,1510378,2028-01-25
2,3,3,3223334,2028-01-25
