In [25]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [26]:
webpage = 'https://es.wikipedia.org/wiki/Anexo:Municipios_de_Espa%C3%B1a_por_poblaci%C3%B3n'

req = Request(webpage, headers={'User-Agent': 'Mozilla/5.0'})
raw_web = urlopen(req, timeout=10).read()

In [27]:
soup = BeautifulSoup(raw_web, 'html.parser')
tables = soup.find_all('table')
titulos_tablas = soup.find_all('span', attrs={"class": "mw-headline"})

In [28]:
print('Títulos de tablas: ', len(tables))
print('Tablas: ', len(tables))

Títulos de tablas:  7
Tablas:  7


In [29]:
head = []
rows_raw = []
for table in tables:
    
    head = table.find_all('th') #We skip the first header as we saw we don't want it
    table_rows = table.find_all('tr')[1:]
    
    for fila in table_rows:
        cols = []
        for col in fila.find_all('td'):
            cols.append(col.text.strip())
            
        rows_raw.append(cols)
    

In [30]:
rows_raw

[['1',
  'Madrid',
  '3 280 782',
  'Madrid\xa0Madrid',
  'Comunidad de Madrid\xa0Comunidad de Madrid'],
 ['2',
  'Barcelona',
  '1 636 193',
  'Barcelona\xa0Barcelona',
  'Cataluña\xa0Cataluña'],
 ['3',
  'Valencia',
  '792 492',
  'ValenciaValencia',
  'Comunidad\xa0Valenciana\xa0Comunidad\xa0Valenciana'],
 ['4', 'Sevilla', '681 998', 'Sevilla\xa0Sevilla', 'Andalucía\xa0Andalucía'],
 ['5', 'Zaragoza', '673 010', 'Zaragoza\xa0Zaragoza', 'Aragón\xa0Aragón'],
 ['6', 'Málaga', '579 076', 'Málaga\xa0Málaga', 'Andalucía\xa0Andalucía'],
 ['7', 'Murcia', '462 979', 'Región de Murcia\xa0Región de Murcia'],
 ['8', 'Palma', '415 940', 'Islas Baleares\xa0Islas Baleares'],
 ['9',
  'Las Palmas de Gran Canaria',
  '378 797',
  'Las PalmasLas Palmas',
  'Canarias'],
 ['10', 'Bilbao', '344 127', 'Vizcaya\xa0Vizcaya', 'País Vasco\xa0País Vasco'],
 ['11',
  'Alicante',
  '338 577',
  'AlicanteAlicante',
  'Comunidad\xa0Valenciana\xa0Comunidad\xa0Valenciana'],
 ['12', 'Córdoba', '319 515', 'Córdoba\xa0

In [31]:
rows_raw[0]

['1',
 'Madrid',
 '3 280 782',
 'Madrid\xa0Madrid',
 'Comunidad de Madrid\xa0Comunidad de Madrid']

In [69]:
len(rows_raw)

1313

In [72]:
#Limpieza de cada fila

for row in rows_raw:
    
    if len(row) == 4:
        row.append(row[3])
        
    row[0] = int(row[0])+1000
    row[2] = int("".join(str(row[2]).split(" ")))
    row[3] = row[3].split("\xa0")[0]
    row[4] = row[4].split("\xa0")[0]
    
rows_raw[0]

[2001, 'Madrid', 3280782, 'Madrid', 'Comunidad de Madrid']

In [73]:
len(head)

5

In [36]:
for fila in rows_raw:
    
    if len(fila) != len(head):
        print('Cols', len(fila))
        print(fila)
        print('---------------')

In [37]:
all_data = pd.DataFrame(rows_raw, columns=['ID', 'City', 'Population', 'Region', 'State'])

In [38]:
all_data

Unnamed: 0,ID,City,Population,Region,State
0,1001,Madrid,3280782,Madrid,Comunidad de Madrid
1,1002,Barcelona,1636193,Barcelona,Cataluña
2,1003,Valencia,792492,ValenciaValencia,Comunidad
3,1004,Sevilla,681998,Sevilla,Andalucía
4,1005,Zaragoza,673010,Zaragoza,Aragón
...,...,...,...,...,...
1308,2309,Oliva de la Frontera,5010,Badajoz,Extremadura
1309,2310,Canyellas,5007,Barcelona,Cataluña
1310,2311,Vedra,5005,La CoruñaLa Coruña,Galicia
1311,2312,Mocejón,5004,Toledo,Castilla-La Mancha


In [39]:
# Generar la tabla State

state_id = 1000
states_raw = all_data['State'].drop_duplicates()

states_list = []

for state in states_raw:
    
    states_list.append({"id": state_id, "name": state, "country": "ES"})
    
    state_id += 1

states = pd.DataFrame(states_list, columns = states_list[0].keys())

states.head()

Unnamed: 0,id,name,country
0,1000,Comunidad de Madrid,ES
1,1001,Cataluña,ES
2,1002,Comunidad,ES
3,1003,Andalucía,ES
4,1004,Aragón,ES


In [58]:
states['id']

0     1000
1     1001
2     1002
3     1003
4     1004
5     1005
6     1006
7     1007
8     1008
9     1009
10    1010
11    1011
12    1012
13    1013
14    1014
15    1015
16    1016
17    1017
18    1018
Name: id, dtype: int64

In [51]:
states['name']

0        Comunidad de Madrid
1                   Cataluña
2                  Comunidad
3                  Andalucía
4                     Aragón
5           Región de Murcia
6             Islas Baleares
7                   Canarias
8                 País Vasco
9            Castilla y León
10                   Galicia
11    Principado de Asturias
12                   Navarra
13        Castilla-La Mancha
14                 Cantabria
15               Extremadura
16                  La Rioja
17                   Melilla
18                     Ceuta
Name: name, dtype: object

In [40]:
# Generar la tabla region

region_id = 1000
regions_raw = all_data[['Region','State']].drop_duplicates().reset_index(drop=True)

regions_list = []

for i in regions_raw.index:
    
    region = regions_raw.iloc[i].to_dict()
    
    state_id = states[states['name'] == region['State']].iloc[0]['id']
    
    regions_list.append({"id":region_id, "stateID": state_id, "name": region['Region']})
    
    region_id += 1
    
regions = pd.DataFrame(regions_list, columns=regions_list[0].keys())

regions.head()

Unnamed: 0,id,stateID,name
0,1000,1000,Madrid
1,1001,1001,Barcelona
2,1002,1002,ValenciaValencia
3,1003,1003,Sevilla
4,1004,1004,Zaragoza


In [41]:
# Generar la tabla poblaciones

cities_list = []

for ciudad in rows_raw:
    
    region_id = regions[regions['name'] == ciudad[3]].iloc[0]['id']
    
    cities_list.append({"id": ciudad[0], "regionID": region_id, "name": ciudad[1], "population": ciudad[2]})
    

cities = pd.DataFrame(cities_list, columns=cities_list[0].keys())

cities.head()

Unnamed: 0,id,regionID,name,population
0,1001,1000,Madrid,3280782
1,1002,1001,Barcelona,1636193
2,1003,1002,Valencia,792492
3,1004,1003,Sevilla,681998
4,1005,1004,Zaragoza,673010


Por último solo es necesario iterar sobre cada tabla con un INSERT INTO para agregar a las tablas SQL

In [52]:
import mysql.connector

In [53]:
db = mysql.connector.connect(
    host="localhost",
    user="root",
    password="Pepito.08"
)
cursor = db.cursor()

In [54]:
cursor.execute("CREATE DATABASE states")
cursor.execute("USE states")
cursor.execute("""
    CREATE TABLE estados (
    id int(5),
    name varchar(60),
    country char(2)
    );
""")

In [63]:
    cols = states.keys()
    print(cols)

Index(['id', 'name', 'country'], dtype='object')


In [64]:
table_name = 'estados'
for _, row in states.iterrows():
    values = tuple(row.values)
    query = f"INSERT INTO {table_name} (id, name, country) VALUES {values}"
    cursor.execute(query)
db.commit()

In [66]:
db.close()
cursor.close()

True