In [1]:
import csv
import datetime
import io
import itertools

import requests

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [3]:
# Total number of seats; used to sanity-check the per-district seat counts
# and to derive the absolute-majority threshold.
N_TOTAL_DEPUTADOS = 230
# Number of most recent polls aggregated in the rolling window.
N_SONDAGENS = 5
# Error margin applied to each party, as a percentage of the total base votes.
ERRO_PCT = 1  # Pct of the total votes
# Party column names as they appear in the scraped poll table.
PARTIDOS = ['PS', 'PSD', 'B.E.', 'CDU', 'CDS–PP', 'PAN', 'CH', 'IL', 'L']
# Wikipedia page the poll table is scraped from.
WIKI_URL = "https://pt.wikipedia.org/wiki/Elei%C3%A7%C3%B5es_legislativas_portuguesas_de_2022"

In [4]:
# NOTE(review): `table_class` is not referenced by any cell visible here
# (get_sondagens looks up 'wikitable mw-datatable' instead); kept in case a
# later cell relies on it.
table_class = "wikitable sortable jquery-tablesorter"
# Always bound the request: without a timeout, requests can block forever
# on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Raise requests.HTTPError (with status and URL in the message) on a 4xx/5xx
# answer; unlike `assert`, this check is not stripped when running with -O.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
def get_sondagens(soup):
    """Extract the opinion-poll table from the parsed page as a DataFrame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML document; the first ``<table>`` matching the class
        ``wikitable mw-datatable`` is used.

    Returns
    -------
    pandas.DataFrame
        The poll table with the second header level dropped, restricted to
        rows whose 'Amostra' value is numeric, with 'Amostra' converted to a
        numeric dtype and dash placeholders replaced by NaN.

    Raises
    ------
    ValueError
        If no table with the expected class exists in ``soup``.
    """
    table = soup.find('table', {'class': 'wikitable mw-datatable'})
    if table is None:
        # Fail with an actionable message instead of an AttributeError on None
        raise ValueError(
            "No <table class='wikitable mw-datatable'> found in the page"
        )
    # Empty every <span> (seat projections) so only the percentages remain.
    # clear() also copes with spans that are empty or have several children,
    # for which `span.string` is None.
    for span in table.find_all('span'):
        span.clear()
    # Convert to a pandas DataFrame; literal HTML must be wrapped in a
    # file-like object (passing a raw string is deprecated in recent pandas)
    df = pd.read_html(io.StringIO(str(table)))[0]
    # Remove the bottom row of the two-level header
    df = df.droplevel(1, axis=1)
    # Keep only rows where 'Amostra' is a number; copy so the assignment
    # below writes to an independent frame rather than a view of the original
    is_numeric = pd.to_numeric(df['Amostra'], errors='coerce').notnull()
    df = df[is_numeric].copy()
    # Convert the 'Amostra' column to numeric
    df['Amostra'] = pd.to_numeric(df['Amostra'])
    # Replace em/en dash placeholders by NaN
    df = df.replace(["—", "–"], np.nan)
    return df

# Build the poll table from the downloaded page and display it
sondagens = get_sondagens(soup)
sondagens

Unnamed: 0,Empresa de sondagens,Data de amostragem,Amostra,Abstenção,PS,PSD,B.E.,CDU,CDS–PP,PAN,CH,IL,L,O.,V.
0,Pitagórica (diário),17-20 jan 2022,608.0,,34.6,33.5,4.9,4.5,1.2,1.6,6.3,6.3,1.6,5.3,1.1
1,Pitagórica (diário),16–19 jan 2022,608.0,,36.5,32.9,5.0,5.0,1.0,1.9,6.3,5.2,1.5,4.8,3.6
2,CESOP–UCP,12–18 Jan 2022,1456.0,,37,33,5,5,2,2,6,5,2,3,4
3,Pitagórica (diário),15–18 jan 2022,608.0,,38.7,30.4,5.1,5.3,1.1,1.7,7.2,4.7,1.1,4.7,8.3
4,Pitagórica (diário),14–17 jan 2022,608.0,,39.8,30.4,4.5,5.8,1.3,1.7,7.5,4.3,1.5,3.2,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,Eurosondagem,8–12 dez 2019,1019.0,,37.1,26.9,9.5,7.1,3.6,3.5,1.9,1.1,0.6,8.7,10.2
115,Intercampus,20–26 nov 2019,604.0,,34.9,24.9,10.8,8.1,2.9,4.8,4.8,2.9,2.7,3.2,10.0
116,Eurosondagem,17–21 nov 2019,1011.0,,36.9,27.1,10.1,6.9,4.0,3.3,,,,11.7,9.8
117,Aximage,8–11 nov 2019,639.0,,37.4,27.1,10.4,6.2,4.0,3.2,3.0,1.3,0.9,6.5,10.3


In [6]:
# Load the number of seats elected per district: each CSV row is
# "<district>,<number of seats>".
# newline="" lets the csv module do its own line-ending handling, as the
# csv documentation requires for files passed to csv.reader.
with open('numero_deputados.csv', encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file)
    # Skip blank rows (csv.reader yields [] for them), which would otherwise
    # break the two-value unpacking; malformed rows still raise.
    n_deps = {dist: int(n) for dist, n in (row for row in reader if row)}

# The district seat counts must add up to the size of the parliament
assert sum(n_deps.values()) == N_TOTAL_DEPUTADOS
n_deps

{'Lisboa': 48,
 'Porto': 40,
 'Braga': 19,
 'Setúbal': 18,
 'Aveiro': 16,
 'Leiria': 10,
 'Coimbra': 9,
 'Faro': 9,
 'Santarém': 9,
 'Viseu': 8,
 'Madeira': 6,
 'Viana do Castelo': 6,
 'Açores': 5,
 'Vila Real': 5,
 'Castelo Branco': 4,
 'Beja': 3,
 'Bragança': 3,
 'Évora': 3,
 'Guarda': 3,
 'Portalegre': 2,
 'Europa': 2,
 'Fora da Europa': 2}

In [7]:
# Helpers to parse the start and end dates of each poll's fieldwork period
month_map = {"jan": 1, "fev": 2, "mar": 3, "abr": 4, "mai": 5, "jun": 6,
             "jul": 7, "ago": 8, "set": 9, "out": 10, "nov": 11, "dez": 12}

# Separators accepted between the start and the end of a date range,
# tried in this order: en dash, hyphen, em dash.
_RANGE_SPLITTERS = ("–", "-", "—")


def _split_start_end(datas_sond):
    """Split a date-range string into ``(start_str, end_str)``.

    Always returns a 2-tuple of whitespace-stripped strings. Only the first
    separator is used for the split. When no separator is present the poll
    was carried out on a single day and both items are the same string.
    """
    for splitter in _RANGE_SPLITTERS:
        if splitter in datas_sond:
            start_str, end_str = datas_sond.split(splitter, 1)
            return start_str.strip(), end_str.strip()

    # No splitter: the poll was carried out on a single day
    single_day = datas_sond.strip()
    return single_day, single_day


def _get_month(mm):
    """Return the month number for a Portuguese month name or abbreviation.

    Only the first three letters are looked up (case-insensitively), so
    "jan", "Jan", "jan." and "janeiro" all resolve to 1.

    Raises KeyError for an unknown month.
    """
    return month_map[mm.lower()[:3]]


def _get_dt(dt_str):
    """Parse a full "<day> <month> <year>" string into a ``datetime.date``.

    Raises ValueError if the string does not have exactly three
    whitespace-separated parts or the day/year are not integers.
    """
    dd, mm, yy = dt_str.split()
    return datetime.date(year=int(yy), month=_get_month(mm), day=int(dd))


def get_start_sond(datas_sond):
    """Return the first fieldwork day of a poll as a ``datetime.date``.

    ``datas_sond`` is a date-range string such as "17-20 jan 2022",
    "28 dez – 3 jan 2022" or "30 nov 2021 – 2 dez 2021". Parts missing from
    the start (month and/or year) are taken from the end date.
    """
    start_str, end_str = _split_start_end(datas_sond)
    start_split = start_str.split()
    if len(start_split) == 3:
        # Start has day, month and year
        return _get_dt(start_str)

    end_dt = _get_dt(end_str)
    if len(start_split) == 2:
        # Start has day and month; the year comes from the end date
        dd, mm = start_split
        start_dt = datetime.date(year=end_dt.year, month=_get_month(mm), day=int(dd))
        if start_dt > end_dt:
            # The range straddles New Year (e.g. "28 dez – 3 jan 2022"),
            # so the start belongs to the previous year
            start_dt = start_dt.replace(year=end_dt.year - 1)
        return start_dt

    # Start has only the day; month and year come from the end date
    return datetime.date(year=end_dt.year, month=end_dt.month, day=int(start_split[0]))


def get_end_sond(datas_sond):
    """Return the last fieldwork day of a poll as a ``datetime.date``."""
    end_str = _split_start_end(datas_sond)[1]
    return _get_dt(end_str)

In [8]:
# Derive the first and last fieldwork day of every poll from its
# date-range text
datas_amostragem = sondagens["Data de amostragem"]
sondagens["Início"] = datas_amostragem.map(get_start_sond)
sondagens["Fim"] = datas_amostragem.map(get_end_sond)
sondagens

Unnamed: 0,Empresa de sondagens,Data de amostragem,Amostra,Abstenção,PS,PSD,B.E.,CDU,CDS–PP,PAN,CH,IL,L,O.,V.,Início,Fim
0,Pitagórica (diário),17-20 jan 2022,608.0,,34.6,33.5,4.9,4.5,1.2,1.6,6.3,6.3,1.6,5.3,1.1,2022-01-17,2022-01-20
1,Pitagórica (diário),16–19 jan 2022,608.0,,36.5,32.9,5.0,5.0,1.0,1.9,6.3,5.2,1.5,4.8,3.6,2022-01-16,2022-01-19
2,CESOP–UCP,12–18 Jan 2022,1456.0,,37,33,5,5,2,2,6,5,2,3,4,2022-01-12,2022-01-18
3,Pitagórica (diário),15–18 jan 2022,608.0,,38.7,30.4,5.1,5.3,1.1,1.7,7.2,4.7,1.1,4.7,8.3,2022-01-15,2022-01-18
4,Pitagórica (diário),14–17 jan 2022,608.0,,39.8,30.4,4.5,5.8,1.3,1.7,7.5,4.3,1.5,3.2,9.4,2022-01-14,2022-01-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,Eurosondagem,8–12 dez 2019,1019.0,,37.1,26.9,9.5,7.1,3.6,3.5,1.9,1.1,0.6,8.7,10.2,2019-12-08,2019-12-12
115,Intercampus,20–26 nov 2019,604.0,,34.9,24.9,10.8,8.1,2.9,4.8,4.8,2.9,2.7,3.2,10.0,2019-11-20,2019-11-26
116,Eurosondagem,17–21 nov 2019,1011.0,,36.9,27.1,10.1,6.9,4.0,3.3,,,,11.7,9.8,2019-11-17,2019-11-21
117,Aximage,8–11 nov 2019,639.0,,37.4,27.1,10.4,6.2,4.0,3.2,3.0,1.3,0.9,6.5,10.3,2019-11-08,2019-11-11


In [9]:
# Remove polls from the same company whose fieldwork periods overlap.
# For each company the polls are walked from newest to oldest: a poll is
# dropped when it ends on or after the start of the last poll that was kept.
EMPRESA = "Empresa de sondagens"
to_drop = []
for _, polls_emp in sondagens.groupby(EMPRESA):
    # Sort explicitly (newest first) instead of relying on the row order of
    # the scraped table; the greedy pass below is only correct in that order.
    polls_emp = polls_emp.sort_values(by=["Fim", "Início"], ascending=False)
    prev_inicio = None
    for idx, inicio, fim in zip(polls_emp.index, polls_emp["Início"], polls_emp["Fim"]):
        if prev_inicio is not None and fim >= prev_inicio:
            # Overlaps the more recent poll that was kept
            to_drop.append(idx)
        else:
            prev_inicio = inicio

sondagens = sondagens.drop(to_drop)

In [10]:
# Sort by the poll's end date (start date breaks ties)
sondagens = sondagens.sort_values(by=["Fim", "Início"])
# Fill each party's missing values with the previous poll's value (0 when
# there is none) and convert the columns to numeric
sondagens = sondagens.assign(**{
    partido: pd.to_numeric(sondagens[partido].ffill().fillna(0))
    for partido in PARTIDOS
})
# Show the last N polls
sondagens.tail(N_SONDAGENS)

Unnamed: 0,Empresa de sondagens,Data de amostragem,Amostra,Abstenção,PS,PSD,B.E.,CDU,CDS–PP,PAN,CH,IL,L,O.,V.,Início,Fim
11,CESOP–UCP,6–10 jan 2022,1246.0,,39.0,30.0,6.0,5.0,2.0,3.0,6.0,4.0,2.0,3.0,9.0,2022-01-06,2022-01-10
10,Aximage,6–12 jan 2022,807.0,,38.1,28.5,7.4,4.8,1.8,2.1,9.0,3.7,2.0,4.6,9.6,2022-01-06,2022-01-12
5,Pitagórica (diário),13–16 jan 2022,608.0,,40.1,28.8,5.9,5.9,0.6,1.5,8.0,5.0,1.3,2.9,11.3,2022-01-13,2022-01-16
2,CESOP–UCP,12–18 Jan 2022,1456.0,,37.0,33.0,5.0,5.0,2.0,2.0,6.0,5.0,2.0,3.0,4.0,2022-01-12,2022-01-18
0,Pitagórica (diário),17-20 jan 2022,608.0,,34.6,33.5,4.9,4.5,1.2,1.6,6.3,6.3,1.6,5.3,1.1,2022-01-17,2022-01-20


In [11]:
# Multiply each poll's percentages by its sample size so that polls with a
# larger sample carry more weight
sond_abs = sondagens[PARTIDOS].mul(sondagens["Amostra"], axis=0)
sond_abs.tail()

Unnamed: 0,PS,PSD,B.E.,CDU,CDS–PP,PAN,CH,IL,L
11,48594.0,37380.0,7476.0,6230.0,2492.0,3738.0,7476.0,4984.0,2492.0
10,30746.7,22999.5,5971.8,3873.6,1452.6,1694.7,7263.0,2985.9,1614.0
5,24380.8,17510.4,3587.2,3587.2,364.8,912.0,4864.0,3040.0,790.4
2,53872.0,48048.0,7280.0,7280.0,2912.0,2912.0,8736.0,7280.0,2912.0
0,21036.8,20368.0,2979.2,2736.0,729.6,972.8,3830.4,3830.4,972.8


In [12]:
# Sum the weighted values over a rolling window of N_SONDAGENS polls and
# drop the leading rows for which the window is still incomplete
sond_sum = (
    sond_abs
    .rolling(window=N_SONDAGENS)
    .sum()
    .dropna(how="all")
)
sond_sum

Unnamed: 0,PS,PSD,B.E.,CDU,CDS–PP,PAN,CH,IL,L
113,141591.4,102144.9,39523.2,27232.6,14677.6,15048.0,11295.3,5500.7,5358.0
112,140632.4,102739.9,39544.6,26882.8,14383.4,15543.4,13239.5,6471.9,4393.8
111,154305.8,112188.0,42595.0,29587.0,15362.4,17134.6,13544.5,6954.2,4424.7
109,137303.1,100760.1,39750.0,26448.9,12494.5,17512.3,14349.3,7063.6,4567.1
107,142623.5,108120.5,40426.8,27956.5,13942.9,17013.1,16250.1,6912.0,3736.3
...,...,...,...,...,...,...,...,...,...
11,175345.5,140946.5,28476.5,25814.0,8346.5,11740.5,27759.5,21282.0,5419.8
10,171854.2,136015.0,29943.3,24281.6,7997.1,11633.2,28715.5,20663.9,6313.0
5,149191.0,113909.4,26102.5,20440.8,5885.9,10069.2,27389.5,17513.9,5865.4
2,179303.0,143957.4,29542.5,24660.8,7897.9,11901.2,32705.5,21733.9,8177.4


In [13]:
# The last rolling window (the most recent polls) gives the base vote
# totals per party
ultima_janela = sond_sum.iloc[-1]
votos_base = ultima_janela.to_dict()
votos_base

{'PS': 178630.29999999987,
 'PSD': 146305.89999999997,
 'B.E.': 27294.199999999993,
 'CDU': 23706.79999999999,
 'CDS–PP': 7950.999999999996,
 'PAN': 10229.499999999989,
 'CH': 32169.400000000016,
 'IL': 22120.300000000003,
 'L': 8781.199999999997}

In [14]:
def dhont(n_seats, votes):
    """Allocate ``n_seats`` among parties with the D'Hondt highest-averages rule.

    Parameters
    ----------
    n_seats : int
        Number of seats to distribute.
    votes : dict
        Mapping of party -> number of votes. It is not modified.

    Returns
    -------
    dict
        Mapping of party -> seats won, with the same keys as ``votes``.
        When quotients tie, the party that comes first in ``votes`` wins.
    """
    # Current quotient of each party: votes / (seats already won + 1)
    quotients = dict(votes)
    seats = dict.fromkeys(votes, 0)
    for _ in range(n_seats):
        # max() returns the first key holding the highest quotient
        winner = max(quotients, key=quotients.get)
        seats[winner] += 1
        quotients[winner] = votes[winner] / (seats[winner] + 1)

    return seats

In [15]:
# Seat totals per party under three scenarios:
#   "exp" - base votes as they are;
#   "min" - only this party's votes lowered by the margin (floored at 0);
#   "max" - only this party's votes raised by the margin.
# The same base vote totals are applied to every district. Because each
# party's min/max is computed with only that party shifted, the min and max
# columns are not expected to add up to N_TOTAL_DEPUTADOS.
total = {p: {"exp": 0, "min": 0, "max": 0} for p in PARTIDOS}
votes_margin = sum(votos_base.values()) * ERRO_PCT / 100
for dist, n in n_deps.items():
    # Expected result: does not depend on the party, so compute it once per
    # district instead of once per party
    result_exp = dhont(n, votos_base)
    for p in votos_base:
        total[p]["exp"] += result_exp[p]
        # Pessimistic result
        votes_min = votos_base.copy()
        votes_min[p] = max(0, votes_min[p] - votes_margin)
        result_min = dhont(n, votes_min)
        total[p]["min"] += result_min[p]
        # Optimistic result
        votes_max = votos_base.copy()
        votes_max[p] += votes_margin
        result_max = dhont(n, votes_max)
        total[p]["max"] += result_max[p]

# Every seat must have been assigned in the expected scenario
assert sum(i["exp"] for i in total.values()) == N_TOTAL_DEPUTADOS
total

{'PS': {'exp': 109, 'min': 109, 'max': 110},
 'PSD': {'exp': 88, 'min': 86, 'max': 89},
 'B.E.': {'exp': 8, 'min': 7, 'max': 9},
 'CDU': {'exp': 7, 'min': 3, 'max': 8},
 'CDS–PP': {'exp': 0, 'min': 0, 'max': 2},
 'PAN': {'exp': 1, 'min': 0, 'max': 2},
 'CH': {'exp': 10, 'min': 8, 'max': 15},
 'IL': {'exp': 6, 'min': 3, 'max': 8},
 'L': {'exp': 1, 'min': 0, 'max': 2}}

In [16]:
# Smallest number of seats that is more than half of the total seats
min_deps = N_TOTAL_DEPUTADOS // 2 + 1

def get_solutions(n_deputados, min_deputados=None):
    """Find the minimal groups of parties that reach a seat threshold.

    Every combination of parties is sorted by seats (largest first) and cut
    at the first party that brings the running total to the threshold; the
    resulting prefix is recorded. Removing any party from such a prefix
    drops it below the threshold, so only minimal winning groups are kept.

    Parameters
    ----------
    n_deputados : dict
        Mapping of party -> number of seats.
    min_deputados : int, optional
        Number of seats a group must reach. Defaults to the module-level
        ``min_deps``.

    Returns
    -------
    set
        Set of tuples of ``(party, seats)`` pairs, each ordered by seats in
        descending order. Empty when no group reaches the threshold.
    """
    if min_deputados is None:
        min_deputados = min_deps

    solutions = set()
    total_list = list(n_deputados.items())
    for size in range(1, len(total_list) + 1):
        for subset in itertools.combinations(total_list, size):
            # sorted() is stable, so a given set of parties always yields
            # the same tuple and duplicates collapse in the set
            sorted_subset = sorted(subset, key=lambda x: x[1], reverse=True)
            s = 0
            for j, (_, seats) in enumerate(sorted_subset):
                s += seats
                if s >= min_deputados:
                    solutions.add(tuple(sorted_subset[:j + 1]))
                    break

    return solutions

In [17]:
# Majority combinations under the expected seat counts
get_solutions({partido: res["exp"] for partido, res in total.items()})

{(('PS', 109), ('B.E.', 8)),
 (('PS', 109), ('CDU', 7)),
 (('PS', 109), ('CH', 10)),
 (('PS', 109), ('IL', 6), ('L', 1)),
 (('PS', 109), ('IL', 6), ('PAN', 1)),
 (('PS', 109), ('PSD', 88)),
 (('PSD', 88), ('CH', 10), ('B.E.', 8), ('CDU', 7), ('IL', 6))}

In [18]:
# Majority combinations when every party gets its optimistic seat count
get_solutions({partido: res["max"] for partido, res in total.items()})

{(('PS', 110), ('B.E.', 9)),
 (('PS', 110), ('CDS–PP', 2), ('PAN', 2), ('L', 2)),
 (('PS', 110), ('CDU', 8)),
 (('PS', 110), ('CH', 15)),
 (('PS', 110), ('IL', 8)),
 (('PS', 110), ('PSD', 89)),
 (('PSD', 89), ('B.E.', 9), ('CDU', 8), ('IL', 8), ('CDS–PP', 2)),
 (('PSD', 89), ('B.E.', 9), ('CDU', 8), ('IL', 8), ('L', 2)),
 (('PSD', 89), ('B.E.', 9), ('CDU', 8), ('IL', 8), ('PAN', 2)),
 (('PSD', 89), ('CH', 15), ('B.E.', 9), ('CDS–PP', 2), ('L', 2)),
 (('PSD', 89), ('CH', 15), ('B.E.', 9), ('CDS–PP', 2), ('PAN', 2)),
 (('PSD', 89), ('CH', 15), ('B.E.', 9), ('CDU', 8)),
 (('PSD', 89), ('CH', 15), ('B.E.', 9), ('IL', 8)),
 (('PSD', 89), ('CH', 15), ('B.E.', 9), ('PAN', 2), ('L', 2)),
 (('PSD', 89), ('CH', 15), ('CDU', 8), ('CDS–PP', 2), ('L', 2)),
 (('PSD', 89), ('CH', 15), ('CDU', 8), ('CDS–PP', 2), ('PAN', 2)),
 (('PSD', 89), ('CH', 15), ('CDU', 8), ('IL', 8)),
 (('PSD', 89), ('CH', 15), ('CDU', 8), ('PAN', 2), ('L', 2)),
 (('PSD', 89), ('CH', 15), ('IL', 8), ('CDS–PP', 2), ('L', 2)),
 (

In [19]:
# Majority combinations restricted to the listed parties, using their
# optimistic seat counts
esquerda = ['PS', 'B.E.', 'CDU', 'PAN', 'L']
get_solutions({partido: total[partido]["max"] for partido in esquerda})

{(('PS', 110), ('B.E.', 9)), (('PS', 110), ('CDU', 8))}