In [1]:
%load_ext watermark

import arviz as az
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import pymc3 as pm
import theano.tensor as tt
import warnings

from scipy.special import expit as logistic
from scipy.special import softmax

warnings.simplefilter(action="ignore", category=FutureWarning)
RANDOM_SEED = 8927
np.random.seed(286)

PARTIES = ["farleft", "left", "green", "center", "right", "farright", "other"]
PARTIES_AGG = [
    "farleft_agg",
    "left_agg",
    "green_agg",
    "center_agg",
    "right_agg",
    "farright_agg",
]
SPAN = 20
ALPHA = 2 / (SPAN + 1)

In [2]:
import bokeh.plotting as bkp

from bokeh.io import output_notebook, show
from bokeh.layouts import column, gridplot
from bokeh.models import ColorBar, GeoJSONDataSource, HoverTool, LinearColorMapper
from bokeh.models.annotations import Title
from bokeh.models.widgets import Select
from bokeh.palettes import brewer
from bokeh.transform import linear_cmap

output_notebook()
BINS = np.array([15., 25., 35., 45., 55., 65., 75.])
COLORS = {
    "farleft": np.array(brewer["Reds"][7][::-1]),
    "left": np.array(brewer["PuRd"][7][::-1]),
    "green": np.array(brewer["Greens"][7][::-1]),
    "center": np.array(brewer["Oranges"][7][::-1]),
    "right": np.array(brewer["Blues"][7][::-1]),
    "farright": np.array(brewer["Purples"][7][::-1]),
    "other": np.array(brewer["Greys"][7][::-1]),
}

In [3]:
az.style.use("arviz-darkgrid")


def stdz(series: pd.Series) -> pd.Series:
    """Standardize the given pandas Series"""
    return (series - series.mean()) / series.std()


def get_top_n(df: pd.DataFrame, nlargest: int = 3) -> pd.DataFrame:

    order = np.argsort(-df.values, axis=1)[:, :nlargest]
    # the "-" is a common trick to argsort in decreasing order

    return pd.DataFrame(
        df.columns[order].values,
        columns=[f"top{i}" for i in range(1, nlargest + 1)],
        index=df.index,
    )

Let's load the data and transform every missing values to 0: except for the "other" category, when a party is missing, this means that it got 0 votes, so it makes sense to replace NaNs by 0s. Note however that these zeros do not come from the multinomial process - unlike the zeros in the "other" category, but from an earlier process that determines if any given party competes in the election. We'll deal with that below, but let's already remark that this looks a lot like a zero-inflated process...

In [4]:
d = pd.read_excel("../data/results_by_districts_paris.xlsx", index_col=0)
# a party that wasn't there won 0 ballots:
d[PARTIES] = d[PARTIES].fillna(0).astype(int)

euro2019 = pd.read_excel("../data/raw_election_results_1st_round/euro2019-districts.xlsx").rename(columns={"district": "arrondissement"})
euro2019["date"] = pd.to_datetime("2019-05-25")
euro2019["ville"] = "Paris"
euro2019["type"] = "european"

d = pd.concat([d, euro2019], axis=0, sort=False).sort_values(["arrondissement", "date"]).reset_index(drop=True)

In [5]:
# keep track of which party was the incumbent for each election:
INCUMBENTS = {
    date: incumbent
    for date, incumbent in zip(
        d.date.unique(),
        [
            "right",
            "right",
            "left",
            "left",
            "left",
            "right",
            "right",
            "left",
            "right",
            "left",
            "left",
            "left",
            "farright"
        ],
    )
}
for date, incumbent in INCUMBENTS.items():
    d.loc[d.date == date, "incumbent"] = incumbent
right_inc = (d.incumbent == "right").astype(int).values
left_inc = (d.incumbent == "left").astype(int).values
d

Unnamed: 0,date,ville,arrondissement,N,farleft,left,green,center,right,farright,other,type,incumbent
0,2007-04-22,Paris,1,9152,239,2530,205,2051,3595,418,114,president,right
1,2007-06-10,Paris,1,6825,243,0,1969,818,3430,158,207,legislative,right
2,2008-03-09,Paris,1,6127,75,2289,439,531,2641,152,0,municipale,left
3,2009-06-07,Paris,1,5212,275,665,1493,419,1808,127,425,european,left
4,2010-03-14,Paris,1,4843,295,1077,1038,177,1758,261,237,regional,left
...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2014-05-25,Paris,20,49075,4974,10575,9498,4218,5994,4584,9232,european,right
256,2015-12-06,Paris,20,49130,6100,18315,7348,0,8340,4977,4050,regional,left
257,2017-04-23,Paris,20,89574,28512,12469,0,27399,11451,5305,4438,president,left
258,2017-06-11,Paris,20,57413,11546,10700,7766,6505,4300,2446,14150,legislative,left


In [6]:
TIMELINE = sorted(np.arange(np.min(d.date.dt.year.unique()), 2021), reverse=True)
year_weights = pd.DataFrame(
    index=TIMELINE,
    data=[(1 - ALPHA) ** i for i, _ in enumerate(TIMELINE)],
    columns=["year_weights"],
)

# add year_weights to results dataframe:
d.index = d.date.dt.year
d.index.name = "year"
d = d.join(year_weights).sort_values(["arrondissement", "date"]).reset_index(drop=True)

d["N"] = d[PARTIES].multiply(d.year_weights, axis="index").round().sum(1).astype(int)
d[PARTIES] = d[PARTIES].multiply(d.year_weights, axis="index").round().astype(int)
d

Unnamed: 0,date,ville,arrondissement,N,farleft,left,green,center,right,farright,other,type,incumbent,year_weights
0,2007-04-22,Paris,1,2492,65,689,56,558,979,114,31,president,right,0.272236
1,2007-06-10,Paris,1,1858,66,0,536,223,934,43,56,legislative,right,0.272236
2,2008-03-09,Paris,1,1845,23,689,132,160,795,46,0,municipale,left,0.300893
3,2009-06-07,Paris,1,1732,91,221,497,139,601,42,141,european,left,0.332566
4,2010-03-14,Paris,1,1780,108,396,382,65,646,96,87,regional,left,0.367573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2014-05-25,Paris,20,26919,2728,5801,5210,2314,3288,2514,5064,european,right,0.548537
256,2015-12-06,Paris,20,29785,3698,11104,4455,0,5056,3017,2455,regional,left,0.606278
257,2017-04-23,Paris,20,66342,21117,9235,0,20293,8481,3929,3287,president,left,0.740633
258,2017-06-11,Paris,20,42523,8551,7925,5752,4818,3185,1812,10480,legislative,left,0.740633


In [7]:
district_id, districts = d.arrondissement.factorize(sort=True)
Ndistricts = len(districts)

type_id, types = d.type.factorize(sort=True)
Ntypes = len(types)

Nparties = len(PARTIES) - 1
N = d.N.values
R_obs = d[PARTIES].values

# which parties are available for choice
# huge value for missing parties, so that probability is 0 after softmax:
parties_available = (
    d[PARTIES]
    .astype(bool)
    .astype(int)
    .iloc[:, :-1]
    .replace(to_replace=0, value=50_000)
    .values
)
parties_available

array([[    1,     1,     1,     1,     1,     1],
       [    1, 50000,     1,     1,     1,     1],
       [    1,     1,     1,     1,     1,     1],
       ...,
       [    1,     1, 50000,     1,     1,     1],
       [    1,     1,     1,     1,     1,     1],
       [    1,     1,     1,     1,     1,     1]])

Then we add a predictor to the dataset, which is the unemployment rate in Paris on a quarterly basis:

In [8]:
unemp = pd.read_excel(
    "../data/predictors/chomage-zone-demploi-2003-2019.xls",
    header=5,
    sheet_name="txcho_ze",
)
unemp = unemp[unemp["LIBZE2010"] == "Paris"].iloc[:, 4:].T
unemp.columns = ["unemployment"]

# as timestamps variables:
unemp.index = pd.PeriodIndex(start=unemp.index[0], periods=len(unemp), freq="Q")
unemp

Unnamed: 0,unemployment
2003Q1,8.4
2003Q2,8.7
2003Q3,8.6
2003Q4,9.0
2004Q1,9.2
...,...
2018Q3,7.7
2018Q4,7.4
2019Q1,7.4
2019Q2,7.2


Now ne need to import the poll aggregation for each election. This will be our baseline for each party in each district:

In [9]:
polls1 = pd.read_excel(
    "../data/polls_1st_round/aggregated_polls.xlsx", index_col=0
)
polls2 = pd.read_excel(
    "../data/polls_1st_round/agg_polls_euro2019.xlsx"
).iloc[-1].drop(["souv", "other"])
polls2.index = polls1.columns[1:]
polls2["type"] = "europeennes"

aggregated_polls = polls1.append(polls2, ignore_index=True)
aggregated_polls[PARTIES_AGG] = aggregated_polls[PARTIES_AGG].div(100)
aggregated_polls.round(2)

Unnamed: 0,type,dateelection,samplesize_agg,farleft_agg,left_agg,green_agg,center_agg,right_agg,farright_agg
0,president,2007-04-22,1513,0.03,0.24,0.01,0.19,0.27,0.14
1,legislatives,2007-06-10,916,0.07,0.28,0.04,0.11,0.41,0.06
2,municipale,2008-03-09,755,0.04,0.45,0.06,0.08,0.33,0.02
3,europeennes,2009-06-07,2287,0.13,0.2,0.13,0.11,0.27,0.06
4,regionales,2010-03-14,907,0.1,0.29,0.14,0.05,0.29,0.09
5,president,2012-04-22,1400,0.14,0.28,0.03,0.1,0.27,0.16
6,legislatives,2012-06-10,1193,0.08,0.32,0.05,0.03,0.34,0.15
7,municipale,2014-03-23,977,0.06,0.38,0.07,0.0,0.37,0.08
8,europeennes,2014-05-25,3248,0.08,0.17,0.09,0.1,0.21,0.23
9,regionales,2015-12-06,1749,0.05,0.23,0.06,0.0,0.28,0.29


As you probably noticed, the polls are in shares of support. Since our link function is softmax, we need to convert these proportions back on the real line (because the polls will be used before the link function appears in our model). It's [really easy to invert the softmax](https://math.stackexchange.com/questions/2786600/invert-the-softmax-function): we just have to take the log of each proportion and add an arbitrary constant -- here, we chose 1.

But hold on! There are two zeros in these data (for center party during the 2014 city-council and 2015 regional elections). It's due to the fact that this party was not running during these elections. When we'll take the logarithm of these zeros, we'll get minus infinity -- spoiler alert: our sampler won't like that. So we'll have to use a trick here.

Notice that the green party has a very low share of polls in the 2017 presidential elections. It's because there were some polls featuring the green party at the beginning of the race, but not anymore once the party dropped out. This is useful for us here: as a results, the poll aggregation is very near zero but not _exactly_ zero, so on the log scale it will give a very negative number but not minus infinity.

So here is our trick: let's replace the zeros for the center party by the value for the green party during the 2017 presidential election. This might seem like a cheap trick but it actually makes some conceptual sense: our model is concerned with the _latent_ support of each party's in the population. And in reality, neither the center nor the green parties were at _exactly_ zero percent support during these years -- they just had a very low support, which actually forced them to drop out and make alliances. So, replacing our perfect zeros with not-so-perfect zeros does make sense!

Here is how it looks in code:

In [10]:
aggregated_polls[PARTIES_AGG] = aggregated_polls[PARTIES_AGG].replace(
    to_replace=0,
    value=aggregated_polls.loc[
        aggregated_polls.dateelection == "2017-04-23", "green_agg"
    ].values[0],
)

# revert the softmax:
aggregated_polls[PARTIES_AGG] = aggregated_polls[PARTIES_AGG].apply(np.log) + 1
aggregated_polls.round(2)

Unnamed: 0,type,dateelection,samplesize_agg,farleft_agg,left_agg,green_agg,center_agg,right_agg,farright_agg
0,president,2007-04-22,1513,-2.63,-0.43,-3.95,-0.67,-0.3,-0.95
1,legislatives,2007-06-10,916,-1.61,-0.29,-2.3,-1.2,0.12,-1.76
2,municipale,2008-03-09,755,-2.33,0.19,-1.85,-1.51,-0.12,-2.82
3,europeennes,2009-06-07,2287,-1.04,-0.63,-1.01,-1.19,-0.3,-1.85
4,regionales,2010-03-14,907,-1.32,-0.24,-1.0,-2.06,-0.24,-1.4
5,president,2012-04-22,1400,-0.99,-0.29,-2.68,-1.27,-0.32,-0.84
6,legislatives,2012-06-10,1193,-1.53,-0.14,-1.98,-2.58,-0.09,-0.88
7,municipale,2014-03-23,977,-1.88,0.04,-1.73,-22.24,-0.01,-1.5
8,europeennes,2014-05-25,3248,-1.58,-0.79,-1.42,-1.33,-0.54,-0.46
9,regionales,2015-12-06,1749,-1.94,-0.47,-1.84,-22.24,-0.27,-0.24


And now we combine all district- and city-level predictors as well as polls and election results into the same dataframe:

In [11]:
# add quarters to results dataframe:
d.index = pd.DatetimeIndex(d["date"].values).to_period("Q")
d.index.name = "quarter"
# merge with city-level unemployment:
d = d.join(unemp).sort_values(["arrondissement", "date"]).reset_index(drop=True)

# join polling aggregation:
d = d.merge(
    aggregated_polls.drop("type", 1),
    left_on="date",
    right_on="dateelection",
    how="left",
).drop("dateelection", 1)
d

Unnamed: 0,date,ville,arrondissement,N,farleft,left,green,center,right,farright,...,incumbent,year_weights,unemployment,samplesize_agg,farleft_agg,left_agg,green_agg,center_agg,right_agg,farright_agg
0,2007-04-22,Paris,1,2492,65,689,56,558,979,114,...,right,0.272236,7.8,1513,-2.633513,-0.432471,-3.950292,-0.670963,-0.296018,-0.946153
1,2007-06-10,Paris,1,1858,66,0,536,223,934,43,...,right,0.272236,7.8,916,-1.613136,-0.288538,-2.295177,-1.196553,0.116238,-1.763649
2,2008-03-09,Paris,1,1845,23,689,132,160,795,46,...,left,0.300893,6.6,755,-2.327655,0.190572,-1.849138,-1.513482,-0.116553,-2.824834
3,2009-06-07,Paris,1,1732,91,221,497,139,601,42,...,left,0.332566,7.9,2287,-1.040641,-0.628886,-1.013539,-1.192074,-0.299655,-1.854871
4,2010-03-14,Paris,1,1780,108,396,382,65,646,96,...,left,0.367573,8.2,907,-1.319981,-0.240301,-0.996263,-2.061018,-0.240266,-1.398709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2014-05-25,Paris,20,26919,2728,5801,5210,2314,3288,2514,...,right,0.548537,8.8,3248,-1.582788,-0.790411,-1.420928,-1.334419,-0.537229,-0.460404
256,2015-12-06,Paris,20,29785,3698,11104,4455,0,5056,3017,...,left,0.606278,8.7,1749,-1.943570,-0.473766,-1.836909,-22.243938,-0.268955,-0.241968
257,2017-04-23,Paris,20,66342,21117,9235,0,20293,8481,3929,...,left,0.740633,8.0,1555,-0.667318,-1.565596,-22.243938,-0.437694,-0.640817,-0.493163
258,2017-06-11,Paris,20,42523,8551,7925,5752,4818,3185,1812,...,left,0.740633,8.0,1299,-1.144984,-1.528224,-2.510045,-0.191539,-0.560378,-0.747776


Ok, everything's ready, let's run our model! This a varying-effects model that takes into account not only the variation in slopes and intercepts, but also accounts for the *covariation between slopes and intercepts* -- it exploits additional information about the population in order to shrink in both dimensions.

#### Scrape 2020 polls for out-of-sample predictions:

In [155]:
CANDIDATES = {"Simonnet": "farleft", "Hidalgo": "left", "Belliard": "green", "Buzyn": "center", "Dati": "right", "Federbusch": "farright"}
MONTHS = {"janvier": 1, "février": 2, "mars": 3}
RIGHT_POLLSTER = {
    "Harris Interactive": "Harris",
    "Ifop-Fiducial": "Ifop",
    "Ipsos-Sopra Steria": "Ipsos",
}

In [166]:
raw_polls = pd.read_html("https://fr.wikipedia.org/wiki/%C3%89lections_municipales_de_2020_%C3%A0_Paris", 
                         attrs = {'class': 'wikitable centre'}, 
                         match="Date de réalisation",
                         decimal=",",
                         thousands=" ",
                         na_values="—")[0]
raw_polls.columns = raw_polls.columns.droplevel([0, 2])
raw_polls = raw_polls[~raw_polls.Source.str.contains("candidature | annonce | renonce")].drop(["Gantzer", "Villani", "Bournazel", "Campion", "Berkani", "Autres"], axis=1)

# clean polls' characteristics:
raw_polls = raw_polls.rename(columns={"Source": "pollster", "Date de réalisation": "date", "Échantillon": "N"})
raw_polls["pollster"] = raw_polls.pollster.replace(RIGHT_POLLSTER)
raw_polls["N"] = raw_polls["N"].str.split().str.join("").astype(int)

#compute median field date:
field = raw_polls["date"].str.split(expand=True)
field["day"] = field[[0, 2]].median(axis=1).apply(np.ceil).astype(int)
field["month"] = field[3].replace(MONTHS)
field["year"] = 2020
raw_polls["date"] = pd.to_datetime(field[["day", "month", "year"]])

# clean candidates' values:
raw_polls[list(CANDIDATES.keys())] = raw_polls[CANDIDATES.keys()].astype(float)
raw_polls["Buzyn"] = raw_polls[["Buzyn", "Griveaux"]].fillna(0).sum(axis=1)
raw_polls = raw_polls.drop("Griveaux", axis=1).rename(columns=CANDIDATES).sort_values("date").reset_index(drop=True)
raw_polls

Unnamed: 0,pollster,date,N,farleft,left,green,center,right,farright
0,Ifop,2020-01-15,1102,5.0,25.0,14.0,0.0,19.0,5.0
1,Ifop,2020-01-15,1102,5.0,25.0,14.0,0.0,17.0,5.0
2,Odoxa,2020-01-17,1005,8.0,24.0,13.0,0.0,18.0,5.0
3,Odoxa,2020-01-22,1002,4.0,23.0,14.5,0.0,20.0,6.0
4,Harris,2020-02-18,1092,6.0,23.0,13.0,17.0,23.0,5.0
5,Odoxa,2020-02-18,809,7.0,23.0,14.0,17.0,25.0,4.0
6,Ifop,2020-02-19,976,6.0,24.0,12.0,19.0,22.0,3.5
7,Ipsos,2020-02-19,1000,5.0,24.0,13.0,19.0,20.0,4.0
8,Ifop,2020-02-27,946,5.0,24.0,11.0,20.0,25.0,3.5


#### Aggregate those polls: