In [1]:
%load_ext lab_black

import numpy as np
import os
import pandas as pd

# Seed NumPy's global random generator so any later random draw is reproducible.
np.random.seed(286)

# Folder containing the raw election result files.
# NOTE(review): absolute, machine-specific path -- the notebook only runs as-is
# on the original author's machine.
RESULTS_FOLDER = "/Users/alexandreandorra/repos/contesdefaits/modeles/fondamentaux/"

# Election code -> voting date (as a pandas Timestamp).
# Written as (code, ISO date) pairs so the table stays easy to scan; the
# comprehension converts every date string with `pd.to_datetime`.
DATES_ELECTIONS = {
    code: pd.to_datetime(day)
    for code, day in (
        # "dep*" elections
        ("dep1992", "1992-03-22"),
        ("dep1994", "1994-03-20"),
        ("dep1998", "1998-03-15"),
        ("dep2001", "2001-03-11"),
        ("dep2004", "2004-03-21"),
        ("dep2008", "2008-03-09"),
        ("dep2011", "2011-03-20"),
        ("dep2015", "2015-03-22"),
        # "euro*" elections
        ("euro1994", "1994-06-12"),
        ("euro1999", "1999-06-13"),
        ("euro2004", "2004-06-13"),
        ("euro2009", "2009-06-07"),
        ("euro2014", "2014-05-25"),
        # "leg*" elections
        ("leg1993", "1993-03-21"),
        ("leg1997", "1997-05-25"),
        ("leg2002", "2002-06-09"),
        ("leg2007", "2007-06-10"),
        ("leg2012", "2012-06-10"),
        ("leg2017", "2017-06-11"),
        # "pres*" elections
        ("pres1974", "1974-05-05"),
        ("pres1988", "1988-04-24"),
        ("pres1995", "1995-04-23"),
        ("pres2002", "2002-04-21"),
        ("pres2007", "2007-04-22"),
        ("pres2012", "2012-04-22"),
        ("pres2017", "2017-04-23"),
        # "reg*" elections (note: reg1998/reg2004 share their date with dep1998/dep2004)
        ("reg1998", "1998-03-15"),
        ("reg2004", "2004-03-21"),
        ("reg2010", "2010-03-14"),
        ("reg2015", "2015-12-06"),
    )
}

Let's first load the predictors. They are on a daily scale to allow easy matching with election dates later:

In [2]:
# Load the daily predictors from the JSON file sitting next to the notebook
# (relative path), then preview the first rows.
predictors = pd.read_json("predictors_daily.json")
predictors.head()

Unnamed: 0,departement,date,climat_affaires,conf_menages,prix_gazole,inflation,pib,net_app,chomage
0,ain,1992-01-31,92.8,101,0.54,3.2,0.546012,-16.67,5.0
1,ain,1992-02-01,92.8,101,0.54,3.2,0.546012,-16.67,5.0
2,ain,1992-02-02,92.8,101,0.54,3.2,0.546012,-16.67,5.0
3,ain,1992-02-03,92.8,101,0.54,3.2,0.546012,-16.67,5.0
4,ain,1992-02-04,92.8,101,0.54,3.2,0.546012,-16.67,5.0


In [3]:
# Summary statistics of the numeric predictor columns, rounded to 2 decimals for display.
predictors.describe().round(2)

Unnamed: 0,climat_affaires,conf_menages,prix_gazole,inflation,pib,net_app,chomage
count,926208.0,926208.0,926208.0,926208.0,926208.0,926208.0,926208.0
mean,99.72,98.82,0.96,1.52,0.4,-12.97,8.78
std,10.33,10.06,0.28,0.8,0.47,26.16,2.04
min,68.2,80.0,0.53,0.0,-1.64,-67.48,3.7
25%,92.9,92.0,0.68,0.9,0.13,-30.49,7.4
50%,100.9,98.0,0.97,1.7,0.42,-13.59,8.6
75%,107.7,105.0,1.2,2.1,0.7,8.89,9.9
max,118.4,125.0,1.46,3.2,1.26,37.0,15.8


In [4]:
# The predictors only contain metropolitan departments, so their distinct
# department labels give us the list used to filter the election results.
metropole = predictors["departement"].unique().tolist()
len(metropole)

96

Now let's load the election results and restrict them to metropolitan departments:

In [None]:
# Read the concatenated election results from the results folder.
election_results = pd.read_excel(
    os.path.join(RESULTS_FOLDER, "election_results_concat.xlsx")
)
# Forward-fill missing department labels.
# (presumably the spreadsheet only writes the department on the first row of
# each group -- confirm against the file)
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# attribute-accessed column: that form is chained assignment, which is not
# guaranteed to modify `election_results` (and never does under pandas
# Copy-on-Write); `fillna(method=...)` is also deprecated in favour of `ffill()`.
election_results["departement"] = election_results["departement"].ffill()

# Index by department so the metropolitan subset can be selected by label.
election_results = election_results.set_index(["departement"]).sort_index()
election_results = election_results.loc[metropole]  # restrict results to metropole

For our first analysis, we're going to restrict ourselves to elections where *all* parties were competing:

In [5]:
# A value of exactly 0 is turned into NaN so that `dropna` can remove the row:
# this drops any election where at least one party is missing.
# `reset_index` then moves the `departement` index back into a regular column.
election_results = (
    election_results.replace(0, np.nan)
    .dropna()
    .reset_index()
)
election_results

Unnamed: 0,departement,election,farleft,left,green,center,right,farright,other
0,ain,dep1992,4.82,13.89,7.41,24.88,20.64,12.59,15.77
1,ain,dep1994,6.98,12.16,5.48,26.17,14.76,11.49,22.96
2,ain,dep1998,7.32,11.61,2.67,25.24,14.73,15.08,23.35
3,ain,dep2001,7.17,12.21,5.31,2.40,12.11,8.17,52.63
4,ain,dep2004,4.61,10.84,3.15,1.71,24.11,14.84,40.74
...,...,...,...,...,...,...,...,...,...
1884,yvelines,pres1995,11.26,21.36,3.41,19.57,25.95,13.78,4.67
1885,yvelines,pres2002,9.93,15.00,5.78,8.84,22.62,14.75,23.08
1886,yvelines,pres2007,3.68,23.11,2.51,21.86,37.66,7.34,3.84
1887,yvelines,pres2012,9.11,27.32,2.50,11.24,34.24,12.44,3.15


Let's add the exact election dates to match each voting day with the predictors:

In [6]:
# Translate each election code into its voting date.
# `map` (rather than `replace`) returns NaT for a code that is missing from
# DATES_ELECTIONS instead of silently leaving the raw string in the column,
# which would produce a mixed str/Timestamp "date" index.
election_dates = election_results["election"].map(DATES_ELECTIONS)

# Fail loudly, naming the offending codes, if any election has no known date.
unknown_elections = (
    election_results.loc[election_dates.isna(), "election"].unique().tolist()
)
if unknown_elections:
    raise KeyError(
        f"No date found in DATES_ELECTIONS for election(s): {unknown_elections}"
    )

election_results["date"] = election_dates

# Index by (departement, date) so results can be aligned with the predictors.
election_results = election_results.set_index(["departement", "date"]).sort_index()
election_results

Unnamed: 0_level_0,Unnamed: 1_level_0,election,farleft,left,green,center,right,farright,other
departement,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ain,1992-03-22,dep1992,4.82,13.89,7.41,24.88,20.64,12.59,15.77
ain,1993-03-21,leg1993,6.47,11.82,5.53,25.60,22.74,14.83,13.01
ain,1994-03-20,dep1994,6.98,12.16,5.48,26.17,14.76,11.49,22.96
ain,1994-06-12,euro1994,6.75,13.39,5.52,25.48,26.56,12.11,10.19
ain,1995-04-23,pres1995,11.04,20.50,3.91,19.80,19.23,19.86,5.66
...,...,...,...,...,...,...,...,...,...
yvelines,2009-06-07,euro2009,7.31,11.60,19.58,9.20,34.98,3.95,13.38
yvelines,2010-03-14,reg2010,6.44,22.81,16.70,4.22,33.74,8.83,7.26
yvelines,2012-04-22,pres2012,9.11,27.32,2.50,11.24,34.24,12.44,3.15
yvelines,2014-05-25,euro2014,4.13,11.95,7.66,13.58,24.95,17.78,19.95


And now we have to restrict our dataframe of predictors to only the election dates:

In [10]:
# Step 1: index the daily predictors by (departement, date) and sort them.
daily_predictors = predictors.set_index(["departement", "date"]).sort_index()

# Step 2: keep only the rows matching the election results' index, in the same
# order. `.loc` raises a KeyError if an election day has no predictor row, in
# which case `predictors` is left untouched.
predictors = daily_predictors.loc[election_results.index]

Finally, we just concatenate our two dataframes into one and do some checks before saving:

In [36]:
# Check alignment first: if the two indexes differ, report that directly rather
# than letting the concatenation fail (or silently misalign) on its own.
if not predictors.index.equals(election_results.index):
    raise ValueError("The indexes of predictors and election_results are not equal.")

# Side-by-side concatenation: predictor columns followed by result columns.
d = pd.concat([predictors, election_results], axis=1)

# Sanity checks before saving. The row-count check previously raised
# `ShapeError`, a name that is never defined, so it would have surfaced as a
# NameError; a ValueError carries the intended message instead.
if (d.shape[0] != election_results.shape[0]) or (d.shape[0] != predictors.shape[0]):
    raise ValueError(
        "Concatenation of results and predictors doesn't have same nbr of lines as either predictors or results."
    )
if d.isna().values.any():
    raise ValueError("Concatenation of results and predictors contains missing values")

# All checks passed: write the combined dataset, with the index restored as columns.
d.reset_index().to_json("full_funds.json")