In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pymc3 as pm
import seaborn as sns
import theano.tensor as tt
import unidecode
import warnings

from sklearn import preprocessing
from scipy import stats
from typing import List, Set, Tuple

%matplotlib inline
# Apply seaborn's default styling to the matplotlib figures drawn in this notebook.
sns.set()
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Report the PyMC3 and pandas versions the notebook is running with.
print('Running on PyMC3 v{}'.format(pm.__version__), 'and Pandas v{}'.format(pd.__version__))



Running on PyMC3 v3.1 and Pandas v0.20.1


In [23]:
# Maps each political bloc ('farleft', 'left', 'green', 'center', 'right', 'farright')
# to the column labels that attribute_parties() folds into that bloc. attribute_parties()
# intersects these lists with a results frame's column names, so every entry must match
# a column label exactly (case included).
# The lower-case entries look like candidate surnames and the upper-case ones like
# nuance codes or list names -- presumably as they appear in the source workbooks.
# NOTE(review): 'mÃ©lenchon' and 'bovÃ©' look mis-encoded (likely UTF-8 text read as
# Latin-1). attribute_parties() tests for the same 'mÃ©lenchon' literal, so any
# correction has to be made in both places at once -- confirm the file encoding first.
AFFILIATIONS = {'farleft': ['arthaud', 'besancenot', 'buffet', 'gluckstein', 'hue', 'laguiller', 'mÃ©lenchon', 'poutou',
                           'LFG', 'FG', 'FI', 'BC-FG', 'LXG', 'LEXG', 'EXG', 'BC-EXG', 'LCOP', 'LPC', 'LCOM', 'COM', 'BC-COM', 
                            'LUTTE OUVRIERE ET LCR', 'LUTTE OUVRIERE', "BOUGE L'EUROPE", 'PARTI DES TRAVAILL.', 'PCF'], 
                'left': ['hamon', 'hollande', 'jospin', 'royal', 'LUG', 'LSOC', 'LPS', 'SOC', 'BC-SOC', 'BC-UG', 'LGA',
                        'CONSTRUISONS NOTRE EUROPE', 'EUROPE SOLIDAIRE'], 
                'green': ['bovÃ©', 'cohn-bendit', 'joly', 'mamere', 'voynet', 'LVEC', 'LVE', 'LVEG', 'LEC', 'VEC', 'BC-VEC', 'LECO', 'ECO',
                         "L'ECOLOGIE, LES VERTS", 'ECOLOGIE  CHOIX DE LA VIE', 'UNION DES ECOLOG.', 'GENERATION ECOLOGIE'], 
                'center': ['balladur', 'bayrou', 'macron', 'REM', 'LUC', 'LCMD', 'MDM', 'LUDF', 'UDF', 'UDFD', "AVEC L'EUROPE", 
                           'ENERGIE RADICALE', 'AUTRE EUROPE'], 
                'right': ['chirac', 'fillon', 'sarkozy', 'LR', 'LUMP', 'LUD', 'LMAJ', 'UDI', 'BC-UDI', 'UMP', 'BC-UMP', 'BC-UD', 'LDR',
                          "L'UNION POUR L'EUROPE", 'UDF-RPR', 'RPR', 'M-NC', 'M'],
                'farright': ['le pen', 'LFN', 'FN', 'FRN', 'BC-FN', 'FRONT NATIONAL']}

# Date of each election as a pandas Timestamp, keyed by '<type><year>'
# (dep, euro, leg, pres, reg). Some keys share a date: dep1998/reg1998 and
# dep2004/reg2004, so the date alone does not identify an election.
DATES_ELECTIONS = {
    election: pd.to_datetime(day)
    for election, day in [
        ('dep1992', '1992-03-22'),
        ('dep1994', '1994-03-20'),
        ('dep1998', '1998-03-15'),
        ('dep2001', '2001-03-11'),
        ('dep2004', '2004-03-21'),
        ('dep2008', '2008-03-09'),
        ('dep2011', '2011-03-20'),
        ('dep2015', '2015-03-22'),
        ('euro1994', '1994-06-12'),
        ('euro1999', '1999-06-13'),
        ('euro2004', '2004-06-13'),
        ('euro2009', '2009-06-07'),
        ('euro2014', '2014-05-25'),
        ('leg1993', '1993-03-21'),
        ('leg1997', '1997-05-25'),
        ('leg2002', '2002-06-09'),
        ('leg2007', '2007-06-10'),
        ('leg2012', '2012-06-10'),
        ('leg2017', '2017-06-11'),
        ('pres1974', '1974-05-05'),
        ('pres1988', '1988-04-24'),
        ('pres1995', '1995-04-23'),
        ('pres2002', '2002-04-21'),
        ('pres2007', '2007-04-22'),
        ('pres2012', '2012-04-22'),
        ('pres2017', '2017-04-23'),
        ('reg1998', '1998-03-15'),
        ('reg2004', '2004-03-21'),
        ('reg2010', '2010-03-14'),
        ('reg2015', '2015-12-06'),
    ]
}

# Root of the local project checkout. The default keeps the original author's
# absolute path; set the CONTESDEFAITS_ROOT environment variable to run the
# notebook on another machine without editing this cell.
_PROJECT_ROOT = os.environ.get('CONTESDEFAITS_ROOT', '/Users/alexandreandorra/repos/contesdefaits')

# Folder of the "euro" model's fundamentals.
FOND_FOLDER = _PROJECT_ROOT + '/modeles/euro/fondamentaux'
# Folder where the concatenated results workbook is written.
REG_FOLDER = _PROJECT_ROOT + '/modeles/fondamentaux'
# Folder holding the raw per-election result workbooks (and a 'preprocessed' subfolder).
RESULTS_FOLDER = REG_FOLDER + '/election_results_1st_round'

In [3]:
def uniformise(x: str) -> str:
    '''
    Normalise a department name so that dataframes from different elections can be merged on it.

    The name is lower-cased, accents are transliterated away, and hyphens, spaces and
    apostrophes are removed. Two known spelling variants are then mapped to one form.
    '''
    key = unidecode.unidecode(x.lower())  # transliterate accented characters

    for separator in ('-', ' ', "'"):
        key = key.replace(separator, '')

    # Variants of the same entity that appear under different names across files.
    aliases = (('francaisetablishorsdefrance', 'francaisdeletranger'),
               ('corsedusud', 'corsesud'))
    for variant, canonical in aliases:
        key = key.replace(variant, canonical)

    return key

# Import the dependent variable (election results)

### Présidentielles 2002, 2007 et 2012

In [3]:
def merge_presidentielles() -> pd.DataFrame:
    '''
    Parse the 2002, 2007 and 2012 presidential results and stack them in one frame.

    Returns the concatenation of the three parsed frames, restricted to one column per
    bloc in AFFILIATIONS plus 'other', and sorted by index.
    '''
    frames = [parse_pres(election) for election in ('pres2002', 'pres2007', 'pres2012')]
    bloc_columns = list(AFFILIATIONS.keys()) + ['other']

    return pd.concat(frames)[bloc_columns].sort_index()


def parse_pres(election: str) -> pd.DataFrame:
    '''
    Load one presidential election and aggregate its candidates by political bloc.

    `election` is the election key (e.g. 'pres2002'); it selects the workbook read by
    isolate_results() and labels the rows produced by attribute_parties().
    '''
    return attribute_parties(election, isolate_results(election))
    

def isolate_results(election: str) -> pd.DataFrame:
    '''
    Read a preprocessed presidential workbook and keep only the candidates' vote shares.

    The workbook 'preprocessed/<election>.xls' is read from RESULTS_FOLDER. The returned
    frame is indexed by normalised department name (index name 'departement') and has
    one column per candidate, holding the values of the '% Voix/Exp' columns. Candidate
    labels are taken from the first row of the 'Nom' columns, stripped and lower-cased.
    '''
    workbook = os.path.join(RESULTS_FOLDER, 'preprocessed/{}.xls'.format(election))
    raw = pd.read_excel(workbook, sheetname='DÃ©partements T1', decimal=",")

    first_row_names = raw.filter(like='Nom').iloc[0].tolist()
    candidate_labels = [name.strip().lower() for name in first_row_names]
    share_columns = raw.filter(like='% Voix/Exp').columns.tolist()
    department_keys = raw['LibellÃ© du dÃ©partement'].apply(uniformise).unique()

    shares = raw.set_index(department_keys)[share_columns]
    shares.index.name = 'departement'
    shares.columns = candidate_labels

    if election == 'pres2007':
        # Original author's note: dropped because the polls for this candidate are missing.
        shares = shares.drop('buffet', axis=1)

    return shares


# NOTE(review): uniformise is also defined in an earlier cell of this notebook; when the
# notebook is run top to bottom, this later definition is the one that stays in effect.
def uniformise(x: str) -> str:
    '''
    Normalise a department name so that dataframes from different elections can be merged on it.

    Lower-cases the name, transliterates accents, removes hyphens, spaces and apostrophes,
    then maps two known spelling variants to a single form.
    '''
    stripped = (unidecode.unidecode(x.lower())  # transliterate accented characters
                .replace('-', '')
                .replace(' ', '')
                .replace("'", ''))

    return (stripped
            .replace('francaisetablishorsdefrance', 'francaisdeletranger')
            .replace('corsedusud', 'corsesud'))


def attribute_parties(election: str, df: pd.DataFrame) -> pd.DataFrame:
    '''
    Collapse candidate / nuance columns into one column per political bloc.

    Parameters
    ----------
    election : key of the election (e.g. 'pres2002'); becomes the 'election' index level.
    df : one row per department, one column per candidate or nuance, values in percent.
         The frame passed in is left untouched.

    Returns
    -------
    A frame with one column per bloc of AFFILIATIONS that has at least one matching
    column in `df`, plus 'other' (100 minus the sum of the bloc columns). Rows are
    indexed by (original index, election) and sorted.

    Notes
    -----
    When one of the labels in `lfi_labels` is present, it alone represents its bloc and
    the other columns of that bloc fall into 'other' (original author's note: take only
    LFI for farleft, starting in 2012). If several of those labels are present at once
    they are summed, which avoids producing duplicate bloc columns.
    '''
    lfi_labels = ('mÃ©lenchon', 'FI', 'LFG', 'FG', 'BC-FG')

    # Work on a copy so that adding / dropping columns never alters the caller's frame.
    df = df.copy()

    for p in AFFILIATIONS.keys():
        # Which columns of this election belong to bloc `p`?
        intersection = list(set(df.columns) & set(AFFILIATIONS[p]))
        lfi_present = [label for label in lfi_labels if label in intersection]

        if len(lfi_present) == 1:
            df = df.rename(columns={lfi_present[0]: p})

        elif len(lfi_present) >= 2:
            # Renaming several columns to the same name would create duplicate columns.
            df[p] = df[lfi_present].sum(axis=1)
            df = df.drop(lfi_present, axis=1)

        elif len(intersection) >= 2:  # sum the columns of the same bloc, then drop them
            df[p] = df[intersection].sum(axis=1)
            df = df.drop(intersection, axis=1)

        elif len(intersection) == 1:  # single column for this bloc: just rename it
            df = df.rename(columns={intersection[0]: p})

    # Keep only the blocs present in this election; everything else ends up in 'other'.
    competing = list(set(AFFILIATIONS.keys()) & set(df.columns))
    df = df[competing].copy()  # explicit copy: the assignment below must not target a slice
    df['other'] = 100 - df.sum(axis=1)
    df = pd.concat([df], keys=[election], names=['election'])

    return df.swaplevel().sort_index()

In [4]:
# Build the combined results table for the presidential elections handled by
# merge_presidentielles(), then preview its first rows.
results_concat = merge_presidentielles()
results_concat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,farleft,left,green,center,right,farright,other
departement,election,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ain,pres2002,11.29,12.64,5.13,7.73,17.18,21.86,24.17
ain,pres2007,4.58,20.67,2.99,19.62,34.5,12.28,5.36
ain,pres2012,9.61,22.75,2.26,10.16,30.41,20.71,4.1
aisne,pres2002,17.55,15.36,3.58,5.34,18.82,21.22,18.13
aisne,pres2007,7.81,23.42,2.27,13.51,29.3,17.28,6.41


### Présidentielles 1995

In [5]:
# The 1995 workbook has its own layout: header on the 7th row, first 10 columns only,
# departments in the first column.
pres95_path = os.path.join(RESULTS_FOLDER, 'preprocessed/pres1995.xls')
pres95 = pd.read_excel(pres95_path, decimal=",", header=6, parse_cols=range(10), index_col=0)
pres95 = pres95.sort_index()

# Normalise department names, and keep only the part of each column label before the
# first comma, stripped and lower-cased.
pres95.index = [uniformise(name) for name in pres95.index]
pres95.columns = [label.strip().lower().split(',')[0] for label in pres95.columns]

# Aggregate by bloc and append to the table built so far.
pres95 = attribute_parties('pres1995', pres95)
bloc_columns = list(AFFILIATIONS.keys()) + ['other']
results_concat = pd.concat([results_concat, pres95])[bloc_columns].sort_index()
results_concat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,farleft,left,green,center,right,farright,other
departement,election,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ain,pres1995,11.04,20.5,3.91,19.8,19.23,19.86,5.66
ain,pres2002,11.29,12.64,5.13,7.73,17.18,21.86,24.17
ain,pres2007,4.58,20.67,2.99,19.62,34.5,12.28,5.36
ain,pres2012,9.61,22.75,2.26,10.16,30.41,20.71,4.1
aisne,pres1995,16.73,24.38,2.43,15.63,18.17,17.69,4.97


### Reste des élections

**Oneshot**: preprocessing reg1998: dénomination 'LMAJ' fluctuante selon les départements. Excel modifié en conséquence et enregistré.

In [14]:
# One-shot preprocessing of the 1998 regional results: the 'LMA' nuance does not
# designate the same side in every department, so it is relabelled row by row.
df = pd.read_excel(os.path.join(RESULTS_FOLDER, 'reg1998.xls'), sheetname='DÃ©partements', index_col=1,
                   decimal=",")
df.index = pd.Series(df.index).apply(lambda x: uniformise(x)).values
df.sort_index(inplace=True)

nuances_df = df.filter(like='Nuance')


def _resolve_lma(line: pd.Series) -> pd.Series:
    '''
    Relabel 'LMA' in one department's row of nuances.

    - only 'LPS' also present -> 'LMA' becomes 'LDR'
    - only 'LDR' also present -> 'LMA' becomes 'LPS'
    - both or neither present -> 'LMA' becomes 'CHECK PARTY MANUALLY'
    Rows without 'LMA' are returned unchanged.
    '''
    present = set(line.dropna().unique())
    if 'LMA' not in present:
        return line

    has_lps = 'LPS' in present
    has_ldr = 'LDR' in present
    if has_lps and not has_ldr:
        replacement = 'LDR'
    elif has_ldr and not has_lps:
        replacement = 'LPS'
    else:
        # Ambiguous. The previous if/elif chain tested membership of ('LPS' and 'LDR'),
        # which is just 'LDR', after the single-label branches, so this case was never
        # reached and such rows were silently relabelled.
        replacement = 'CHECK PARTY MANUALLY'

    # Missing values compare unequal to 'LMA' and are therefore kept as they are.
    return line.where(line != 'LMA', replacement)


# Build a new frame instead of writing into the rows yielded by iterrows(): pandas does
# not guarantee that such writes reach the underlying DataFrame.
nuances_df = nuances_df.apply(_resolve_lma, axis=1)

df[nuances_df.columns] = nuances_df
# Left disabled on purpose -- uncomment to save the corrected workbook:
#df.to_excel(os.path.join(FOND_FOLDER, 'reg98mod.xlsx'))

# Departments that still need a manual decision.
nuances_df[nuances_df.isin(['CHECK PARTY MANUALLY']).any(axis=1)]

#### Fonction générale

In [6]:
def isolate_nuances_and_supports(file: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Read one results workbook and split out its nuance columns and vote-share columns.

    `file` is a file name inside RESULTS_FOLDER; the sheet 'DÃ©partements T1' is read
    with its second column as index, which is then normalised with uniformise() and sorted.

    Returns (df, nuances_df, supports_df):
    - df: the full sheet, possibly with added '% Voix/Exp.<i>' columns;
    - nuances_df: copy of the 'Nuance' columns or, failing that, of the 'Nom' columns;
    - supports_df: copy of the '% Voix/Exp' columns.

    Prints a message and raises SystemExit when neither kind of label column is found.
    '''
    workbook = os.path.join(RESULTS_FOLDER, file)
    df = pd.read_excel(workbook, sheetname='DÃ©partements T1', index_col=1, decimal=",")
    df.index = [uniformise(name) for name in df.index]
    df = df.sort_index()

    has_nuance_codes = ('Nuance Liste' in df.columns) or ('Code Nuance' in df.columns)
    # Original author's note: name columns are used for presidential elections, euro1994 and euro1999.
    has_names = ('Nom' in df.columns) or ('Nom TÃªte de Liste' in df.columns)

    if has_nuance_codes:
        nuances_df = df.filter(like='Nuance').copy()

    elif has_names:
        if 'Nom' in df.columns:
            # Strip and lower-case the names so they can match the entries of AFFILIATIONS.
            name_columns = df.filter(like='Nom').columns
            df[name_columns] = df[name_columns].applymap(lambda name: str.lower(str.strip(name)))
        nuances_df = df.filter(like='Nom').copy()

    else:
        print("Couldn't affiliate nuances in nuances_df")
        raise SystemExit

    if '% Voix/Exp' not in df.columns:
        # No precomputed share: derive it from each 'Voix' column and the 'ExprimÃ©s' column
        # (original author's note: needed for euro1999, euro2004, leg1993).
        for i, votes_column in enumerate(df.filter(like='Voix').columns):
            df['% Voix/Exp.{}'.format(i)] = (df[votes_column] / df['ExprimÃ©s']) * 100
    supports_df = df.filter(like='% Voix/Exp').copy()

    return df, nuances_df, supports_df


def extract_nuances(nuances_df: pd.DataFrame) -> Set[str]:
    '''Return the set of distinct non-null labels found anywhere in `nuances_df`.'''
    return {label for label in nuances_df.values.ravel() if pd.notnull(label)}


def format_results(df, nuances_df, nuances_set, supports_df) -> pd.DataFrame:
    '''
    Pivot a "one column pair per list" sheet into one column per nuance.

    The columns of `nuances_df` and `supports_df` are paired by position. For every row
    of `df` (one department), the supports of columns carrying the same nuance are added
    together; a nuance of `nuances_set` that does not appear in the row gets NaN.

    Returns a frame indexed like `df` with one column per nuance of `nuances_set`.
    '''
    column_pairs = list(zip(nuances_df.columns, supports_df.columns))
    per_nuance = {nuance: [] for nuance in nuances_set}

    for _, row in df.iterrows():  # one row per department
        seen = set()

        for nuance_col, support_col in column_pairs:
            label = row[nuance_col]
            if pd.isnull(label):
                continue

            values = per_nuance[label]
            if label in seen:
                # Nuance already met in this row: accumulate onto the row's entry.
                values[-1] += row[support_col]
            else:
                values.append(row[support_col])
                seen.add(label)

        # Nuances that did not compete in this department.
        for absent in nuances_set - seen:
            per_nuance[absent].append(np.nan)

    return pd.DataFrame(index=df.index, data=per_nuance)

In [7]:
# Raw result workbooks: regular, non-hidden files of RESULTS_FOLDER, in a stable order.
# Hidden files such as .DS_Store are filtered by name: os.listdir() returns entries in
# arbitrary order, so dropping "the first one" could discard a real results file.
results_files = sorted(
    f for f in os.listdir(RESULTS_FOLDER)
    if not f.startswith('.') and os.path.isfile(os.path.join(RESULTS_FOLDER, f))
)

for f in results_files:
    print('Formatting {}...'.format(f))
    df, nuances_df, supports_df = isolate_nuances_and_supports(f)
    nuances_set = extract_nuances(nuances_df)
    results = format_results(df, nuances_df, nuances_set, supports_df)
    # The election key is the file name without its extension (e.g. 'dep1992').
    results = attribute_parties(f.split('.')[0], results)
    results_concat = pd.concat([results_concat, results])[list(AFFILIATIONS.keys()) + ['other']].sort_index().round(2)
    print('Concatenated {} to results_concat df'.format(f))

# NOTE(review): this cell appends to the existing results_concat, so running it twice
# in the same session duplicates rows -- re-run the cells above first.
results_concat.to_excel(os.path.join(REG_FOLDER, 'election_results_concat.xlsx'))
results_concat

Formatting dep1992.xls...
Concatenated dep1992.xls to results_concat df
Formatting dep1994.xls...
Concatenated dep1994.xls to results_concat df
Formatting dep1998.xls...
Concatenated dep1998.xls to results_concat df
Formatting dep2001.xls...
Concatenated dep2001.xls to results_concat df
Formatting dep2004.xls...
Concatenated dep2004.xls to results_concat df
Formatting dep2008.xls...
Concatenated dep2008.xls to results_concat df
Formatting dep2011.xls...
Concatenated dep2011.xls to results_concat df
Formatting dep2015.xlsx...
Concatenated dep2015.xlsx to results_concat df
Formatting euro1994.xls...
Concatenated euro1994.xls to results_concat df
Formatting euro1999.xls...
Concatenated euro1999.xls to results_concat df
Formatting euro2004.xls...
Concatenated euro2004.xls to results_concat df
Formatting euro2009.xls...
Concatenated euro2009.xls to results_concat df
Formatting euro2014.xlsx...
Concatenated euro2014.xlsx to results_concat df
Formatting leg1993.xls...
Concatenated leg1993.xls

Unnamed: 0_level_0,Unnamed: 1_level_0,farleft,left,green,center,right,farright,other
departement,election,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ain,dep1992,4.82,13.89,7.41,24.88,20.64,12.59,15.77
ain,dep1994,6.98,12.16,5.48,26.17,14.76,11.49,22.96
ain,dep1998,7.32,11.61,2.67,25.24,14.73,15.08,23.35
ain,dep2001,7.17,12.21,5.31,2.40,12.11,8.17,52.63
ain,dep2004,4.61,10.84,3.15,1.71,24.11,14.84,40.74
ain,dep2008,4.97,12.23,3.25,3.17,37.58,7.03,31.77
ain,dep2011,3.54,14.10,7.96,,13.60,16.49,44.31
ain,dep2015,,25.69,3.80,,36.79,27.85,5.87
ain,euro1994,6.75,13.39,5.52,25.48,26.56,12.11,10.19
ain,euro1999,9.42,20.80,9.91,9.58,11.94,5.86,32.50


In [104]:
# Reload the concatenated results from the folder they are written to (REG_FOLDER).
election_results = pd.read_excel(os.path.join(REG_FOLDER, 'election_results_concat.xlsx'))

# The department is only filled on the first row of each group: propagate it downwards.
# Columns are reassigned explicitly, because in-place calls on an attribute-accessed
# column are not guaranteed to update the frame.
election_results['departement'] = election_results['departement'].ffill()
# Replace election keys by their dates. Some elections share a date (e.g. dep1998 and
# reg1998), so the resulting index can contain duplicate (departement, date) pairs.
election_results['election'] = election_results['election'].replace(DATES_ELECTIONS)

election_results = election_results.set_index(['departement', 'election']).sort_index()
# NOTE(review): `metropole` is not defined in the cells shown here -- presumably a list of
# metropolitan department keys; it must exist before this cell is run.
election_results = election_results.loc[metropole] # restrict results to metropole
election_results

Unnamed: 0_level_0,Unnamed: 1_level_0,farleft,left,green,center,right,farright,other
departement,election,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ain,1992-03-22,4.82,13.89,7.41,24.88,20.64,12.59,15.77
ain,1993-03-21,6.47,11.82,5.53,25.60,22.74,14.83,13.01
ain,1994-03-20,6.98,12.16,5.48,26.17,14.76,11.49,22.96
ain,1994-06-12,6.75,13.39,5.52,25.48,26.56,12.11,10.19
ain,1995-04-23,11.04,20.50,3.91,19.80,19.23,19.86,5.66
ain,1997-05-25,9.24,20.48,9.80,19.99,14.86,18.80,6.83
ain,1998-03-15,7.32,11.61,2.67,25.24,14.73,15.08,23.35
ain,1998-03-15,4.26,31.48,,,33.72,18.60,11.94
ain,1999-06-13,9.42,20.80,9.91,9.58,11.94,5.86,32.50
ain,2001-03-11,7.17,12.21,5.31,2.40,12.11,8.17,52.63
