In [18]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
DATA_2019 = 'data/2019.csv'

# `names=` is passed without `header=`, so pandas reads the first line of the
# file as data and labels the columns with the list below.
election_2019_columns = [
    'community',
    'province',
    'election_year',
    'party',
    'votes',
    'percent',
    'seats',
    'iteration',
]

election_2019 = pd.read_csv(DATA_2019, names=election_2019_columns, encoding='utf-8')

In [20]:
# Preview the first 10 rows of the loaded frame.
election_2019.head(10)

Unnamed: 0,community,province,election_year,party,votes,percent,seats,iteration
0,,,2019,Censo,36898883,,,
1,,,2019,Votantes,26478140,71.8,,
2,,,2019,Nulos,276769,,,
3,,,2019,V&aacute;lidos,26201371,,,
4,,,2019,Blancos,199836,,,
5,,,2019,Partido Socialista Obrero Espa&ntilde;ol (PSOE),7513142,28.7,123.0,
6,,,2019,Partido Popular (PP),4373653,16.7,66.0,
7,,,2019,Ciudadanos-Partido de la Ciudadan&iacute;a (C's),4155665,15.9,57.0,
8,,,2019,Unidas Podemos (PODEMOS-IU-EQUO),3751145,14.3,42.0,
9,,,2019,Vox (VOX),2688092,10.3,24.0,


In [21]:
# Summarise each column's dtype and non-null count.
election_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1760 entries, 0 to 1759
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   community      1712 non-null   float64
 1   province       1426 non-null   float64
 2   election_year  1760 non-null   int64  
 3   party          1760 non-null   object 
 4   votes          1760 non-null   int64  
 5   percent        1256 non-null   float64
 6   seats          1130 non-null   float64
 7   iteration      890 non-null    object 
dtypes: float64(4), int64(2), object(2)
memory usage: 110.1+ KB


In [25]:
# --- Party names: decode HTML character entities -----------------------------
# Only the entities listed here are translated; any other `&...;` sequence in
# a party name is left exactly as it appears in the raw data.
accent_map = {'&aacute;': 'á', '&ntilde;': 'ñ', '&iacute;': 'í',
              '&oacute;': 'ó', '&Iacute;': 'Í', '&agrave;': 'à',
              '&Agrave;': 'À', '&Aacute;': 'Á', '&iexcl;': '¡',
              '&uacute;': 'ú', '&Uacute;': 'Ú', '&eacute;': 'é',
              '&Eacute;': 'É'}

party_names_cleaned = list(election_2019.party)
for entity, char in accent_map.items():
    # The keys are literal text, so a plain string replace is enough (no regex).
    party_names_cleaned = [name.replace(entity, char) for name in party_names_cleaned]

election_2019['party'] = party_names_cleaned

# --- Communities: code -> name -----------------------------------------------
# NaN values in the 'community' column represent totals for the entire country;
# they are coded as 0 so they can be mapped to 'Spain' below.
election_2019['community'] = election_2019['community'].fillna(0.0)

comm_map = {0: 'Spain', 1: 'Andalucía', 2: 'Aragón', 3: 'Asturias',
            4: 'Illes Balears', 5: 'Canarias', 6: 'Cantabria', 7: 'Castilla-La Mancha',
            8: 'Castilla y León', 9: 'Cataluña', 10: 'Extremadura', 11: 'Galicia',
            12: 'Madrid', 13: 'Navarra', 14: 'País Vasco', 15: 'Región de Murcia',
            16: 'La Rioja', 17: 'Comunidad Valenciana', 18: 'Ceuta', 19: 'Melilla'}

election_2019['community_name'] = election_2019['community'].map(comm_map)

# --- Election label: two elections were held in 2019 (April and November) ----
# Rows with no 'iteration' value are treated as the April election ('A');
# 'B' marks the November election. Rows with any other value keep their
# original 'election_year'.
#
# The guard keeps the cell re-runnable: once 'iteration' has been dropped,
# 'election_year' already carries the labels and this step is skipped.
if 'iteration' in election_2019.columns:
    iteration = election_2019['iteration'].fillna('A')

    # Cast before assigning: writing strings into an int64 column through
    # `.loc` relies on silent upcasting, which pandas has deprecated.
    election_2019['election_year'] = election_2019['election_year'].astype(object)
    election_2019.loc[iteration == 'A', 'election_year'] = 'april-2019'
    election_2019.loc[iteration == 'B', 'election_year'] = 'november-2019'

    # 'iteration' is now fully encoded in 'election_year'.
    election_2019 = election_2019.drop(columns='iteration')

# --- Split into one frame per election ----------------------------------------
april_rows = election_2019[election_2019['election_year'] == 'april-2019']
nov_rows = election_2019[election_2019['election_year'] == 'november-2019']

# Row labels of each election within `election_2019`.
april_index = april_rows.index
nov_index = nov_rows.index

april_2019 = april_rows.reset_index(drop=True)
nov_2019 = nov_rows.reset_index(drop=True)


In [26]:
# Display the April frame produced by the split above.
april_2019

Unnamed: 0,community,province,election_year,party,votes,percent,seats,community_name
0,0.0,,april-2019,Censo,36898883,,,Spain
1,0.0,,april-2019,Votantes,26478140,71.8,,Spain
2,0.0,,april-2019,Nulos,276769,,,Spain
3,0.0,,april-2019,Válidos,26201371,,,Spain
4,0.0,,april-2019,Blancos,199836,,,Spain
...,...,...,...,...,...,...,...,...
865,19.0,52.0,april-2019,Vox (VOX),5807,17.2,0.0,Melilla
866,19.0,52.0,april-2019,Ciudadanos-Partido de la Ciudadanía (C's),4351,12.9,0.0,Melilla
867,19.0,52.0,april-2019,Unidas Podemos (PODEMOS-IU-EQUO),1292,3.8,0.0,Melilla
868,19.0,52.0,april-2019,Recortes Cero-Grupo Verde,99,0.3,0.0,Melilla


In [27]:
# Labels in the 'party' column that mark aggregate rows (census, voters, null,
# blank and valid ballots) rather than results for a political party.
TOTAL_ROW_LABELS = ('Censo', 'Votantes', 'Nulos', 'Blancos', 'Válidos')


def get_parties_data(df, total_labels=TOTAL_ROW_LABELS):
    """Separate aggregate rows from per-party rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'party' column.
    total_labels : list-like of str, optional
        Values of 'party' that identify aggregate rows. Defaults to
        ``TOTAL_ROW_LABELS``.

    Returns
    -------
    election_totals : pandas.DataFrame
        Rows whose 'party' is one of ``total_labels``, with a fresh 0..n-1 index.
    party_totals : pandas.DataFrame
        All remaining rows, with a fresh 0..n-1 index.
    """
    # Build the mask once and negate it, so both halves are guaranteed to be
    # complementary.
    is_total_row = df['party'].isin(total_labels)

    election_totals = df[is_total_row].reset_index(drop=True)
    party_totals = df[~is_total_row].reset_index(drop=True)

    return election_totals, party_totals

# Split each election's frame into its aggregate rows and its per-party rows.
april_2019_totals, april_2019_parties = get_parties_data(df=april_2019)
nov_2019_totals, nov_2019_parties = get_parties_data(df=nov_2019)

In [28]:
# Display the per-party rows of the November election.
nov_2019_parties

Unnamed: 0,community,province,election_year,party,votes,percent,seats,community_name
0,0.0,,november-2019,Partido Socialista Obrero Español (PSOE),6792199,28.0,120.0,Spain
1,0.0,,november-2019,Partido Popular (PP),5047040,20.8,89.0,Spain
2,0.0,,november-2019,Vox (VOX),3656979,15.1,52.0,Spain
3,0.0,,november-2019,Unidas Podemos (PODEMOS-IU),3119364,12.9,35.0,Spain
4,0.0,,november-2019,Esquerra Republicana de Catalunya-Sobiranistes...,880734,3.6,13.0,Spain
...,...,...,...,...,...,...,...,...
570,19.0,52.0,november-2019,Partido Socialista Obrero Español (PSOE),5087,16.4,0.0,Melilla
571,19.0,52.0,november-2019,Ciudadanos-Partido de la Ciudadanía (C's),917,3.0,0.0,Melilla
572,19.0,52.0,november-2019,Unidas Podemos (PODEMOS-IU),809,2.6,0.0,Melilla
573,19.0,52.0,november-2019,Recortes Cero-Grupo Verde (RECORTES CERO-GV),19,0.1,0.0,Melilla


In [None]:
# filter the parties by the top-15
# Sums 'votes' per party, sorts descending and keeps the party names (the index)
# of the 15 largest sums.
# NOTE(review): `party_totals_2019` is not assigned in any cell visible above
# (the split cell creates `april_2019_parties` / `nov_2019_parties`) — confirm
# where it is defined, otherwise this raises NameError on Restart & Run All.
# NOTE(review): if that frame still holds national, community and province rows
# together, this sum counts the same votes at several levels — confirm intended.
top_parties_2019 = party_totals_2019.groupby('party')['votes'].sum().sort_values(ascending=False).head(15).index