1. Install and import necessary packages

In [1]:
# Install a-world-of-countries (imported below as `awoc`) if it is not available yet.
# To do so, uncomment the next line of code. Inside Jupyter, `%pip install a-world-of-countries`
# is the safer variant because it installs into the environment of the running kernel.
#!pip install a-world-of-countries

In [2]:
import pandas as pd
import numpy as np
import awoc #import the a-world-of-countries (awoc) library



2. Load and check dataset

In [3]:
# Load the raw datasets as dataframes.

# low_memory=False makes pandas read the whole file before inferring column dtypes,
# so columns with mixed content get one consistent dtype instead of chunk-wise guesses
# (the recorded output of this cell echoes this line, presumably as a DtypeWarning).
df_manifesto = pd.read_csv('RAW_DATA/MPDataset_MPDS2023a.csv', low_memory=False)
df_gini = pd.read_csv('RAW_DATA/economic-inequality-gini-index.csv')
df_welfare = pd.read_csv('RAW_DATA/social_welfare_spending.csv')
df_dem_level = pd.read_csv('RAW_DATA/V-Dem-CY-Core-v14.csv')

# The last 7 lines of the GNI export are skipped. skipfooter is only supported by the
# python parser engine, so it is requested explicitly instead of relying on the
# automatic fallback from the C engine (which emits a ParserWarning).
df_gni = pd.read_csv(
    'RAW_DATA/GNI_34b2ae76-01f3-4a11-82d6-b890757be071_Series - Metadata.csv',
    skipfooter=7,
    engine='python',
)

  df_manifesto = pd.read_csv('RAW_DATA/MPDataset_MPDS2023a.csv')
  df_gni = pd.read_csv('RAW_DATA/GNI_34b2ae76-01f3-4a11-82d6-b890757be071_Series - Metadata.csv', skipfooter= 7)


3. Change datatypes where necessary

In [4]:
# Text and identifier columns of df_manifesto are read as object;
# cast each of them to the pandas string dtype.
columns_to_convert = [
    'countryname',
    'partyname',
    'partyabbrev',
    'candidatename',
    'corpusversion',
    'datasetversion',
    'id_perm',
]
df_manifesto = df_manifesto.astype({column: 'string' for column in columns_to_convert})

In [5]:
# Parse the election date (edate), stored as day/month/year text, into datetime values
df_manifesto = df_manifesto.assign(
    edate=pd.to_datetime(df_manifesto['edate'], format='%d/%m/%Y')
)

In [6]:
# Cast the two object columns of df_gini (Entity and Code) to the pandas string dtype
df_gini = df_gini.astype({'Entity': 'string', 'Code': 'string'})

4. Drop possible duplicate rows

In [7]:
# Remove exact duplicate rows from every loaded dataset before merging.
# df_gni is included as well: it is left-merged on (countryname, year) further down,
# so a duplicated GNI row would duplicate the matching manifesto rows.
df_manifesto = df_manifesto.drop_duplicates()
df_gini = df_gini.drop_duplicates()
df_welfare = df_welfare.drop_duplicates()
df_dem_level = df_dem_level.drop_duplicates()
df_gni = df_gni.drop_duplicates()

5. Combine manifesto and gini dataset into one

5.1 Drop unnecessary columns from df_gini

In [8]:
# Discard the 'Code' column of df_gini; the remaining columns are kept unchanged.
df_gini = df_gini.drop('Code', axis=1)

5.2 Relabel Entity and Year column in df_gini to match with df_manifesto

In [9]:
# Give df_gini the same key column names as df_manifesto so both can be merged on them.
df_gini = df_gini.rename({"Entity": "countryname", "Year": "year"}, axis='columns')

5.3 Extract year from edate in df_manifesto

In [10]:
# Derive the election year from the election date; it is the second merge key.
df_manifesto = df_manifesto.assign(year=df_manifesto['edate'].dt.year)

5.4 Merge dataframes based on 'countryname' and 'year' columns

In [11]:
# Left merge: every manifesto row is kept; rows without a matching
# (countryname, year) entry in df_gini get missing values.
df = df_manifesto.merge(df_gini, how='left', on=['countryname', 'year'])

6. Add welfare spending to dataframe

6.1 Reduce welfare dataframe to required columns only (country, year, welfare spending)

In [12]:
# Keep only the country, period and observed-value columns of the welfare data.
df_welfare = df_welfare.loc[:, ['Reference area', 'TIME_PERIOD', 'OBS_VALUE']]

6.2 Relabel columns in df_welfare to match with df

In [13]:
# Rename the welfare columns to the merge keys used in df and to a descriptive value name.
df_welfare = df_welfare.rename(
    columns={
        'Reference area': 'countryname',
        'TIME_PERIOD': 'year',
        'OBS_VALUE': 'welfare spending',
    }
)

6.3 Merge the datasets based on 'countryname' and 'year' columns

In [14]:
# Left merge: attach welfare spending to each row of df by (countryname, year);
# rows without a match keep a missing value.
df = df.merge(df_welfare, how='left', on=['countryname', 'year'])

7. Add democracy level to dataframe

7.1 Reduce democracy level dataframe to required columns only (country, year, electoral democracy index)

In [15]:
# Keep only the merge keys and the electoral democracy index.
df_dem_level = df_dem_level.loc[:, ['countryname', 'year', 'electoral_dem_ind']]

7.2 Merge the datasets based on 'countryname' and 'year' columns

In [16]:
# Left merge: attach the electoral democracy index to each row of df by (countryname, year).
df = df.merge(df_dem_level, how='left', on=['countryname', 'year'])

8. Add GNI (current US$) to dataframe

8.1 Reduce GNI dataframe to required columns only (countryname, year, gni)

In [17]:
# Keep only the country name, the time column and the GNI series.
df_gni = df_gni.loc[:, ['Country Name', 'Time', 'GNI (current US$) [NY.GNP.MKTP.CD]']]

8.2 Relabel columns in df_gni to match with df

In [18]:
# Rename the GNI columns to the merge keys used in df and shorten the series name to 'GNI'.
# NOTE(review): in the displayed tables the GNI values have varying numbers of decimals,
# which suggests the column may hold text rather than floats — confirm the dtype.
df_gni = df_gni.rename(
    columns={
        'Country Name': 'countryname',
        'Time': 'year',
        'GNI (current US$) [NY.GNP.MKTP.CD]': 'GNI',
    }
)

8.3 Merge the datasets based on 'countryname' and 'year' columns

In [19]:
# Left merge: attach GNI to each row of df by (countryname, year).
df = df.merge(df_gni, how='left', on=['countryname', 'year'])

9. Create a list of European countries and a dataframe containing only European countries

In [20]:
# Initialize AWOC class (from the a-world-of-countries package imported as awoc)
my_world = awoc.AWOC()

# Create list of European countries and print it
# NOTE(review): this list is later compared with the 'countryname' column via isin(),
# i.e. by exact string match, so a country spelled differently in the datasets would be
# filtered out silently — confirm that the spellings agree.
europe_countries = my_world.get_countries_list_of('Europe')
print(europe_countries)

['Albania', 'Andorra', 'Austria', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Faroe Islands', 'Finland', 'France', 'Germany', 'Gibraltar', 'Greece', 'Guernsey', 'Hungary', 'Iceland', 'Ireland', 'Isle of Man', 'Italy', 'Jersey', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Svalbard and Jan Mayen', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom', 'Vatican']


In [21]:
# Keep only the rows whose country name appears in the list of European countries.
europe_df = df.loc[df['countryname'].isin(europe_countries)]

display(europe_df)

Unnamed: 0,country,countryname,oecdmember,eumember,edate,date,party,partyname,partyabbrev,parfam,...,markeco,welfare,intpeace,datasetversion,id_perm,year,Gini coefficient,welfare spending,electoral_dem_ind,GNI
0,11,Sweden,0,0.0,1944-09-17,194409,11220,Communist Party of Sweden,SKP,20,...,1.900,0.000,1.900,2023a,JN1LZH,1944,,,,
1,11,Sweden,0,0.0,1944-09-17,194409,11320,Social Democratic Labour Party,SAP,30,...,2.200,33.400,5.600,2023a,CMR7F6,1944,,,,
2,11,Sweden,0,0.0,1944-09-17,194409,11420,People’s Party,FP,40,...,6.400,14.300,1.600,2023a,Z6OL6C,1944,,,,
3,11,Sweden,0,0.0,1944-09-17,194409,11620,Right Party,,60,...,22.800,10.600,0.000,2023a,YMKVN2,1944,,,,
4,11,Sweden,0,0.0,1944-09-17,194409,11810,Agrarian Party,,80,...,19.048,0.000,4.762,2023a,U4SCRD,1944,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4665,98,Ukraine,0,0.0,2019-07-21,201907,98350,Opposition Bloc,,30,...,1.887,13.208,12.264,2023a,ZHUFI7,2019,0.266195,,0.524,155795133939.49
4666,98,Ukraine,0,0.0,2019-07-21,201907,98440,Servant of the People,,40,...,3.191,4.255,0.000,2023a,RB6257,2019,0.266195,,0.524,155795133939.49
4667,98,Ukraine,0,0.0,2019-07-21,201907,98450,Voice,,40,...,2.410,3.614,9.639,2023a,JT5WL6,2019,0.266195,,0.524,155795133939.49
4668,98,Ukraine,0,0.0,2019-07-21,201907,98617,All-Ukrainian Union ‘Fatherland',,60,...,3.896,15.584,5.195,2023a,W96R9B,2019,0.266195,,0.524,155795133939.49


In [22]:
# Restrict the European rows to elections held from 2005 through 2020 (both years included).
europe_df_2005_2020 = europe_df.loc[europe_df['edate'].dt.year.between(2005, 2020)]
display(europe_df_2005_2020)

Unnamed: 0,country,countryname,oecdmember,eumember,edate,date,party,partyname,partyabbrev,parfam,...,markeco,welfare,intpeace,datasetversion,id_perm,year,Gini coefficient,welfare spending,electoral_dem_ind,GNI
106,11,Sweden,10,10.0,2006-09-17,200609,11110,Green Ecology Party,MP,10,...,0.000,29.798,1.515,2023a,4REHJD,2006,0.263525,26.408,0.915,434074299964.761
107,11,Sweden,10,10.0,2006-09-17,200609,11220,Left Party,V,20,...,0.000,24.679,1.028,2023a,PNMPGP,2006,0.263525,26.408,0.915,434074299964.761
108,11,Sweden,10,10.0,2006-09-17,200609,11320,Social Democratic Labour Party,SAP,30,...,1.527,29.517,1.781,2023a,K3SXQ7,2006,0.263525,26.408,0.915,434074299964.761
109,11,Sweden,10,10.0,2006-09-17,200609,11420,Liberal People’s Party,FP,40,...,4.298,16.694,0.000,2023a,VSD37H,2006,0.263525,26.408,0.915,434074299964.761
110,11,Sweden,10,10.0,2006-09-17,200609,11520,Christian Democrats,Kd,50,...,5.645,20.968,0.000,2023a,H41DX2,2006,0.263525,26.408,0.915,434074299964.761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4665,98,Ukraine,0,0.0,2019-07-21,201907,98350,Opposition Bloc,,30,...,1.887,13.208,12.264,2023a,ZHUFI7,2019,0.266195,,0.524,155795133939.49
4666,98,Ukraine,0,0.0,2019-07-21,201907,98440,Servant of the People,,40,...,3.191,4.255,0.000,2023a,RB6257,2019,0.266195,,0.524,155795133939.49
4667,98,Ukraine,0,0.0,2019-07-21,201907,98450,Voice,,40,...,2.410,3.614,9.639,2023a,JT5WL6,2019,0.266195,,0.524,155795133939.49
4668,98,Ukraine,0,0.0,2019-07-21,201907,98617,All-Ukrainian Union ‘Fatherland',,60,...,3.896,15.584,5.195,2023a,W96R9B,2019,0.266195,,0.524,155795133939.49


In [23]:
# Write the 2005-2020 European subset to CLEAN_DATA; the row index is not written.
europe_df_2005_2020.to_csv(
    path_or_buf='CLEAN_DATA/Data_europe_2005_2020.csv',
    index=False,
)

In [24]:
# Display df_dem_level for inspection (reduced earlier to countryname, year and electoral_dem_ind).
df_dem_level

Unnamed: 0,countryname,year,electoral_dem_ind
0,Sweden,2005,0.915
1,Sweden,2006,0.915
2,Sweden,2007,0.916
3,Sweden,2008,0.916
4,Sweden,2009,0.916
...,...,...,...
619,Hungary,2016,0.613
620,Hungary,2017,0.561
621,Hungary,2018,0.482
622,Hungary,2019,0.472


In [25]:
# Restrict the merged dataframe to elections held from 2005 through 2020 (both years
# included) and keep the result as a separate dataframe; the survey selection is drawn from it.
df_2005_2020 = df.loc[df['edate'].dt.year.between(2005, 2020)]

# Of those rows, retain only the ones belonging to European countries
europe_2005_2020_df = df_2005_2020.loc[df_2005_2020['countryname'].isin(europe_countries)]
display(europe_2005_2020_df)

Unnamed: 0,country,countryname,oecdmember,eumember,edate,date,party,partyname,partyabbrev,parfam,...,markeco,welfare,intpeace,datasetversion,id_perm,year,Gini coefficient,welfare spending,electoral_dem_ind,GNI
106,11,Sweden,10,10.0,2006-09-17,200609,11110,Green Ecology Party,MP,10,...,0.000,29.798,1.515,2023a,4REHJD,2006,0.263525,26.408,0.915,434074299964.761
107,11,Sweden,10,10.0,2006-09-17,200609,11220,Left Party,V,20,...,0.000,24.679,1.028,2023a,PNMPGP,2006,0.263525,26.408,0.915,434074299964.761
108,11,Sweden,10,10.0,2006-09-17,200609,11320,Social Democratic Labour Party,SAP,30,...,1.527,29.517,1.781,2023a,K3SXQ7,2006,0.263525,26.408,0.915,434074299964.761
109,11,Sweden,10,10.0,2006-09-17,200609,11420,Liberal People’s Party,FP,40,...,4.298,16.694,0.000,2023a,VSD37H,2006,0.263525,26.408,0.915,434074299964.761
110,11,Sweden,10,10.0,2006-09-17,200609,11520,Christian Democrats,Kd,50,...,5.645,20.968,0.000,2023a,H41DX2,2006,0.263525,26.408,0.915,434074299964.761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4665,98,Ukraine,0,0.0,2019-07-21,201907,98350,Opposition Bloc,,30,...,1.887,13.208,12.264,2023a,ZHUFI7,2019,0.266195,,0.524,155795133939.49
4666,98,Ukraine,0,0.0,2019-07-21,201907,98440,Servant of the People,,40,...,3.191,4.255,0.000,2023a,RB6257,2019,0.266195,,0.524,155795133939.49
4667,98,Ukraine,0,0.0,2019-07-21,201907,98450,Voice,,40,...,2.410,3.614,9.639,2023a,JT5WL6,2019,0.266195,,0.524,155795133939.49
4668,98,Ukraine,0,0.0,2019-07-21,201907,98617,All-Ukrainian Union ‘Fatherland',,60,...,3.896,15.584,5.195,2023a,W96R9B,2019,0.266195,,0.524,155795133939.49


In [26]:
# Draw 10 manifestos for the survey; the fixed random_state makes the draw repeatable.
survey = europe_2005_2020_df.sample(10, random_state=8)
display(survey)

Unnamed: 0,country,countryname,oecdmember,eumember,edate,date,party,partyname,partyabbrev,parfam,...,markeco,welfare,intpeace,datasetversion,id_perm,year,Gini coefficient,welfare spending,electoral_dem_ind,GNI
4601,97,Slovenia,10,10.0,2018-06-03,201806,97341,List of Marjan Šarec,LMŠ,40,...,0.699,13.287,0.0,2023a,88775P,2018,0.246334,21.325,0.832,53372127792.3945
4507,96,Slovakia,10,10.0,2006-06-17,200606,96523,Slovak Democratic and Christian Union - Democa...,SDKÚ-DS,50,...,2.821,7.732,0.209,2023a,DQOK57,2006,0.277095,,0.823,
3770,83,Estonia,10,10.0,2015-03-01,201503,83611,Pro Patria and Res Publica Union,IRL,60,...,4.048,17.401,0.142,2023a,IN1E1D,2015,0.326703,17.279,0.895,22387751107.0312
4505,96,Slovakia,10,10.0,2006-06-17,200606,96423,Direction-Social Democracy,Smer,30,...,0.22,31.429,0.0,2023a,SZX857,2006,0.277095,,0.823,
767,15,Iceland,10,0.0,2009-04-25,200904,15620,Independence Party,Sj,60,...,17.284,7.407,0.0,2023a,HBV6AV,2009,0.286564,18.491,0.885,10553993507.6805
263,12,Norway,10,0.0,2017-09-11,201709,12520,Christian People’s Party,KrF,50,...,0.531,22.027,1.423,2023a,DNU3AP,2017,0.269873,25.129,0.891,416379568809.188
4015,88,Lithuania,0,10.0,2016-10-09,201610,88820,Lithuanian Peasant and Green Union,LVŽS,80,...,2.086,23.108,0.0,2023a,H1SS57,2016,0.383912,15.573,0.824,41513241212.6196
3627,81,Croatia,0,20.0,2007-11-25,200711,81910,Independent Democratic Serbian Party,SDSS,90,...,0.0,2.74,0.0,2023a,2ZX536,2007,,,0.789,57926566622.372
1156,22,Netherlands,10,10.0,2017-03-15,201703,22321,DENK,DENK,30,...,0.802,26.002,2.176,2023a,HKW5ZY,2017,0.285358,16.649,0.879,824285426585.884
1651,33,Spain,10,10.0,2008-03-09,200803,33902,Basque Nationalist Party,PNV/EAJ,90,...,2.165,12.446,0.541,2023a,AMJBEN,2008,0.342216,22.326,0.885,1586564225114.84


In [27]:
# Filter the democracy data down to the rows used here (years 2005-2020, European
# countries) and replace the data file with it; the file is too large for git otherwise.
# NOTE: this overwrites the same file that the loading cell reads, so after this cell has
# run the unfiltered data can no longer be recovered from RAW_DATA.
df_dem_level = df_dem_level[(df_dem_level['year'] >= 2005) & (df_dem_level['year'] <= 2020)]
df_dem_level = df_dem_level[df_dem_level['countryname'].isin(europe_countries)]

# index=False: without it the row index is written as an extra unnamed column, which
# then comes back as a spurious column the next time the file is loaded.
df_dem_level.to_csv('RAW_DATA/V-Dem-CY-Core-v14.csv', index=False)