In [48]:
import pandas as pd
import numpy as np
import re
import operator

In [49]:
# Relative path of the raw CSV that is loaded with pd.read_csv in the next cell.
worried_data_path = '../Data/Raw/worried_data.csv'

In [50]:
# Map the exported year headers (e.g. '2011 [YR2011]') to plain integer column labels.
year_column_labels = {
    '2011 [YR2011]': 2011,
    '2014 [YR2014]': 2014,
    '2017 [YR2017]': 2017,
    '2021 [YR2021]': 2021,
    '2022 [YR2022]': 2022,
}
worried_data = pd.read_csv(worried_data_path).rename(columns = year_column_labels)
# Rows without a country name or a series code are the data-description rows
# that the Findex Database export appends; remove them.
worried_data.dropna(subset = ['Country Name', 'Series Code'], inplace = True)

In [51]:
# Build a lookup from each series code to its (expected unique) series name.
series_codes = worried_data["Series Code"].unique()
series_desc = {}
for code in series_codes:
    # all distinct names recorded for this code
    names_for_code = worried_data.loc[worried_data["Series Code"] == code, "Series Name"].unique()
    name_count = len(names_for_code)
    if name_count == 1:
        series_desc[code] = names_for_code.item()
    elif name_count == 0:
        print("No series name for", code)
    else:
        # more than one name for the same code: the code is left out of series_desc
        print("Something went wrong")

In [52]:
# For the purpose of documenting the data used in the README
# One "| code | description |" line per series, in the current order of series_desc.
# The rows are built before the file is opened so that a failure while formatting
# does not leave a truncated file behind.
readme_rows = [" ".join(["|", key, "|", description, "|"]) for key, description in series_desc.items()]
# The encoding is pinned so the output does not depend on the platform's locale
# (the default encoding can fail on non-ASCII characters on some systems).
with open("../Data/Processed/worried_desc.txt", "w", encoding = "utf-8") as worried_data_desc:
    worried_data_desc.write("\n".join(readme_rows))

In [53]:
# The data is sorted according to the specific series, so the relevant series are grouped together for the purpose of clarity
# Each description is split into three parts:
#   1. everything before the last colon that still lets the rest match,
#   2. the text after that colon up to the first '(' or ',',
#   3. the remainder starting at that '(' or ','.
# The parts are re-joined as "<1> <3> <2>", and the dict is then ordered by the new text.
series_pattern = re.compile('(.*):(.*?)([(,].*)')
for key, description in series_desc.items():
    m = series_pattern.search(description)
    if m is None:
        # A description without the expected "...: ... (..." layout (for example one
        # that no longer contains a colon because this cell already rewrote it)
        # is reported and kept as is instead of raising AttributeError.
        print("Unexpected series description format for", key)
        continue
    prefix, answer, qualifier = m.groups()
    # only values of existing keys are replaced, so iterating over items() stays valid
    series_desc[key] = " ".join([prefix.strip(), qualifier.strip(" ,"), answer.strip()])
series_desc = dict(sorted(series_desc.items(), key = operator.itemgetter(1)))

In [54]:
# Only the 2021 and 2022 columns carry values; 2011, 2014 and 2017 are entirely NaN.
# Every year column is first coerced to numbers (anything unparsable becomes NaN).
value_columns = [2011, 2014, 2017, 2021, 2022]
numeric_values = worried_data[value_columns].apply(pd.to_numeric, errors = 'coerce')
worried_data[value_columns] = numeric_values
worried_data.drop(columns = ['Country Code', 'Series Name', 2011, 2014, 2017], inplace = True)
# Take the two remaining year columns out of the frame and merge them into a single
# 'percentage' column: the 2021 value where present, otherwise the 2022 value.
values_2021 = worried_data.pop(2021)
values_2022 = worried_data.pop(2022)
worried_data['percentage'] = values_2021.fillna(values_2022)

In [55]:
# Names (as they appear in the 'Country Name' column) that are treated as individual
# countries/economies in the cells below. Stored as a NumPy array of strings.
countries = np.array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain',
       'Bangladesh', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cyprus', 'Czechia',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador',
       'Egypt, Arab Rep.', 'El Salvador', 'Estonia', 'Eswatini',
       'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia, The', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Haiti',
       'Honduras', 'Hong Kong SAR, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran, Islamic Rep.', 'Iraq', 'Ireland', 'Israel',
       'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya',
       'Korea, Rep.', 'Kosovo', 'Kuwait', 'Kyrgyz Republic', 'Lao PDR',
       'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Lithuania',
       'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',
       'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Mexico', 'Moldova',
       'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar',
       'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua',
       'Niger', 'Nigeria', 'North Macedonia', 'Norway', 'Oman',
       'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland',
       'Portugal', 'Puerto Rico', 'Qatar', 'Romania',
       'Russian Federation', 'Rwanda', 'Saudi Arabia', 'Senegal',
       'Serbia', 'Sierra Leone', 'Singapore', 'Slovak Republic',
       'Slovenia', 'Somalia', 'South Africa', 'South Sudan', 'Spain',
       'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland',
       'Syrian Arab Republic', 'Taiwan, China', 'Tajikistan', 'Tanzania',
       'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkiye',
       'Turkmenistan', 'Uganda', 'Ukraine', 'United Arab Emirates',
       'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan',
       'Venezuela, RB', 'Vietnam', 'West Bank and Gaza', 'Yemen, Rep.',
       'Zambia', 'Zimbabwe'])
# Names in the same column that denote regional or income-group aggregates
# (e.g. 'World', 'High income') rather than single countries. Plain Python list.
not_countries = ['Arab World', 'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)', 'High income',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & North Africa (excluding high income)',
       'Middle income', 'North America', 'OECD members', 'South Asia',
       'Sub-Saharan Africa', 'Sub-Saharan Africa (excluding high income)',
       'Upper middle income', 'World']

In the Findex database, values are provided in columns corresponding to the years of the survey waves. As of 2024, the data on what the population is worried about is available only for the latest wave (years 2021 and 2022). The data is pivoted: it is indexed by the name of the country, and the columns represent series. For each value, the 2021 figure is used where available; otherwise the 2022 figure is used.

In [56]:
# Pivot to one row per country/aggregate and one column per series code.
# A single pivot over all requested names replaces the two identical per-name loops.
ordered_names = list(countries) + list(not_countries)
worried_data_pivoted = worried_data[worried_data["Country Name"].isin(ordered_names)].pivot(
    index = 'Country Name', columns = 'Series Code', values = 'percentage')
# Rows follow the order of `countries`, then `not_countries`; names that have no
# rows in `worried_data` are skipped.
present_names = [name for name in ordered_names if name in worried_data_pivoted.index]
worried_data_pivoted = worried_data_pivoted.loc[present_names]
# Columns follow the order of `series_desc` (raises KeyError if a code is missing).
ordered_worried_data_pivoted = worried_data_pivoted[list(series_desc)].copy()

# some countries have no data
# drop countries with all missing values
ordered_worried_data_pivoted = ordered_worried_data_pivoted.dropna(axis = 'index', how = 'all')
# some of the columns/series have less than 50% of data present (rural/urban series)
# keep only columns where at least 50% of the rows have a value.
# With axis='columns', `thresh` is the minimum number of non-missing values a column
# needs, so it has to be derived from the number of rows, not the number of columns.
min_present_rows = int(np.ceil(0.5 * len(ordered_worried_data_pivoted.index)))
ordered_worried_data_pivoted = ordered_worried_data_pivoted.dropna(axis = 'columns', thresh = min_present_rows)

In [65]:
# Keep only the rows whose index label appears in `countries`
# (this leaves out the aggregate rows), then print a summary of the result.
is_country_row = ordered_worried_data_pivoted.index.isin(countries)
countries_data = ordered_worried_data_pivoted.loc[is_country_row]
countries_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 139 entries, Afghanistan to Zimbabwe
Columns: 132 entries, fin44b3.d to fin44a1.d.3
dtypes: float64(132)
memory usage: 144.4+ KB


There are 139 countries, each with a 132-element feature vector.