In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

In [2]:
def change_countries(x, countries):
    '''makes countries consistent across dataframes'''
    for country,variations in countries.items():
        for variation in variations.values():
            if x == variation:
                return country 
    return x

### Box Plot of democracy index data

In [3]:
# read in democracy data
dem_idx = pd.read_csv('../data/democracy_index_data.csv')

In [4]:
# create box plot
fig = px.box(dem_idx, 
            y="five_year_mean",
            color="region",
            template='plotly_white',    
            color_discrete_sequence=px.colors.qualitative.Prism,
            title= 'Box Plots of the Democracy Index Score by Region',
            width=800,
            height=400,
            )
fig.update_yaxes(title_text='Democracy Index Score (5 year mean)')
fig.show()

### Histogram of women data

In addition to having the percent women in parliments for each country for each year, it is helpful to have the data for only 2022. Addtionally, this 2022 dataframe will be converted from wide to long, with a column indicating whether the data is from the lower or upper house.  

In [5]:
women = pd.read_csv('../data/world_data_final.csv')

In [6]:
# convert to datetime
women['lower_single_house_elections'] = pd.to_datetime(women['lower_single_house_elections'])
women['upper_house_senate_elections_year'] = pd.to_datetime(women['upper_house_senate_elections'])

# get 2022 data
women2022 = women[(women['lower_single_house_elections'].dt.year== 2022)|(women['upper_house_senate_elections_year'].dt.year== 2022)]

# melt dataframe 
women_melted = women2022.reset_index().melt(id_vars=['country'],value_vars=['lower_single_house_percent_w','upper_house_senate_percent_w'],value_name='Percent Women',var_name='House')

# convert to percents
women_melted['Percent Women'] = round(women_melted.loc[:,'Percent Women']*100,2).astype(float)

In [8]:
# make histogram
fig = px.histogram(women_melted, x='Percent Women',
                    facet_col='House',
                    nbins=10,
                    width=800,
                    height=400,
                    template='plotly_white',    
                    color_discrete_sequence=[px.colors.qualitative.Prism[1],px.colors.qualitative.Prism[2]],
                    title= 'Histogram of the Percent Women in Parliaments in Countries across the World in 2022',)

facet_titles = ['Lower Houses','Upper Houses']
for i, a in enumerate(fig.layout.annotations):
    a.text = facet_titles[i]

fig.show()

### Correlation between percent women in parliaments and democracy index

#### Joining the two datasets together

In order to run the correlation and produce some of the visualzations, the datasets need to be joined together into one dataframe.

In [9]:
# read in democracy index data
democracy_index = pd.read_csv('../data/democracy_index_data.csv')
# read in the percent women in parliments data
women_in_parliments =  pd.read_csv('../data/world_data_final.csv')

In [10]:
# get a list of years to be used to melt the dataframe
years_list = [str(year) for year in range(2010,2023)]
years_list.insert(0,'2008')
years_list.insert(0,'2006')
# melt dataframe from wide to long
democracy_index_melted = democracy_index.melt(id_vars=['region','2022_rank','country','regime_type','five_year_mean'],value_vars=years_list,var_name='year',value_name='democracy_index_score')

The women in parliments needs to be grouped by year so that each country has one value for each year. 

In [11]:
# groupby so that there is only one value for each year, get only lower house
lower_house = women_in_parliments[['country','lower_single_house_percent_w','lower_single_house_elections_year']].groupby(['country','lower_single_house_elections_year']).apply(lambda x: x['lower_single_house_percent_w'].mean(skipna=False)).dropna().reset_index()
lower_house = lower_house.rename({'lower_single_house_elections_year':'year',0:'lower_single_house_percent_w'},axis=1)
# take the mean across months so that there is only one value for each year, get only upper house
upper_house = women_in_parliments[['country','upper_house_senate_percent_w','upper_house_senate_elections_year']].groupby(['country','upper_house_senate_elections_year']).apply(lambda x: x['upper_house_senate_percent_w'].mean(skipna=False)).dropna().reset_index()
upper_house = upper_house.rename({'upper_house_senate_elections_year':'year',0:'upper_house_senate_percent_w'},axis=1)
# merge upper and lower back together
parliaments = lower_house.merge(upper_house,on=['country','year'], how='outer')

##### Changing the countries to match in both datasets

In order to join the two datasets together, both datasets must have the same spelling for countries.

In [12]:
# make sure there is not whitespace in country columns
democracy_index_melted['country'] = democracy_index_melted['country'].str.strip()
parliaments['country']  = parliaments['country'].str.strip()

In [13]:
# read in a list of countries and different spelling variations of those countries
countries = pd.read_csv('../data/list_of_countries.csv')
countries['Variation1'] = countries['Variation1'].astype(str)
countries['Variation2'] = countries['Variation2'].astype(str)
countries['Variation3'] = countries['Variation3'].astype(str)
countries_dict = countries.set_index('Country')[['Variation1','Variation2','Variation3']].to_dict(orient='index')

In [14]:
# make country names consistent 
parliaments['country'] = parliaments['country'].apply(change_countries,args=(countries_dict,))
democracy_index_melted['country'] = democracy_index_melted['country'].apply(change_countries,args=(countries_dict,))

In [15]:
# join the two datasets on year and country
df = democracy_index_melted.merge(parliaments, on=['country','year'], how='outer')

#### Overall correlation

In [16]:
corr = df[['democracy_index_score','lower_single_house_percent_w','upper_house_senate_percent_w']].corr()
corr

Unnamed: 0,democracy_index_score,lower_single_house_percent_w,upper_house_senate_percent_w
democracy_index_score,1.0,0.310565,0.180674
lower_single_house_percent_w,0.310565,1.0,0.626076
upper_house_senate_percent_w,0.180674,0.626076,1.0


#### Correlation by region

In [17]:
corr_by_region = df.groupby('region')[['democracy_index_score','lower_single_house_percent_w','upper_house_senate_percent_w']].corr()
corr_by_region

Unnamed: 0_level_0,Unnamed: 1_level_0,democracy_index_score,lower_single_house_percent_w,upper_house_senate_percent_w
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia and Australasia,democracy_index_score,1.0,0.128786,0.293848
Asia and Australasia,lower_single_house_percent_w,0.128786,1.0,0.885088
Asia and Australasia,upper_house_senate_percent_w,0.293848,0.885088,1.0
Central and Eastern Europe,democracy_index_score,1.0,0.067009,-0.442193
Central and Eastern Europe,lower_single_house_percent_w,0.067009,1.0,0.422929
Central and Eastern Europe,upper_house_senate_percent_w,-0.442193,0.422929,1.0
Latin America and the Caribbean,democracy_index_score,1.0,-0.217821,0.327432
Latin America and the Caribbean,lower_single_house_percent_w,-0.217821,1.0,0.824936
Latin America and the Caribbean,upper_house_senate_percent_w,0.327432,0.824936,1.0
Middle East and North Africa,democracy_index_score,1.0,0.376117,-0.500286


In [18]:
# drop nas so that the plot works
scatter_df = df.drop('upper_house_senate_percent_w',axis=1).dropna()
# make figure
fig = px.scatter(scatter_df, 
            x='democracy_index_score',
            y='lower_single_house_percent_w',
            template='plotly_white',    
            #hover_data=['country','year'],
            color='region',
            color_discrete_sequence=px.colors.qualitative.Prism,
            animation_frame='year',
            title= 'Democracy Index vs Percent Women in Lower Parliaments across Time',
            size='lower_single_house_percent_w',
            width=1200,
            height=800
        )

fig.update_xaxes(title_text='Democracy Index Score')
fig.update_yaxes(title_text='Percent Women in Lower Legislative Branches')

fig.show()

In [19]:
# This is other things I explored but that we are no longer using 

In [20]:
country_codes = pd.read_html('https://www.iban.com/country-codes')

In [21]:
# helper function to get  each women's time mandate_end as a column
#def get_last_year_in_office(df):
#    functions = {'hrllo': (' (assassinated)','',regex=False), 
#                    'test': ('Incumbent',np.NaNm),
#                     fillna: df['Mandate start']}


In [22]:
# read in female heads of state dataset
df = pd.read_csv('data/female_heads_of_state.csv')
countries = pd.read_csv('data/list_of_countries.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/female_heads_of_state.csv'

In [None]:
# add in the Alpha-3 code
df = df.merge(countries[['Country','Alpha-3 code']],on='Country')

# add colunms that have the start and end years of their terms
df['start_year'] = df['Mandate start'].apply(lambda x: f'19{x[-2:]}' if int(x[-2:]) > 39 else f'20{x[-2:]}')
df['Mandate end'] = df['Mandate end'].str.replace(' (assassinated)','',regex=False)
df['Mandate end'] = df['Mandate end'].replace('Incumbent',np.NaN)
df['Mandate end'] = df['Mandate end'].fillna(df['Mandate start'])
df['end_year'] = df['Mandate end'].apply(lambda x: f'19{x[-2:]}' if int(x[-2:]) > 39 else f'20{x[-2:]}')

In [None]:
# make it so that each year of each leader's term is a row
df['Year'] = [pd.date_range(x, y,freq='YS',inclusive='both') for x, y in zip(df.start_year, df.end_year)]
df = df.explode('Year')
df['Year'] = df['Year'].astype(str).apply(lambda x:x[:4])

In [None]:
# read in geojson for choropleth 
with open('data/countries.geojson') as f:
    gj = geojson.load(f)

# add in the 'id' which allows plotly to associate the cordinates to a teh countries in my dataframe
for feature in gj['features']:
    feature['id'] = feature['properties']['ISO_A3']

In [None]:
# only do three years to save memory b/c choropleth is to large otherwise
df_less = df[['Name','Country','Office','Head of state or government','Alpha-3 code','Year']].sort_values('Year')
df_less = df_less[df_less['Year'].isin([str(year) for year in range(2020,2023)])]

### Choropleth visualzation

In [None]:
# without animation (only 2022)
#fig = px.choropleth_mapbox(df[[df_less]==2022], geojson=gj, locations='Alpha-3 code', 
#                    color='Head of state or government', mapbox_style='carto-positron', zoom=1,
#                           color_discrete_sequence=px.colors.qualitative.Prism,
#                          hover_data=['Name','Country','Office'])

#fig.show()

In [None]:
# with animation (2020 - 2023)
#fig = px.choropleth_mapbox(df_less, geojson=gj, locations='Alpha-3 code', 
#                    color='Head of state or government', mapbox_style='carto-positron', zoom=1,
#                           color_discrete_sequence=px.colors.qualitative.Prism,
#                          hover_data=['Name','Country','Office'],animation_frame='Year')

#fig.show()

In [None]:
import plotly.graph_objects as go

In [None]:
fig = px.scatter(df, 
            x='democracy_indx_score',
            y='percent_W',
            template='plotly_white',    
            hover_data=['Country','year'],
            facet_row='Region',
            color='Region',
            color_discrete_sequence=px.colors.qualitative.Prism,
            animation_frame='year',
            title= 'Democracy Index vs Percent Women in Parliment across time',
            size='percent_W',
            height=800
        )
for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.YAxis:
        fig.layout[axis].title.text = ''
    if type(fig.layout[axis]) == go.layout.XAxis:
        fig.layout[axis].title.text = ''
 
# ensure that each chart has its own y range and tick labels
fig.update_yaxes(matches=None, showticklabels=True, visible=True)
#fig.update_xaxes(title_text='Democracy Index Score')
#fig.update_yaxes(title_text='Percent Women in Lower Legislative Branches')

fig.show()