In [None]:
import pandas as pd

# Location of the raw dataset CSV in the dashboard's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/selinekarabulut/gendergap_dashboard/main/Data/Data.csv'

# Read the raw table; the bare name on the last line renders it as the cell output.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,country,leader_name,times,sex,age,cochair,in_year,out_year,tenure,status,sysofgov
0,Albania,Ramiz Tafë Alia,,M,60.0,,1985.0,1991.0,6.0,0.0,parliamentary
1,Albania,Sali Berisha,I,M,47.0,,1991.0,1992.0,1.0,0.0,parliamentary
2,Albania,Sali Berisha,II,M,53.0,,1997.0,2013.0,17.0,0.0,parliamentary
3,Albania,Eduart Selami,,M,,,1992.0,1995.0,3.0,0.0,parliamentary
4,Albania,Tritan Shehu,,M,46.0,,1995.0,1997.0,2.0,0.0,parliamentary
...,...,...,...,...,...,...,...,...,...,...,...
4550,Zambia,Edgar Lungu,,M,58.0,,2014.0,,8.0,1.0,hybrid
4551,Zambia,Nevers Mumba,,M,52.0,,2012.0,,10.0,1.0,hybrid
4552,Zambia,Hakainde Hichilema,,M,44.0,,2006.0,,16.0,1.0,hybrid
4553,Zambia,,,,,,,,,,hybrid


In [None]:
# Number of rows in which BOTH 'sex' and 'in_year' are missing.
# Rows that lack only one of the two columns are not included in this count.
both_missing = df[['sex', 'in_year']].isna().all(axis=1)
missing_count = int(both_missing.sum())

missing_count

368

In [None]:
# Keep only the rows that have a value in both 'sex' and 'in_year'.
# A row is dropped as soon as either of the two is missing (how='any').
required_columns = ['sex', 'in_year']
data_cleaned = df.dropna(how='any', subset=required_columns)
data_cleaned.head()


Unnamed: 0,country,leader_name,times,sex,age,cochair,in_year,out_year,tenure,status,sysofgov
0,Albania,Ramiz Tafë Alia,,M,60.0,,1985.0,1991.0,6.0,0.0,parliamentary
1,Albania,Sali Berisha,I,M,47.0,,1991.0,1992.0,1.0,0.0,parliamentary
2,Albania,Sali Berisha,II,M,53.0,,1997.0,2013.0,17.0,0.0,parliamentary
3,Albania,Eduart Selami,,M,,,1992.0,1995.0,3.0,0.0,parliamentary
4,Albania,Tritan Shehu,,M,46.0,,1995.0,1997.0,2.0,0.0,parliamentary


In [None]:
# Write the cleaned table to a local CSV file, leaving out the DataFrame index.
data_cleaned.to_csv(path_or_buf='data_cleaned.csv', index=False)

In [None]:
# Offer the cleaned CSV as a browser download when running inside Google Colab.
# The google.colab package is only importable in the Colab runtime, so the import
# is guarded to keep "Restart & Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('data_cleaned.csv')
else:
    print("google.colab is not available; skipping the browser download of 'data_cleaned.csv'.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import altair as alt

# Tally the rows per 'sex' value as a two-column table: 'sex' and 'count'
sex_counts = (
    data_cleaned['sex']
    .value_counts()
    .rename_axis('sex')
    .reset_index(name='count')
)

# Each count as a share of all counted rows, in percent, rounded to two decimals
total = sex_counts['count'].sum()
sex_counts['percentage'] = sex_counts['count'].div(total).mul(100).round(2)

# Encodings for the donut: arc size follows the count, colour follows the sex
arc_angle = alt.Theta(field="count", type="quantitative")
arc_color = alt.Color(field="sex", type="nominal", legend=alt.Legend(title="Sex"))
hover_fields = [
    alt.Tooltip('sex'),
    alt.Tooltip('count'),
    alt.Tooltip('percentage', title='Percentage', format='.2f'),
]

# An arc mark with a non-zero inner radius renders as a donut
donut_chart = (
    alt.Chart(sex_counts)
    .mark_arc(innerRadius=50)
    .encode(theta=arc_angle, color=arc_color, tooltip=hover_fields)
    .properties(title="Percentage of Women (F) vs Men (M) in the Dataset")
)

# Render the chart as the cell output
donut_chart.display()

In [None]:
import plotly.express as px

# The same breakdown drawn with Plotly Express: a pie chart with hole=0.5 (a donut).
# Reads `sex_counts`, which the Altair donut cell builds.
donut_chart = px.pie(
    data_frame=sex_counts,
    names='sex',
    values='count',
    hole=0.5,
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Percentage of Women (F) vs Men (M) in the Dataset',
)
donut_chart.show()


In [None]:
import altair as alt
import pandas as pd

# Percentage of each sex among the rows of every (country, in_year) pair.
# Relies on `data_cleaned` having been created by an earlier cell.
gender_percentage = data_cleaned.groupby(['country', 'in_year', 'sex']).size().unstack(fill_value=0)
total = gender_percentage.sum(axis=1)
gender_percentage = gender_percentage.div(total, axis=0) * 100

# Reshape to long form for Altair: one row per (country, in_year, sex)
df_reshaped = gender_percentage.reset_index().melt(id_vars=['country', 'in_year'], value_vars=['F', 'M'], var_name='sex', value_name='percentage')

# Enable dark theme
alt.themes.enable("dark")

# Create the heatmap.
# The long table holds two rows (F and M) for every country/year cell. Drawn in a
# single panel, both rectangles land on the same spot and one hides the other, so
# the chart is split into one row of panels per sex.
heatmap = alt.Chart(df_reshaped).mark_rect().encode(
    row=alt.Row('sex:N', title="Sex"),
    y=alt.Y('in_year:O', axis=alt.Axis(title="Year", titleFontSize=16, titlePadding=15, titleFontWeight=900, labelAngle=0)),
    x=alt.X('country:O', axis=alt.Axis(title="Country", titleFontSize=16, titlePadding=15, titleFontWeight=900)),
    color=alt.Color('percentage:Q',
                    legend=alt.Legend(title="Percentage"),
                    scale=alt.Scale(scheme="blueorange")),
    stroke=alt.value('black'),
    strokeWidth=alt.value(0.25),
    tooltip=[
        alt.Tooltip('in_year:O', title='Year'),
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('sex:N', title='Sex'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.2f')
    ]
).properties(width=900).configure_axis(
    labelFontSize=12,
    titleFontSize=12
)

# Display the chart
heatmap.display()


In [None]:
import altair as alt
import pandas as pd

# Percentage of rows with sex == 'F' for each (country, in_year) pair.
# Relies on `data_cleaned` having been created by an earlier cell.
total = data_cleaned.groupby(['country', 'in_year']).size()
gender_percentage = (
    data_cleaned[data_cleaned['sex'] == 'F']
    .groupby(['country', 'in_year'])
    .size()
    # Pairs with no 'F' rows are absent from this count; restore them as 0 so the
    # division below yields 0% for them instead of NaN.
    .reindex(total.index, fill_value=0)
)
female_percentage = (gender_percentage / total * 100).reset_index(name='percentage')

# Add a constant 'sex' label column so the tooltip can show which sex is plotted
female_percentage['sex'] = 'Female'

# Enable dark theme
alt.themes.enable("dark")

# Create the heatmap: one rectangle per country/year, coloured by the female percentage
heatmap = alt.Chart(female_percentage).mark_rect().encode(
    y=alt.Y('in_year:O', axis=alt.Axis(title="Year", titleFontSize=16, titlePadding=15, titleFontWeight=900, labelAngle=0)),
    x=alt.X('country:O', axis=alt.Axis(title="Country", titleFontSize=16, titlePadding=15, titleFontWeight=900)),
    color=alt.Color('percentage:Q',
                    legend=alt.Legend(title="Percentage of Females"),
                    scale=alt.Scale(scheme="blueorange")),
    stroke=alt.value('black'),
    strokeWidth=alt.value(0.55),
    tooltip=[
        alt.Tooltip('in_year:O', title='Year'),
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('sex:N', title='Sex'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.2f')
    ]
).properties(width=950).configure_axis(
    labelFontSize=12,
    titleFontSize=12
)

# Display the chart
heatmap.display()


In [None]:

# Coordinates table (countries and US states, per the file name) from the dashboard's GitHub repository
lat_lon_path = "https://raw.githubusercontent.com/selinekarabulut/gendergap_dashboard/main/Data/world_country_and_usa_states_latitude_and_longitude_values.csv"
lat_lon_data = pd.read_csv(filepath_or_buffer=lat_lon_path)

# Preview the top rows to see which columns the table provides
lat_lon_data.head(n=5)




Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska
1,AE,23.424076,53.847818,United Arab Emirates,AL,32.318231,-86.902298,Alabama
2,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
3,AG,17.060816,-61.796428,Antigua and Barbuda,AZ,34.048928,-111.093731,Arizona
4,AI,18.220554,-63.068615,Anguilla,CA,36.778261,-119.417932,California


In [None]:
# Only the country-level coordinate columns are needed from the lookup table
country_lat_lon = lat_lon_data.loc[:, ['country', 'latitude', 'longitude']]

# Left join on 'country': every row of data_cleaned is kept, and rows whose country
# has no match in the lookup table get NaN coordinates
merged_data = data_cleaned.merge(country_lat_lon, how='left', on='country')

# Preview the joined table
merged_data.head()


Unnamed: 0,country,Unnamed: 1,Unnamed: 2,leader_name,times,sex,age,cochair,in_year,out_year,tenure,status,sysofgov,latitude,longitude
0,Albania,,,Ramiz Tafë Alia,,M,60.0,,1985.0,1991.0,6.0,0.0,parliamentary,41.153332,20.168331
1,Albania,,,Sali Berisha,I,M,47.0,,1991.0,1992.0,1.0,0.0,parliamentary,41.153332,20.168331
2,Albania,,,Sali Berisha,II,M,53.0,,1997.0,2013.0,17.0,0.0,parliamentary,41.153332,20.168331
3,Albania,,,Eduart Selami,,M,,,1992.0,1995.0,3.0,0.0,parliamentary,41.153332,20.168331
4,Albania,,,Tritan Shehu,,M,46.0,,1995.0,1997.0,2.0,0.0,parliamentary,41.153332,20.168331


In [None]:
# Row counts per (country, in_year), one column per 'sex' value; absent combinations count as 0
gender_percentage = (
    merged_data
    .groupby(['country', 'in_year', 'sex'])
    .size()
    .unstack(fill_value=0)
)

# Convert the counts to percentages of each (country, in_year) total
total = gender_percentage.sum(axis=1)
gender_percentage = gender_percentage.div(total, axis=0).mul(100).reset_index()

# Female share only, labelled and joined with the country coordinates.
# Rows for countries without a coordinate match are discarded by the final dropna.
female_percentage = (
    gender_percentage.loc[:, ['country', 'in_year', 'F']]
    .rename(columns={'F': 'percentage'})
    .assign(sex='Female')
    .merge(country_lat_lon, how='left', on='country')
    .dropna(subset=['latitude', 'longitude'])
)

# Preview the result
female_percentage.head()


Unnamed: 0,country,in_year,percentage,sex,latitude,longitude
0,Albania,1985.0,0.0,Female,41.153332,20.168331
1,Albania,1991.0,0.0,Female,41.153332,20.168331
2,Albania,1992.0,0.0,Female,41.153332,20.168331
3,Albania,1995.0,0.0,Female,41.153332,20.168331
4,Albania,1997.0,0.0,Female,41.153332,20.168331


In [None]:
import plotly.express as px

# Animated choropleth of the female percentage per country, one frame per 'in_year'.
#
# Plotly Express builds animation frames in the order the frame values first appear
# in the data. `female_percentage` is ordered by country first, so it is sorted by
# year here to make the slider run chronologically.
female_percentage_by_year = female_percentage.sort_values(['in_year', 'country'])

choropleth_map = px.choropleth(
    female_percentage_by_year,
    locations="country",
    locationmode='country names',
    color="percentage",
    hover_name="country",
    hover_data={"percentage": True, "in_year": True},
    animation_frame="in_year",
    color_continuous_scale=px.colors.sequential.Plasma,
    # Percentages are bounded by 0 and 100; pinning the range keeps the colours
    # comparable from one frame to the next.
    range_color=[0, 100],
    labels={'percentage': 'Percentage of Females'},
    title="Percentage of Females by Year in Each Country"
)

choropleth_map.show()
