In [76]:
#Importing required libraries
import numpy as np
import pandas as pd

#Updating setting to prevent truncation of DataFrame column content.
#None means "no width limit". The older -1 sentinel is deprecated since
#pandas 1.0 and rejected by pandas 2.x; pre-1.0 releases only accept an int,
#so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [77]:
#Loading the country metadata table (first dataset) and previewing two rows
country = pd.read_csv(filepath_or_buffer='Gender_Stats_csv/Gender_StatsCountry.csv')
country.head(n=2)

Unnamed: 0,Country Code,Short Name,Table Name,Long Name,2-alpha code,Currency Unit,Special Notes,Region,Income Group,WB-2 code,...,Government Accounting concept,IMF data dissemination standard,Latest population census,Latest household survey,Source of most recent Income and expenditure data,Vital registration complete,Latest agricultural census,Latest industrial data,Latest trade data,Unnamed: 30
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,,Latin America & Caribbean,High income,AW,...,,Enhanced General Data Dissemination System (e-GDDS),2010,,,Yes,,,2016.0,
1,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,,South Asia,Low income,AF,...,Consolidated central government,Enhanced General Data Dissemination System (e-GDDS),1979,"Demographic and Health Survey, 2015","Integrated household survey (IHS), 2016/17",,,,2017.0,


In [78]:
#Keeping only the code, short name, region and income-group columns
keep_cols = ['Country Code', 'Short Name', 'Region', 'Income Group']
country = country.loc[:, keep_cols]
country.head()

Unnamed: 0,Country Code,Short Name,Region,Income Group
0,ABW,Aruba,Latin America & Caribbean,High income
1,AFG,Afghanistan,South Asia,Low income
2,AGO,Angola,Sub-Saharan Africa,Lower middle income
3,ALB,Albania,Europe & Central Asia,Upper middle income
4,AND,Andorra,Europe & Central Asia,High income


In [79]:
#Printing dtypes and non-null counts for each column of the country table
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 4 columns):
Country Code    263 non-null object
Short Name      263 non-null object
Region          217 non-null object
Income Group    217 non-null object
dtypes: object(4)
memory usage: 8.3+ KB


In [80]:
#Loading the indicator table (second dataset) and previewing two rows
stats = pd.read_csv(filepath_or_buffer='Gender_Stats_csv/Gender_StatsData.csv')
stats.head(n=2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,A woman can be head of household in the same way as a man (1=yes; 0=no),SG.HLD.HEAD.EQ,,,,,,,...,,,,,,,,,,


In [81]:
#Reshaping the wide table (one column per year) into long format
#(one row per country / indicator / year)
idx = ['Country Name','Country Code','Indicator Name','Indicator Code']
multi_index_df = stats.set_index(idx)
#dropna=False keeps the missing observations as explicit NaN rows
stacked_df = multi_index_df.stack(dropna=False)
#After reset_index the stacked column labels land in an automatically named
#'level_<n>' column, where n is the number of id columns; the values land in column 0
year_level = f'level_{len(idx)}'
long_df = stacked_df.reset_index().rename(
    columns={year_level: 'Year', 0: 'Indicator Value'}
)
long_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
0,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1960,
1,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1961,
2,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1962,
3,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1963,
4,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1964,


In [82]:
#Printing dtypes and memory usage of the long-format table
long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10174944 entries, 0 to 10174943
Data columns (total 6 columns):
Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
Year               object
Indicator Value    float64
dtypes: float64(1), object(5)
memory usage: 465.8+ MB


In [83]:
#Listing the distinct values found in the Year column
long_df['Year'].unique()

array(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967',
       '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975',
       '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
       '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
       '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', 'Unnamed: 65'],
      dtype=object)

In [84]:
#'Unnamed: 65' is not a year: it is the unnamed trailing column of the stats
#table that was stacked together with the real year columns, so drop it first.
#astype on the filtered frame returns a new object, which avoids assigning a
#column onto a slice of long_df (the cause of SettingWithCopyWarning).
long_df = long_df.loc[long_df['Year'] != 'Unnamed: 65'].astype({'Year': int})
stats_df = long_df.drop('Indicator Code', axis=1)
#Restricting the data to the years 1980 through 2019 (2020 is excluded)
stats_df = stats_df.loc[(stats_df['Year'] >= 1980) & (stats_df['Year'] < 2020)]
stats_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,Indicator Value
20,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,
21,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,
22,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,
23,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,
24,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,


In [85]:
#Attaching region and income group to every observation via a left join on
#the country code; rows left without a Region value are then discarded
join_df = stats_df.merge(country, how='left', on='Country Code')
ordered_cols = ['Country Code','Country Name','Region','Income Group','Indicator Name','Year','Indicator Value']
join_df = join_df.loc[join_df['Region'].notna(), ordered_cols]
join_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [86]:
#Per-country completeness summary: {country: [total rows, null values, non-null values]}.
#One groupby pass replaces re-filtering the whole frame once per country.
#sort=False keeps countries in order of first appearance, as unique() did.
#'size' counts every row in the group, 'count' only the non-null values.
value_counts_by_country = (
    join_df.groupby('Country Name', sort=False)['Indicator Value']
           .agg(['size', 'count'])
)
country_ncheck = {
    name: [total, total - non_null, non_null]
    for name, total, non_null in value_counts_by_country.itertuples(name=None)
}

In [87]:
#Turning the per-country counts into a table, countries with the most nulls first
col_names2 = ['Country Name','Total Count','Null Count','Non-Null Count']
country_ndf = pd.DataFrame.from_dict(country_ncheck, orient='index').reset_index()
country_ndf.columns = col_names2
country_ndf = country_ndf.sort_values(by=['Null Count'], ascending=False)
country_ndf.head()

Unnamed: 0,Country Name,Total Count,Null Count,Non-Null Count
146,Northern Mariana Islands,24960,24784,176
183,St. Martin (French part),24960,24735,225
172,Sint Maarten (Dutch part),24960,24732,228
3,American Samoa,24960,24715,245
65,Faroe Islands,24960,24659,301


In [88]:
#Countries with at most 5000 non-null observations are earmarked for removal
sparse_country_mask = country_ndf['Non-Null Count'].le(5000)
remove_country = country_ndf.loc[sparse_country_mask]
rc_lst = remove_country['Country Name'].tolist()

In [89]:
#Discarding every row that belongs to a country listed in rc_lst
in_removed_country = join_df['Country Name'].isin(rc_lst)
join_df = join_df.loc[~in_removed_country]
join_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [90]:
#Row and column count of join_df at this point
join_df.shape

(4467840, 7)

In [91]:
#Per-indicator completeness summary: {indicator: [total rows, null values, non-null values]}.
#One groupby pass replaces re-filtering the whole frame once per indicator.
#sort=False keeps indicators in order of first appearance, as unique() did.
#'size' counts every row in the group, 'count' only the non-null values.
value_counts_by_indicator = (
    join_df.groupby('Indicator Name', sort=False)['Indicator Value']
           .agg(['size', 'count'])
)
null_check = {
    name: [total, total - non_null, non_null]
    for name, total, non_null in value_counts_by_indicator.itertuples(name=None)
}

In [92]:
#Turning the per-indicator counts into a table, indicators with the most nulls first
col_names = ['Indicator Name','Total Count','Null Count','Non-Null Count']
indicator_ndf = pd.DataFrame.from_dict(null_check, orient='index').reset_index()
indicator_ndf.columns = col_names
indicator_ndf = indicator_ndf.sort_values(by=['Null Count'], ascending=False)
indicator_ndf.head(10)

Unnamed: 0,Indicator Name,Total Count,Null Count,Non-Null Count
136,"Female share of graduates in Health and Welfare programmes, tertiary (%)",7160,7160,0
132,"Female share of graduates in Agriculture programmes, tertiary (%)",7160,7160,0
133,"Female share of graduates in Education programmes, tertiary (%)",7160,7160,0
143,"Female share of graduates in unknown or unspecified fields, tertiary (%)",7160,7160,0
142,"Female share of graduates in Social Science, Business and Law programmes, tertiary (%)",7160,7160,0
141,"Female share of graduates in Services programmes, tertiary (%)",7160,7160,0
140,"Female share of graduates in Science programmes, tertiary (%)",7160,7160,0
134,"Female share of graduates in Engineering, Manufacturing and Construction programmes, tertiary (%)",7160,7160,0
137,"Female share of graduates in Humanities and Arts programmes, tertiary (%)",7160,7160,0
219,Length of paid parental leave for mother (days),7160,7117,43


In [93]:
#Indicators with at most 5000 non-null observations are earmarked for removal
sparse_indicator_mask = indicator_ndf['Non-Null Count'].le(5000)
remove_indicator = indicator_ndf.loc[sparse_indicator_mask]
ri_lst = remove_indicator['Indicator Name'].tolist()

In [94]:
#Discarding every row that belongs to an indicator listed in ri_lst
in_removed_indicator = join_df['Indicator Name'].isin(ri_lst)
join_df = join_df.loc[~in_removed_indicator]
join_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [95]:
#Row and column count of join_df at this point
join_df.shape

(923640, 7)

In [96]:
#Listing the distinct indicator names currently present in join_df
join_df['Indicator Name'].unique()

array(['A woman can apply for a passport in the same way as a man (1=yes; 0=no)',
       'A woman can be head of household in the same way as a man (1=yes; 0=no)',
       'A woman can choose where to live in the same way as a man (1=yes; 0=no)',
       'A woman can get a job in the same way as a man (1=yes; 0=no)',
       'A woman can obtain a judgment of divorce in the same way as a man (1=yes; 0=no)',
       'A woman can open a bank account in the same way as a man (1=yes; 0=no)',
       'A woman can register a business in the same way as a man (1=yes; 0=no)',
       'A woman can sign a contract in the same way as a man (1=yes; 0=no)',
       'A woman can travel outside her home in the same way as a man (1=yes; 0=no)',
       'A woman can travel outside the country in the same way as a man (1=yes; 0=no)',
       'A woman has the same rights to remarry as a man (1=yes; 0=no)',
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Age dependency ratio (% of wo

In [97]:
#Restricting the data to the 14 indicators of interest (primary education,
#labour market, GDP and employment law). A single isin() membership test
#replaces fourteen separate equality masks OR-ed together.
selected_indicators = [
    "School enrollment, primary (gross), gender parity index (GPI)",
    "School enrollment, primary, female (% gross)",
    "School enrollment, primary, male (% gross)",
    "Primary education, pupils (% female)",
    "Labor force, female (% of total labor force)",
    "Employers, female (% of female employment) (modeled ILO estimate)",
    "Employers, male (% of male employment) (modeled ILO estimate)",
    "Ratio of female to male labor force participation rate (%) (modeled ILO estimate)",
    "Wage and salaried workers, female (% of female employment) (modeled ILO estimate)",
    "Wage and salaried workers, male (% of male employment) (modeled ILO estimate)",
    "GDP growth (annual %)",
    "GDP per capita (Current US$)",
    "Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)",
    "Law prohibits discrimination in employment based on gender (1=yes; 0=no)",
]
join_df = join_df[join_df['Indicator Name'].isin(selected_indicators)]

In [98]:
#Listing the distinct indicator names that remain in join_df
join_df['Indicator Name'].unique()

array(['Employers, female (% of female employment) (modeled ILO estimate)',
       'Employers, male (% of male employment) (modeled ILO estimate)',
       'GDP growth (annual %)', 'GDP per capita (Current US$)',
       'Labor force, female (% of total labor force)',
       'Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)',
       'Law prohibits discrimination in employment based on gender (1=yes; 0=no)',
       'Primary education, pupils (% female)',
       'Ratio of female to male labor force participation rate (%) (modeled ILO estimate)',
       'School enrollment, primary (gross), gender parity index (GPI)',
       'School enrollment, primary, female (% gross)',
       'School enrollment, primary, male (% gross)',
       'Wage and salaried workers, female (% of female employment) (modeled ILO estimate)',
       'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)'],
      dtype=object)

In [99]:
#Pivoting to one row per country-year with one column per indicator
#(pivot_table combines values with its default aggregation function)
ind_pivot = (
    join_df.pivot_table(
        values='Indicator Value',
        index=['Country Code','Country Name','Region','Income Group','Year'],
        columns='Indicator Name',
    )
    .reset_index()              #turn the index levels back into ordinary columns
    .rename_axis(None, axis=1)  #clear the 'Indicator Name' label on the column axis
)
ind_pivot.head(2)

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,"Employers, female (% of female employment) (modeled ILO estimate)","Employers, male (% of male employment) (modeled ILO estimate)",GDP growth (annual %),GDP per capita (Current US$),"Labor force, female (% of total labor force)",Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no),Law prohibits discrimination in employment based on gender (1=yes; 0=no),"Primary education, pupils (% female)",Ratio of female to male labor force participation rate (%) (modeled ILO estimate),"School enrollment, primary (gross), gender parity index (GPI)","School enrollment, primary, female (% gross)","School enrollment, primary, male (% gross)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)"
0,AFG,Afghanistan,South Asia,Low income,1980,,,,272.655286,,0.0,0.0,18.06618,,0.22998,16.289379,70.829109,,
1,AFG,Afghanistan,South Asia,Low income,1981,,,,264.111317,,0.0,0.0,18.47081,,0.23702,18.03261,76.080238,,


In [106]:
#Replacing the long indicator names with shorter column labels.
#NOTE(review): 'Gross Primary School Enrollment' holds the gender parity index
#(GPI) of gross primary enrollment, not an enrollment rate. The label is kept
#as-is here because later cells may refer to it.
short_column_names = {
    'Employers, female (% of female employment) (modeled ILO estimate)': 'Female Employers (%)',
    'Employers, male (% of male employment) (modeled ILO estimate)': 'Male Employers (%)',
    'GDP growth (annual %)': 'Annual GDP Growth (%)',
    'GDP per capita (Current US$)': 'GDP per capita (US$)',
    'Labor force, female (% of total labor force)': 'Female Labor Force (%)',
    'Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)': 'Equal Remuneration for Females and Males (1=yes; 0=no)',
    'Law prohibits discrimination in employment based on gender (1=yes; 0=no)': 'Law Prohibits Gender Discrimination in Employment (1=yes; 0=no)',
    'Primary education, pupils (% female)': 'Female Students in Primary Education (%)',
    'Ratio of female to male labor force participation rate (%) (modeled ILO estimate)': 'Female to Male Labor Force Participation Rate (%)',
    'School enrollment, primary (gross), gender parity index (GPI)': 'Gross Primary School Enrollment',
    'School enrollment, primary, female (% gross)': 'Female Primary School Enrollment (%)',
    'School enrollment, primary, male (% gross)': 'Male Primary School Enrollment (%)',
    'Wage and salaried workers, female (% of female employment) (modeled ILO estimate)': 'Female Wage and Salaried Workers (%)',
    'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)': 'Male Wage and Salaried Workers (%)',
}
ind_pivot = ind_pivot.rename(columns=short_column_names)

ind_pivot.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,Female Employers (%),Male Employers (%),Annual GDP Growth (%),GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males (1=yes; 0=no),Law Prohibits Gender Discrimination in Employment (1=yes; 0=no),Female Students in Primary Education (%),Female to Male Labor Force Participation Rate (%),Gross Primary School Enrollment,Female Primary School Enrollment (%),Male Primary School Enrollment (%),Female Wage and Salaried Workers (%),Male Wage and Salaried Workers (%)
0,AFG,Afghanistan,South Asia,Low income,1980,,,,272.655286,,0.0,0.0,18.06618,,0.22998,16.289379,70.829109,,
1,AFG,Afghanistan,South Asia,Low income,1981,,,,264.111317,,0.0,0.0,18.47081,,0.23702,18.03261,76.080238,,
2,AFG,Afghanistan,South Asia,Low income,1982,,,,,,0.0,0.0,32.38347,,0.50285,11.49034,22.85051,,
3,AFG,Afghanistan,South Asia,Low income,1983,,,,,,0.0,0.0,,,,,,,
4,AFG,Afghanistan,South Asia,Low income,1984,,,,,,0.0,0.0,30.97858,,0.47464,14.01731,29.532339,,


In [107]:
#Row and column count of the pivoted table
ind_pivot.shape

(7160, 19)

In [108]:
#Printing dtypes and non-null counts for each column of the pivoted table
ind_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 19 columns):
Country Code                                                       7160 non-null object
Country Name                                                       7160 non-null object
Region                                                             7160 non-null object
Income Group                                                       7160 non-null object
Year                                                               7160 non-null int64
Female Employers (%)                                               5162 non-null float64
Male Employers (%)                                                 5162 non-null float64
Annual GDP Growth (%)                                              6513 non-null float64
GDP per capita (US$)                                               6584 non-null float64
Female Labor Force (%)                                             5329 non-null float64
Equal Rem

In [113]:
#Keeping only the rows in which every column has a value
ind_pivot = ind_pivot.dropna(axis='index', how='any')
ind_pivot.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,Female Employers (%),Male Employers (%),Annual GDP Growth (%),GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males (1=yes; 0=no),Law Prohibits Gender Discrimination in Employment (1=yes; 0=no),Female Students in Primary Education (%),Female to Male Labor Force Participation Rate (%),Gross Primary School Enrollment,Female Primary School Enrollment (%),Male Primary School Enrollment (%),Female Wage and Salaried Workers (%),Male Wage and Salaried Workers (%)
23,AFG,Afghanistan,South Asia,Low income,2003,0.059,0.567,8.832278,190.683814,15.353224,0.0,0.0,34.76495,19.499872,0.5622,66.69796,118.637657,1.67,9.591
24,AFG,Afghanistan,South Asia,Low income,2004,0.057,0.568,1.414118,211.382117,15.513016,0.0,0.0,29.1264,19.784493,0.43282,62.591,144.611923,1.634,9.604
25,AFG,Afghanistan,South Asia,Low income,2005,0.061,0.574,11.229715,242.031285,15.685935,0.0,0.0,35.68696,20.046942,0.58334,71.812073,123.106041,1.741,9.784
26,AFG,Afghanistan,South Asia,Low income,2006,0.058,0.57,5.357403,263.733692,15.534228,0.0,0.0,37.23035,19.713375,0.62357,77.668404,124.554092,1.699,9.87
27,AFG,Afghanistan,South Asia,Low income,2007,0.059,0.569,13.82632,359.693238,15.402691,0.0,0.0,36.89484,19.392614,0.61379,74.821838,121.900589,1.756,10.082


In [119]:
#Renumbering the rows from 0; drop=True discards the old index instead of
#keeping it as a column
final_df = ind_pivot.reset_index(drop=True)
final_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,Female Employers (%),Male Employers (%),Annual GDP Growth (%),GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males (1=yes; 0=no),Law Prohibits Gender Discrimination in Employment (1=yes; 0=no),Female Students in Primary Education (%),Female to Male Labor Force Participation Rate (%),Gross Primary School Enrollment,Female Primary School Enrollment (%),Male Primary School Enrollment (%),Female Wage and Salaried Workers (%),Male Wage and Salaried Workers (%)
0,AFG,Afghanistan,South Asia,Low income,2003,0.059,0.567,8.832278,190.683814,15.353224,0.0,0.0,34.76495,19.499872,0.5622,66.69796,118.637657,1.67,9.591
1,AFG,Afghanistan,South Asia,Low income,2004,0.057,0.568,1.414118,211.382117,15.513016,0.0,0.0,29.1264,19.784493,0.43282,62.591,144.611923,1.634,9.604
2,AFG,Afghanistan,South Asia,Low income,2005,0.061,0.574,11.229715,242.031285,15.685935,0.0,0.0,35.68696,20.046942,0.58334,71.812073,123.106041,1.741,9.784
3,AFG,Afghanistan,South Asia,Low income,2006,0.058,0.57,5.357403,263.733692,15.534228,0.0,0.0,37.23035,19.713375,0.62357,77.668404,124.554092,1.699,9.87
4,AFG,Afghanistan,South Asia,Low income,2007,0.059,0.569,13.82632,359.693238,15.402691,0.0,0.0,36.89484,19.392614,0.61379,74.821838,121.900589,1.756,10.082


In [120]:
#Row and column count of the final table
final_df.shape

(3824, 19)