In [34]:
#Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Makes graph display in notebook
%matplotlib inline 

#Updating setting to prevent truncation of DataFrame column content
pd.set_option('display.max_colwidth',-1)

In [35]:
#Reading the first dataset
country = pd.read_csv('Gender_Stats_csv/Gender_StatsCountry.csv')
country.head(2)

Unnamed: 0,Country Code,Short Name,Table Name,Long Name,2-alpha code,Currency Unit,Special Notes,Region,Income Group,WB-2 code,...,Government Accounting concept,IMF data dissemination standard,Latest population census,Latest household survey,Source of most recent Income and expenditure data,Vital registration complete,Latest agricultural census,Latest industrial data,Latest trade data,Unnamed: 30
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,,Latin America & Caribbean,High income,AW,...,,Enhanced General Data Dissemination System (e-GDDS),2010,,,Yes,,,2016.0,
1,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,,South Asia,Low income,AF,...,Consolidated central government,Enhanced General Data Dissemination System (e-GDDS),1979,"Demographic and Health Survey, 2015","Integrated household survey (IHS), 2016/17",,,,2017.0,


In [3]:
#Dropping the non-required columns
country = country[['Country Code','Short Name','Region','Income Group']]
country.head(2)

Unnamed: 0,Country Code,Short Name,Region,Income Group
0,ABW,Aruba,Latin America & Caribbean,High income
1,AFG,Afghanistan,South Asia,Low income


In [4]:
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 4 columns):
Country Code    263 non-null object
Short Name      263 non-null object
Region          217 non-null object
Income Group    217 non-null object
dtypes: object(4)
memory usage: 8.3+ KB


In [5]:
#Reading the second dataset
stats = pd.read_csv('Gender_Stats_csv/Gender_StatsData.csv')
stats.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,A woman can be head of household in the same way as a man (1=yes; 0=no),SG.HLD.HEAD.EQ,,,,,,,...,,,,,,,,,,


In [6]:
#Converting the horizontal dataset into vertical format
idx = ['Country Name','Country Code','Indicator Name','Indicator Code']
multi_index_df = stats.set_index(idx)
stacked_df = multi_index_df.stack(dropna=False)
long_df = stacked_df.reset_index()
long_df.rename({'level_4':'Year', 0:'Indicator Value'}, axis=1, inplace=True)
long_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
0,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1960,
1,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1961,
2,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1962,
3,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1963,
4,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),SG.APL.PSPT.EQ,1964,


In [7]:
long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10174944 entries, 0 to 10174943
Data columns (total 6 columns):
Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
Year               object
Indicator Value    float64
dtypes: float64(1), object(5)
memory usage: 465.8+ MB


In [8]:
long_df['Year'].unique()

array(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967',
       '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975',
       '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
       '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
       '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', 'Unnamed: 65'],
      dtype=object)

In [9]:
year_df = long_df.loc[long_df['Year']!='Unnamed: 65']
year_df['Year'] = year_df['Year'].astype(int)

stats_df = year_df.drop('Indicator Code', axis=1)
stats_df = stats_df.loc[(stats_df['Year']>=1980) & (stats_df['Year']<2020)]
stats_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Country Name,Country Code,Indicator Name,Year,Indicator Value
20,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,
21,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,
22,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,
23,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,
24,Arab World,ARB,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,


In [10]:
join_df = pd.merge(stats_df,country,on='Country Code',how='left')
join_df = join_df[['Country Code','Country Name','Region','Income Group','Indicator Name','Year','Indicator Value']]
join_df = join_df[join_df['Region'].notnull()]
join_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [11]:
country_ncheck = {}

for i in join_df['Country Name'].unique():
    df = join_df[join_df['Country Name']==i]
    total_cnt = df['Indicator Value'].size
    null_cnt = df['Indicator Value'].isnull().sum()
    non_null_cnt = df['Indicator Value'].notnull().sum()
    cnt_list = [total_cnt, null_cnt, non_null_cnt]
    country_ncheck[i] = cnt_list

In [12]:
country_ndf = pd.DataFrame.from_dict(country_ncheck, orient='index')
country_ndf.reset_index(inplace=True)
col_names2 = ['Country Name','Total Count','Null Count','Non-Null Count']
country_ndf.columns = col_names2
country_ndf.sort_values(['Null Count'], ascending=False, inplace=True)
country_ndf.head()

Unnamed: 0,Country Name,Total Count,Null Count,Non-Null Count
146,Northern Mariana Islands,24960,24784,176
183,St. Martin (French part),24960,24735,225
172,Sint Maarten (Dutch part),24960,24732,228
3,American Samoa,24960,24715,245
65,Faroe Islands,24960,24659,301


In [13]:
remove_country = country_ndf[country_ndf['Non-Null Count']<=5000]
rc_lst = list(remove_country['Country Name'])

filter_df1 = join_df[~join_df['Country Name'].isin(rc_lst)]
filter_df1.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [15]:
filter_df1.shape

(4467840, 7)

In [17]:
indicator_ncheck = {}

for i in filter_df1['Indicator Name'].unique():
    df = filter_df1[join_df['Indicator Name']==i]
    total_cnt = df['Indicator Value'].size
    null_cnt = df['Indicator Value'].isnull().sum()
    non_null_cnt = df['Indicator Value'].notnull().sum()
    cnt_list = [total_cnt, null_cnt, non_null_cnt]
    indicator_ncheck[i] = cnt_list

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [18]:
indicator_ndf = pd.DataFrame.from_dict(indicator_ncheck, orient='index')
indicator_ndf.reset_index(inplace=True)
col_names = ['Indicator Name','Total Count','Null Count','Non-Null Count']
indicator_ndf.columns = col_names
indicator_ndf.sort_values(['Null Count'], ascending=False, inplace=True)
indicator_ndf.head()

Unnamed: 0,Indicator Name,Total Count,Null Count,Non-Null Count
136,"Female share of graduates in Health and Welfare programmes, tertiary (%)",7160,7160,0
132,"Female share of graduates in Agriculture programmes, tertiary (%)",7160,7160,0
133,"Female share of graduates in Education programmes, tertiary (%)",7160,7160,0
143,"Female share of graduates in unknown or unspecified fields, tertiary (%)",7160,7160,0
142,"Female share of graduates in Social Science, Business and Law programmes, tertiary (%)",7160,7160,0


In [19]:
remove_indicator = indicator_ndf[indicator_ndf['Non-Null Count']<=5000]
ri_lst = list(remove_indicator['Indicator Name'])

filter_df2 = filter_df1[~filter_df1['Indicator Name'].isin(ri_lst)]
filter_df2.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Indicator Name,Year,Indicator Value
1148160,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1980,0.0
1148161,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1981,0.0
1148162,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1982,0.0
1148163,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1983,0.0
1148164,AFG,Afghanistan,South Asia,Low income,A woman can apply for a passport in the same way as a man (1=yes; 0=no),1984,0.0


In [20]:
filter_df2.shape

(923640, 7)

In [24]:
new_df = filter_df2[
        (filter_df2['Indicator Name']=="School enrollment, primary (gross), gender parity index (GPI)")|
        (filter_df2['Indicator Name']=="School enrollment, primary, female (% gross)")|
        (filter_df2['Indicator Name']=="Labor force, female (% of total labor force)")|
        (filter_df2['Indicator Name']=="Ratio of female to male labor force participation rate (%) (modeled ILO estimate)")|
        (filter_df2['Indicator Name']=="GDP per capita (Current US$)")|
        (filter_df2['Indicator Name']=="Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)")|
        (filter_df2['Indicator Name']=="Law prohibits discrimination in employment based on gender (1=yes; 0=no)")
       ]

In [25]:
new_df['Indicator Name'].unique()

array(['GDP per capita (Current US$)',
       'Labor force, female (% of total labor force)',
       'Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)',
       'Law prohibits discrimination in employment based on gender (1=yes; 0=no)',
       'Ratio of female to male labor force participation rate (%) (modeled ILO estimate)',
       'School enrollment, primary (gross), gender parity index (GPI)',
       'School enrollment, primary, female (% gross)'], dtype=object)

In [26]:
ind_pivot = new_df.pivot_table('Indicator Value', ['Country Code','Country Name','Region','Income Group','Year'], 'Indicator Name')
ind_pivot.reset_index(inplace=True)
ind_pivot.rename_axis(None, axis=1, inplace=True)
ind_pivot.head(2)

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,GDP per capita (Current US$),"Labor force, female (% of total labor force)",Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no),Law prohibits discrimination in employment based on gender (1=yes; 0=no),Ratio of female to male labor force participation rate (%) (modeled ILO estimate),"School enrollment, primary (gross), gender parity index (GPI)","School enrollment, primary, female (% gross)"
0,AFG,Afghanistan,South Asia,Low income,1980,272.655286,,0.0,0.0,,0.22998,16.289379
1,AFG,Afghanistan,South Asia,Low income,1981,264.111317,,0.0,0.0,,0.23702,18.03261


In [27]:
ind_pivot.rename({
'GDP per capita (Current US$)':'GDP per capita (US$)',
'Labor force, female (% of total labor force)':'Female Labor Force (%)',
'Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)':'Equal Remuneration for Females and Males',
'Law prohibits discrimination in employment based on gender (1=yes; 0=no)':'Law Prohibits Gender Discrimination in Employment',
'Ratio of female to male labor force participation rate (%) (modeled ILO estimate)':'Female to Male Labor Force Participation Rate (%)',
'School enrollment, primary (gross), gender parity index (GPI)':'Primary School Enrollment GPI',
'School enrollment, primary, female (% gross)':'Female Primary School Enrollment (%)',
}, axis=1, inplace=True)

ind_pivot.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males,Law Prohibits Gender Discrimination in Employment,Female to Male Labor Force Participation Rate (%),Primary School Enrollment GPI,Female Primary School Enrollment (%)
0,AFG,Afghanistan,South Asia,Low income,1980,272.655286,,0.0,0.0,,0.22998,16.289379
1,AFG,Afghanistan,South Asia,Low income,1981,264.111317,,0.0,0.0,,0.23702,18.03261
2,AFG,Afghanistan,South Asia,Low income,1982,,,0.0,0.0,,0.50285,11.49034
3,AFG,Afghanistan,South Asia,Low income,1983,,,0.0,0.0,,,
4,AFG,Afghanistan,South Asia,Low income,1984,,,0.0,0.0,,0.47464,14.01731


In [28]:
ind_pivot.shape

(7160, 12)

In [29]:
ind_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 12 columns):
Country Code                                         7160 non-null object
Country Name                                         7160 non-null object
Region                                               7160 non-null object
Income Group                                         7160 non-null object
Year                                                 7160 non-null int64
GDP per capita (US$)                                 6584 non-null float64
Female Labor Force (%)                               5329 non-null float64
Equal Remuneration for Females and Males             7120 non-null float64
Law Prohibits Gender Discrimination in Employment    7120 non-null float64
Female to Male Labor Force Participation Rate (%)    5340 non-null float64
Primary School Enrollment GPI                        5480 non-null float64
Female Primary School Enrollment (%)                 5480 non-null float64

In [31]:
clean_df = ind_pivot.dropna(how='any',axis=0)
clean_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males,Law Prohibits Gender Discrimination in Employment,Female to Male Labor Force Participation Rate (%),Primary School Enrollment GPI,Female Primary School Enrollment (%)
22,AFG,Afghanistan,South Asia,Low income,2002,179.426611,15.201361,0.0,0.0,19.196412,0.45693,42.82885
23,AFG,Afghanistan,South Asia,Low income,2003,190.683814,15.353224,0.0,0.0,19.499872,0.5622,66.69796
24,AFG,Afghanistan,South Asia,Low income,2004,211.382117,15.513016,0.0,0.0,19.784493,0.43282,62.591
25,AFG,Afghanistan,South Asia,Low income,2005,242.031285,15.685935,0.0,0.0,20.046942,0.58334,71.812073
26,AFG,Afghanistan,South Asia,Low income,2006,263.733692,15.534228,0.0,0.0,19.713375,0.62357,77.668404


In [32]:
final_df = clean_df.reset_index(drop=True)
final_df.head()

Unnamed: 0,Country Code,Country Name,Region,Income Group,Year,GDP per capita (US$),Female Labor Force (%),Equal Remuneration for Females and Males,Law Prohibits Gender Discrimination in Employment,Female to Male Labor Force Participation Rate (%),Primary School Enrollment GPI,Female Primary School Enrollment (%)
0,AFG,Afghanistan,South Asia,Low income,2002,179.426611,15.201361,0.0,0.0,19.196412,0.45693,42.82885
1,AFG,Afghanistan,South Asia,Low income,2003,190.683814,15.353224,0.0,0.0,19.499872,0.5622,66.69796
2,AFG,Afghanistan,South Asia,Low income,2004,211.382117,15.513016,0.0,0.0,19.784493,0.43282,62.591
3,AFG,Afghanistan,South Asia,Low income,2005,242.031285,15.685935,0.0,0.0,20.046942,0.58334,71.812073
4,AFG,Afghanistan,South Asia,Low income,2006,263.733692,15.534228,0.0,0.0,19.713375,0.62357,77.668404


In [33]:
final_df.shape

(4110, 12)