In [1]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np
import datetime
import wbdata

In [2]:
countries = ["AFG","ALB","DZA","ASM","AND","AGO","ATG","ARG","ARM","ABW","AUS","AUT","AZE","BHS","BHR","BGD","BRB","BLR","BEL","BLZ","BEN","BMU","BTN","BOL","BIH","BWA","BRA","VGB","BRN","BGR","BFA","BDI","CPV","KHM","CMR","CAN","CYM","CAF","TCD","CHI","CHL","CHN","COL","COM","COD","COG","CRI","CIV","HRV","CUB","CUW","CYP","CZE","DNK","DJI","DMA","DOM","ECU","EGY","SLV","GNQ","ERI","EST","SWZ","ETH","FRO","FJI","FIN","FRA","PYF","GAB","GMB","GEO","DEU","GHA","GIB","GRC","GRL","GRD","GUM","GTM","GIN","GNB","GUY","HTI","HND","HKG","HUN","ISL","IND","IDN","IRN","IRQ","IRL","IMN","ISR","ITA","JAM","JPN","JOR","KAZ","KEN","KIR","PRK","KOR","XKX","KWT","KGZ","LAO","LVA","LBN","LSO","LBR","LBY","LIE","LTU","LUX","MAC","MDG","MWI","MYS","MDV","MLI","MLT","MHL","MRT","MUS","MEX","FSM","MDA","MCO","MNG","MNE","MAR","MOZ","MMR","NAM","NRU","NPL","NLD","NCL","NZL","NIC","NER","NGA","MKD","MNP","NOR","OMN","PAK","PLW","PAN","PNG","PRY","PER","PHL","POL","PRT","PRI","QAT","ROU","RUS","RWA","WSM","SMR","STP","SAU","SEN","SRB","SYC","SLE","SGP","SXM","SVK","SVN","SLB","SOM","ZAF","SSD","ESP","LKA","KNA","LCA","MAF","VCT","SDN","SUR","SWE","CHE","SYR","TWN","TJK","TZA","THA","TLS","TGO","TON","TTO","TUN","TUR","TKM","TCA","TUV","UGA","UKR","ARE","GBR","USA","URY","UZB","VUT","VEN","VNM","VIR","PSE","YEM","ZMB","ZWE"]

In [3]:
indicators = {"SP.DYN.TFRT.IN": "fert", "NY.GDP.PCAP.KD": "gdppc", "SP.POP.TOTL": "pop", "SL.AGR.EMPL.ZS": "agr_sh", "SL.SRV.EMPL.ZS": "ser_sh"}
df = wbdata.get_dataframe(indicators, country=countries)

In [4]:
wbdata.get_incomelevel()

id    value
----  -------------------
HIC   High income
INX   Not classified
LIC   Low income
LMC   Lower middle income
LMY   Low & middle income
MIC   Middle income
UMC   Upper middle income

In [5]:
# Extract data for low, medium, and high income countries
hicc = [i['id'] for i in wbdata.get_country(incomelevel='HIC')]
licc = [i['id'] for i in wbdata.get_country(incomelevel='LIC')]
micc = [i['id'] for i in wbdata.get_country(incomelevel='MIC')]

# Extract dataframes
dfhic = wbdata.get_dataframe(indicators, country = hicc)
dflic = wbdata.get_dataframe(indicators, country = licc)
dfmic = wbdata.get_dataframe(indicators, country = micc)

In [6]:
# Manage indices
df.reset_index(inplace = True, level = ['date'])
df['year'] = pd.to_numeric(df['date'])

dfhic.reset_index(inplace = True, level = ['date'])
dfhic['year'] = pd.to_numeric(dfhic['date'])

dfmic.reset_index(inplace = True, level = ['date'])
dfmic['year'] = pd.to_numeric(dfmic['date'])

dflic.reset_index(inplace = True, level = ['date'])
dflic['year'] = pd.to_numeric(dflic['date'])

# Add decade indicator
def decade(yr):
    if 1960 <= yr <= 1969: dec   = '1960s'
    elif 1970 <= yr <= 1979: dec = '1970s'
    elif 1980 <= yr <= 1989: dec = '1980s'
    elif 1990 <= yr <= 1999: dec = '1990s'
    elif 2000 <= yr <= 2009: dec = '2000s'
    elif 2010 <= yr <= 2019: dec = '2010s'
    else: 
        dec = 'NA'
    return(dec)


df['decade'] = df['year'].apply(decade)
df.set_index('date', append = True)
print(df.tail())

dfhic['decade'] = dfhic['year'].apply(decade)
dfhic.set_index('date', append = True)

dfmic['decade'] = dfmic['year'].apply(decade)
dfmic.set_index('date', append = True)

dflic['decade'] = dflic['year'].apply(decade)
dflic.set_index('date', append = True)

          date   fert        gdppc        pop  agr_sh  ser_sh  year decade
country                                                                   
Zimbabwe  1964  7.347   984.668540  4322861.0     NaN     NaN  1964  1960s
Zimbabwe  1963  7.311  1030.026148  4178726.0     NaN     NaN  1963  1960s
Zimbabwe  1962  7.267  1002.976733  4039201.0     NaN     NaN  1962  1960s
Zimbabwe  1961  7.215  1022.765185  3905034.0     NaN     NaN  1961  1960s
Zimbabwe  1960  7.158   994.697855  3776681.0     NaN     NaN  1960  1960s


Unnamed: 0_level_0,Unnamed: 1_level_0,fert,gdppc,pop,agr_sh,ser_sh,year,decade
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,2020,,,,42.352001,39.396000,2020,
Afghanistan,2019,,573.287551,38041754.0,42.839001,38.999001,2019,2010s
Afghanistan,2018,4.473,564.609992,37172386.0,43.383999,38.536999,2018,2010s
Afghanistan,2017,4.633,571.440691,36296400.0,43.988998,37.985001,2017,2010s
Afghanistan,2016,4.800,571.073775,35383128.0,45.803001,36.868999,2016,2010s
...,...,...,...,...,...,...,...,...
"Yemen, Rep.",1964,8.067,,5641597.0,,,1964,1960s
"Yemen, Rep.",1963,8.026,,5556766.0,,,1963,1960s
"Yemen, Rep.",1962,7.991,,5473671.0,,,1962,1960s
"Yemen, Rep.",1961,7.962,,5393036.0,,,1961,1960s


In [7]:
# Drop missing values from all datasets
df = df.dropna()
dfhic = dfhic.dropna()
dfmic = dfmic.dropna()
dflic = dflic.dropna()

In [7]:
print(df[df['year'] >= 1999].tail())

          date   fert        gdppc         pop     agr_sh     ser_sh  year  \
country                                                                      
Zimbabwe  2003  3.725  1102.230755  11982224.0  63.042999  26.705000  2003   
Zimbabwe  2002  3.718  1331.013034  11954290.0  61.898998  27.214001  2002   
Zimbabwe  2001  3.725  1464.672049  11923914.0  60.949001  27.591999  2001   
Zimbabwe  2000  3.748  1449.042767  11881477.0  60.613998  27.837999  2000   
Zimbabwe  1999  3.786  1502.199532  11822719.0  60.029999  28.139999  1999   

         decade  
country          
Zimbabwe  2000s  
Zimbabwe  2000s  
Zimbabwe  2000s  
Zimbabwe  2000s  
Zimbabwe  1990s  


In [8]:
df.to_csv('../../data/wb-data.csv')
dfhic.to_csv('../../data/wb-data-hic.csv')
dfmic.to_csv('../../data/wb-data-mic.csv')
dflic.to_csv('../../data/wb-data-lic.csv')