# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [2]:
import pandas as pd

## imports for Python, Pandas

In [3]:
import json
from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from nested json string
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [4]:
# define json string
data = [{'state': 'Florida', 
         'shortname': 'FL',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}]},
        {'state': 'Ohio',
         'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}]}]

In [5]:
# use normalization to create tables from nested element
json_normalize(data, 'counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [6]:
# further populate tables created from nested element
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


****
## JSON example, with file

+ demonstrates reading in a json file as a string and as a table
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [75]:
# load json as string
json.load((open('world_bank_projects_less.json')))

[{'_id': {'$oid': '52b213b38594d8a2be17c780'},
  'approvalfy': 1999,
  'board_approval_month': 'November',
  'boardapprovaldate': '2013-11-12T00:00:00Z',
  'borrower': 'FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA',
  'closingdate': '2018-07-07T00:00:00Z',
  'country_namecode': 'Federal Democratic Republic of Ethiopia!$!ET',
  'countrycode': 'ET',
  'countryname': 'Federal Democratic Republic of Ethiopia',
  'countryshortname': 'Ethiopia',
  'docty': 'Project Information Document,Indigenous Peoples Plan,Project Information Document',
  'envassesmentcategorycode': 'C',
  'grantamt': 0,
  'ibrdcommamt': 0,
  'id': 'P129828',
  'idacommamt': 130000000,
  'impagency': 'MINISTRY OF EDUCATION',
  'lendinginstr': 'Investment Project Financing',
  'lendinginstrtype': 'IN',
  'lendprojectcost': 550000000,
  'majorsector_percent': [{'Name': 'Education', 'Percent': 46},
   {'Name': 'Education', 'Percent': 26},
   {'Name': 'Public Administration, Law, and Justice', 'Percent': 16},
   {'Name': 'Educatio

In [8]:
# load as Pandas dataframe
sample_json_df = pd.read_json('world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Name': 'Education for all', 'Percent': 100}","[{'name': 'Education for all', 'code': '65'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Name': 'Other economic management', 'Percent...","[{'name': 'Other economic management', 'code':...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [73]:
#Import Libraries

import pandas as pd

import json

import pandas_profiling


In [74]:
#Read World bank json file
df = pd.read_json('world_bank_projects.json')

## Exploratory Data Analysis

In [11]:
#Pandas Profiling
profile = pandas_profiling.ProfileReport(df)
profile

0,1
Number of variables,50
Number of observations,500
Total Missing (%),4.8%
Total size in memory,195.4 KiB
Average record size in memory,400.2 B

0,1
Numeric,6
Categorical,23
Boolean,0
Date,0
Text (Unique),3
Rejected,3
Unsupported,15

Unsupported value

0,1
Distinct count,4
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2013.1
Minimum,1999
Maximum,2015
Zeros (%),0.0%

0,1
Minimum,1999
5-th percentile,2013
Q1,2013
Median,2013
Q3,2013
95-th percentile,2014
Maximum,2015
Range,16
Interquartile range,0

0,1
Standard deviation,0.72207
Coef of variation,0.00035868
Kurtosis,292.97
Mean,2013.1
MAD,0.24306
Skewness,-14.724
Sum,1006554
Variance,0.52138
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
2013,432,86.4%,
2014,66,13.2%,
2015,1,0.2%,
1999,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
1999,1,0.2%,
2013,432,86.4%,
2014,66,13.2%,
2015,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
1999,1,0.2%,
2013,432,86.4%,
2014,66,13.2%,
2015,1,0.2%,

0,1
Distinct count,12
Unique (%),2.4%
Missing (%),0.0%
Missing (n),0

0,1
June,65
September,64
May,55
Other values (9),316

Value,Count,Frequency (%),Unnamed: 3
June,65,13.0%,
September,64,12.8%,
May,55,11.0%,
March,55,11.0%,
October,48,9.6%,
November,45,9.0%,
April,32,6.4%,
December,29,5.8%,
August,28,5.6%,
July,27,5.4%,

0,1
Distinct count,196
Unique (%),39.2%
Missing (%),0.0%
Missing (n),0

0,1
2012-09-27T00:00:00Z,8
2013-02-28T00:00:00Z,8
2012-09-20T00:00:00Z,8
Other values (193),476

Value,Count,Frequency (%),Unnamed: 3
2012-09-27T00:00:00Z,8,1.6%,
2013-02-28T00:00:00Z,8,1.6%,
2012-09-20T00:00:00Z,8,1.6%,
2013-03-20T00:00:00Z,7,1.4%,
2013-03-27T00:00:00Z,7,1.4%,
2013-05-31T00:00:00Z,7,1.4%,
2012-12-13T00:00:00Z,7,1.4%,
2012-10-23T00:00:00Z,7,1.4%,
2013-04-26T00:00:00Z,7,1.4%,
2013-06-27T00:00:00Z,7,1.4%,

0,1
Distinct count,293
Unique (%),58.6%
Missing (%),3.0%
Missing (n),15

0,1
MINISTRY OF FINANCE,30
GOVERNMENT OF INDIA,13
SOCIALIST REPUBLIC OF VIETNAM,10
Other values (289),432
(Missing),15

Value,Count,Frequency (%),Unnamed: 3
MINISTRY OF FINANCE,30,6.0%,
GOVERNMENT OF INDIA,13,2.6%,
SOCIALIST REPUBLIC OF VIETNAM,10,2.0%,
PEOPLE'S REPUBLIC OF CHINA,8,1.6%,
GOVERNMENT OF NEPAL,8,1.6%,
REPUBLIC OF INDONESIA,7,1.4%,
GOVERNMENT OF INDONESIA,6,1.2%,
REPUBLIC OF YEMEN,5,1.0%,
REPUBLIC OF UZBEKISTAN,5,1.0%,
GOVERNMENT OF BANGLADESH,5,1.0%,

0,1
Distinct count,115
Unique (%),23.0%
Missing (%),26.0%
Missing (n),130

0,1
2018-12-31T00:00:00Z,36
2014-06-30T00:00:00Z,21
2013-12-31T00:00:00Z,19
Other values (111),294
(Missing),130

Value,Count,Frequency (%),Unnamed: 3
2018-12-31T00:00:00Z,36,7.2%,
2014-06-30T00:00:00Z,21,4.2%,
2013-12-31T00:00:00Z,19,3.8%,
2017-12-31T00:00:00Z,19,3.8%,
2017-06-30T00:00:00Z,15,3.0%,
2018-06-30T00:00:00Z,15,3.0%,
2014-12-31T00:00:00Z,12,2.4%,
2015-06-30T00:00:00Z,11,2.2%,
2015-12-31T00:00:00Z,10,2.0%,
2019-06-30T00:00:00Z,10,2.0%,

0,1
Distinct count,118
Unique (%),23.6%
Missing (%),0.0%
Missing (n),0

0,1
People's Republic of China!$!CN,19
Republic of Indonesia!$!ID,19
Socialist Republic of Vietnam!$!VN,17
Other values (115),445

Value,Count,Frequency (%),Unnamed: 3
People's Republic of China!$!CN,19,3.8%,
Republic of Indonesia!$!ID,19,3.8%,
Socialist Republic of Vietnam!$!VN,17,3.4%,
Republic of India!$!IN,16,3.2%,
Republic of Yemen!$!RY,13,2.6%,
Kingdom of Morocco!$!MA,12,2.4%,
People's Republic of Bangladesh!$!BD,12,2.4%,
Nepal!$!NP,12,2.4%,
Africa!$!3A,11,2.2%,
Republic of Mozambique!$!MZ,11,2.2%,

0,1
Distinct count,118
Unique (%),23.6%
Missing (%),0.0%
Missing (n),0

0,1
CN,19
ID,19
VN,17
Other values (115),445

Value,Count,Frequency (%),Unnamed: 3
CN,19,3.8%,
ID,19,3.8%,
VN,17,3.4%,
IN,16,3.2%,
RY,13,2.6%,
BD,12,2.4%,
NP,12,2.4%,
MA,12,2.4%,
MZ,11,2.2%,
3A,11,2.2%,

0,1
Distinct count,118
Unique (%),23.6%
Missing (%),0.0%
Missing (n),0

0,1
Republic of Indonesia,19
People's Republic of China,19
Socialist Republic of Vietnam,17
Other values (115),445

Value,Count,Frequency (%),Unnamed: 3
Republic of Indonesia,19,3.8%,
People's Republic of China,19,3.8%,
Socialist Republic of Vietnam,17,3.4%,
Republic of India,16,3.2%,
Republic of Yemen,13,2.6%,
Kingdom of Morocco,12,2.4%,
People's Republic of Bangladesh,12,2.4%,
Nepal,12,2.4%,
Africa,11,2.2%,
Republic of Mozambique,11,2.2%,

0,1
Distinct count,118
Unique (%),23.6%
Missing (%),0.0%
Missing (n),0

0,1
Indonesia,19
China,19
Vietnam,17
Other values (115),445

Value,Count,Frequency (%),Unnamed: 3
Indonesia,19,3.8%,
China,19,3.8%,
Vietnam,17,3.4%,
India,16,3.2%,
"Yemen, Republic of",13,2.6%,
Morocco,12,2.4%,
Nepal,12,2.4%,
Bangladesh,12,2.4%,
Mozambique,11,2.2%,
Africa,11,2.2%,

0,1
Distinct count,393
Unique (%),78.6%
Missing (%),10.8%
Missing (n),54

0,1
"Project Paper,Integrated Safeguards Data Sheet,Project Information Document",7
Implementation Status and Results Report,7
Integrated Safeguards Data Sheet,6
Other values (389),426
(Missing),54

Value,Count,Frequency (%),Unnamed: 3
"Project Paper,Integrated Safeguards Data Sheet,Project Information Document",7,1.4%,
Implementation Status and Results Report,7,1.4%,
Integrated Safeguards Data Sheet,6,1.2%,
"Program Document,Program Information Document",6,1.2%,
Project Information Document,5,1.0%,
"Program Document,Project Information Document,Project Information Document",4,0.8%,
"Disbursement Letter,Grant or Trust Fund Agreement",4,0.8%,
"Integrated Safeguards Data Sheet,Project Information Document",4,0.8%,
"Implementation Status and Results Report,Project Appraisal Document,Project Information Document,Integrated Safeguards Data Sheet,Environmental Assessment,Integrated Safeguards Data Sheet,Project Information Document",3,0.6%,
"Program Document,Summary of Discussion",2,0.4%,

0,1
Distinct count,6
Unique (%),1.2%
Missing (%),14.0%
Missing (n),70

0,1
B,241
C,136
A,37
Other values (2),16
(Missing),70

Value,Count,Frequency (%),Unnamed: 3
B,241,48.2%,
C,136,27.2%,
A,37,7.4%,
F,13,2.6%,
U,3,0.6%,
(Missing),70,14.0%,

0,1
Distinct count,132
Unique (%),26.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,4432400
Minimum,0
Maximum,365000000
Zeros (%),63.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1695000
95-th percentile,18221000
Maximum,365000000
Range,365000000
Interquartile range,1695000

0,1
Standard deviation,20233000
Coef of variation,4.5648
Kurtosis,205.98
Mean,4432400
MAD,6786900
Skewness,12.517
Sum,2216200000
Variance,409380000000000
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0,317,63.4%,
3000000,7,1.4%,
200000,6,1.2%,
1000000,5,1.0%,
10000000,4,0.8%,
300000,4,0.8%,
350000,4,0.8%,
8000000,4,0.8%,
2000000,3,0.6%,
900000,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0,317,63.4%,
30000,3,0.6%,
50000,1,0.2%,
100000,1,0.2%,
150000,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
76500000,1,0.2%,
84600000,1,0.2%,
85400000,1,0.2%,
100000000,2,0.4%,
365000000,1,0.2%,

0,1
Distinct count,57
Unique (%),11.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,32860000
Minimum,0
Maximum,1307800000
Zeros (%),79.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,201580000
Maximum,1307800000
Range,1307800000
Interquartile range,0

0,1
Standard deviation,108920000
Coef of variation,3.3146
Kurtosis,48.917
Mean,32860000
MAD,53536000
Skewness,5.945
Sum,16430050000
Variance,1.1863e+16
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0,399,79.8%,
100000000,12,2.4%,
150000000,6,1.2%,
80000000,5,1.0%,
50000000,5,1.0%,
200000000,5,1.0%,
300000000,5,1.0%,
30000000,3,0.6%,
20000000,3,0.6%,
40000000,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0,399,79.8%,
6400000,1,0.2%,
7000000,2,0.4%,
10000000,3,0.6%,
15000000,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
585400000,1,0.2%,
600000000,1,0.2%,
650000000,1,0.2%,
800000000,1,0.2%,
1307800000,1,0.2%,

First 3 values
P128644
P121917
P128284

Last 3 values
P129381
P143841
P124761

Value,Count,Frequency (%),Unnamed: 3
P075941,1,0.2%,
P085621,1,0.2%,
P086592,1,0.2%,
P094183,1,0.2%,
P095003,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
P146125,1,0.2%,
P146161,1,0.2%,
P146271,1,0.2%,
P146653,1,0.2%,
P147689,1,0.2%,

0,1
Distinct count,94
Unique (%),18.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,35421000
Minimum,0
Maximum,600000000
Zeros (%),56.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,37000000
95-th percentile,181000000
Maximum,600000000
Range,600000000
Interquartile range,37000000

0,1
Standard deviation,76814000
Coef of variation,2.1686
Kurtosis,15.449
Mean,35421000
MAD,46781000
Skewness,3.5733
Sum,17710680000
Variance,5900400000000000
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0,281,56.2%,
50000000,21,4.2%,
100000000,15,3.0%,
20000000,12,2.4%,
10000000,9,1.8%,
70000000,8,1.6%,
40000000,8,1.6%,
5000000,7,1.4%,
25000000,7,1.4%,
30000000,7,1.4%,

Value,Count,Frequency (%),Unnamed: 3
0,281,56.2%,
1800000,1,0.2%,
2000000,2,0.4%,
2150000,1,0.2%,
3000000,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
415000000,1,0.2%,
440000000,1,0.2%,
448900000,1,0.2%,
500000000,1,0.2%,
600000000,1,0.2%,

0,1
Distinct count,375
Unique (%),75.0%
Missing (%),5.6%
Missing (n),28

0,1
MINISTRY OF FINANCE,41
MINISTRY OF EDUCATION,13
MINISTRY OF HEALTH,9
Other values (371),409
(Missing),28

Value,Count,Frequency (%),Unnamed: 3
MINISTRY OF FINANCE,41,8.2%,
MINISTRY OF EDUCATION,13,2.6%,
MINISTRY OF HEALTH,9,1.8%,
MINISTRY OF ECONOMY AND FINANCE,7,1.4%,
MINISTRY OF PUBLIC WORKS,5,1.0%,
MINISTRY OF AGRICULTURE,5,1.0%,
MINISTRY OF FINANCE AND ECONOMIC DEVELOPMENT,4,0.8%,
GEF SECRETARIAT,3,0.6%,
MINISTRY OF TRANSPORT AND COMMUNICATIONS,3,0.6%,
MINISTRY OF EDUCATION AND TRAINING,3,0.6%,

0,1
Distinct count,10
Unique (%),2.0%
Missing (%),1.0%
Missing (n),5

0,1
Specific Investment Loan,243
Technical Assistance Loan,78
Development Policy Lending,68
Other values (6),106

Value,Count,Frequency (%),Unnamed: 3
Specific Investment Loan,243,48.6%,
Technical Assistance Loan,78,15.6%,
Development Policy Lending,68,13.6%,
Investment Project Financing,57,11.4%,
Emergency Recovery Loan,22,4.4%,
Adaptable Program Loan,11,2.2%,
Financial Intermediary Loan,7,1.4%,
Program-for-Results,6,1.2%,
Sector Investment and Maintenance Loan,3,0.6%,
(Missing),5,1.0%,

0,1
Distinct count,4
Unique (%),0.8%
Missing (%),1.0%
Missing (n),5

0,1
IN,421
AD,68
PR,6
(Missing),5

Value,Count,Frequency (%),Unnamed: 3
IN,421,84.2%,
AD,68,13.6%,
PR,6,1.2%,
(Missing),5,1.0%,

0,1
Distinct count,320
Unique (%),64.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,154720000
Minimum,30000
Maximum,5170000000
Zeros (%),0.0%

0,1
Minimum,30000
5-th percentile,350000
Q1,6472500
Median,35000000
Q3,102120000
95-th percentile,513780000
Maximum,5170000000
Range,5169970000
Interquartile range,95652000

0,1
Standard deviation,476420000
Coef of variation,3.0792
Kurtosis,62.101
Mean,154720000
MAD,192010000
Skewness,7.2332
Sum,77362040000
Variance,2.2698e+17
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
50000000,16,3.2%,
100000000,15,3.0%,
20000000,13,2.6%,
70000000,8,1.6%,
5000000,8,1.6%,
10000000,7,1.4%,
30000000,7,1.4%,
3000000,7,1.4%,
40000000,6,1.2%,
25000000,5,1.0%,

Value,Count,Frequency (%),Unnamed: 3
30000,3,0.6%,
50000,1,0.2%,
100000,1,0.2%,
150000,1,0.2%,
180000,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
2672000000,1,0.2%,
2700000000,1,0.2%,
4500000000,1,0.2%,
4887090000,1,0.2%,
5170000000,1,0.2%,

Unsupported value

Unsupported value

Unsupported value

Unsupported value

0,1
Distinct count,312
Unique (%),62.4%
Missing (%),0.0%
Missing (n),0

0,1
1111,17
811,12
88,11
Other values (309),460

Value,Count,Frequency (%),Unnamed: 3
1111,17,3.4%,
811,12,2.4%,
88,11,2.2%,
66,8,1.6%,
44,7,1.4%,
114,7,1.4%,
87,6,1.2%,
222,6,1.2%,
1110,6,1.2%,
45,5,1.0%,

0,1
Distinct count,8
Unique (%),1.6%
Missing (%),0.0%
Missing (n),0

0,1
PE,314
RE,140
GE,27
Other values (5),19

Value,Count,Frequency (%),Unnamed: 3
PE,314,62.8%,
RE,140,28.0%,
GE,27,5.4%,
SF,5,1.0%,
GM,5,1.0%,
MT,4,0.8%,
GU,3,0.6%,
CN,2,0.4%,

0,1
Distinct count,8
Unique (%),1.6%
Missing (%),0.0%
Missing (n),0

0,1
IBRD/IDA,314
Recipient Executed Activities,140
Global Environment Project,27
Other values (5),19

Value,Count,Frequency (%),Unnamed: 3
IBRD/IDA,314,62.8%,
Recipient Executed Activities,140,28.0%,
Global Environment Project,27,5.4%,
Special Financing,5,1.0%,
GEF Medium Sized Program,5,1.0%,
Montreal Protocol,4,0.8%,
Guarantees,3,0.6%,
Carbon Offset,2,0.4%,

0,1
Constant value,L

Unsupported value

First 3 values
Jordan Ozone Depleting Substances HCFC Phase-O...
Road Rehabilitation and Maintenance Program
Afghanistan - Second Skills Development Project

Last 3 values
Road Asset Management Project - Additional Fin...
Competitiveness DPL
REAL ESTATE REGISTRATION PROJECT

Value,Count,Frequency (%),Unnamed: 3
5M: Displaced People in Jordan / Lebanon,1,0.2%,
AF - Clean-up & Land Reclamation Project,1,0.2%,
AF - HP Mid-Himalayan Watershed Development Project,1,0.2%,
AF Infrastructure & Institutions Emergency Recovery,1,0.2%,
AF-Forest and Adjacent Land Management,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
Youth Employment,1,0.2%,
ZR SUPPORT TO BASIC EDUCATION PROGRAM,1,0.2%,
Zambia - Extractive Industries Transparency Initiative Implementation Post Compliance I,1,0.2%,
Zambia Strengthening Climate Resilience (PPCR Phase II),1,0.2%,
Zambia Water Resources Development Project,1,0.2%,

Unsupported value

0,1
Distinct count,3
Unique (%),0.6%
Missing (%),0.0%
Missing (n),0

0,1
IDA,216
OTHER,183
IBRD,101

Value,Count,Frequency (%),Unnamed: 3
IDA,216,43.2%,
OTHER,183,36.6%,
IBRD,101,20.2%,

0,1
Distinct count,2
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0

0,1
Active,438
Closed,62

Value,Count,Frequency (%),Unnamed: 3
Active,438,87.6%,
Closed,62,12.4%,

0,1
Distinct count,7
Unique (%),1.4%
Missing (%),0.0%
Missing (n),0

0,1
Africa,152
East Asia and Pacific,100
Europe and Central Asia,74
Other values (4),174

Value,Count,Frequency (%),Unnamed: 3
Africa,152,30.4%,
East Asia and Pacific,100,20.0%,
Europe and Central Asia,74,14.8%,
South Asia,65,13.0%,
Middle East and North Africa,54,10.8%,
Latin America and Caribbean,53,10.6%,
Other,2,0.4%,

Unsupported value

Unsupported value

Unsupported value

Unsupported value

Unsupported value

Unsupported value

0,1
Distinct count,373
Unique (%),74.6%
Missing (%),0.0%
Missing (n),0

0,1
JB,13
BC,12
"BQ,JA",11
Other values (370),464

Value,Count,Frequency (%),Unnamed: 3
JB,13,2.6%,
BC,12,2.4%,
"BQ,JA",11,2.2%,
BZ,9,1.8%,
"BV,TI",7,1.4%,
LS,7,1.4%,
TI,7,1.4%,
LR,6,1.2%,
"BS,JB",6,1.2%,
JA,5,1.0%,

0,1
Constant value,IBRD

0,1
Distinct count,2
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0

0,1
Active,438
Closed,62

Value,Count,Frequency (%),Unnamed: 3
Active,438,87.6%,
Closed,62,12.4%,

0,1
Distinct count,3
Unique (%),0.6%
Missing (%),0.4%
Missing (n),2

0,1
N,409
Y,89
(Missing),2

Value,Count,Frequency (%),Unnamed: 3
N,409,81.8%,
Y,89,17.8%,
(Missing),2,0.4%,

Unsupported value

Unsupported value

0,1
Distinct count,392
Unique (%),78.4%
Missing (%),1.8%
Missing (n),9

0,1
65,13
81,13
27,9
Other values (388),456

Value,Count,Frequency (%),Unnamed: 3
65,13,2.6%,
81,13,2.6%,
27,9,1.8%,
30,9,1.8%,
78,7,1.4%,
52,6,1.2%,
41,5,1.0%,
8284,5,1.0%,
91,5,1.0%,
86,4,0.8%,

0,1
Distinct count,122
Unique (%),24.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,68281000
Minimum,0
Maximum,1307800000
Zeros (%),36.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,20000000
Q3,86250000
95-th percentile,300000000
Maximum,1307800000
Range,1307800000
Interquartile range,86250000

0,1
Standard deviation,124270000
Coef of variation,1.8199
Kurtosis,25.291
Mean,68281000
MAD,76819000
Skewness,4.0077
Sum,34140730000
Variance,1.5442e+16
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0,183,36.6%,
100000000,27,5.4%,
50000000,26,5.2%,
20000000,15,3.0%,
40000000,12,2.4%,
10000000,12,2.4%,
150000000,11,2.2%,
30000000,10,2.0%,
70000000,9,1.8%,
300000000,8,1.6%,

Value,Count,Frequency (%),Unnamed: 3
0,183,36.6%,
1800000,1,0.2%,
2000000,2,0.4%,
2150000,1,0.2%,
3000000,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
585400000,1,0.2%,
600000000,2,0.4%,
650000000,1,0.2%,
800000000,1,0.2%,
1307800000,1,0.2%,

0,1
Correlation,0.98668

First 3 values
http://www.worldbank.org/projects/P129663/afgh...
http://www.worldbank.org/projects/P128276/coas...
http://www.worldbank.org/projects/P132541/seco...

Last 3 values
http://www.worldbank.org/projects/P132268/soli...
http://www.worldbank.org/projects/P125022/cn-b...
http://www.worldbank.org/projects/P131138/enpi...

Value,Count,Frequency (%),Unnamed: 3
http://www.worldbank.org/projects/P075941/nelsap-regional-rusumo-falls-hydroelectric-multipurpose-project?lang=en,1,0.2%,
http://www.worldbank.org/projects/P085621/chile-sustainable-land-management-project?lang=en,1,0.2%,
http://www.worldbank.org/projects/P086592/second-irrigation-drainage-improvement-project?lang=en,1,0.2%,
http://www.worldbank.org/projects/P094183/agricultural-productivity-program-southern-africa-appsa?lang=en,1,0.2%,
http://www.worldbank.org/projects/P095003/ng-rural-access-mobility-project-phase-2?lang=en,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
http://www.worldbank.org/projects/P146125/rail-trade-transport-facilitation-af?lang=en,1,0.2%,
http://www.worldbank.org/projects/P146161?lang=en,1,0.2%,
http://www.worldbank.org/projects/P146271?lang=en,1,0.2%,
http://www.worldbank.org/projects/P146653?lang=en,1,0.2%,
http://www.worldbank.org/projects/P147689?lang=en,1,0.2%,

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,docty,envassesmentcategorycode,grantamt,ibrdcommamt,id,idacommamt,impagency,lendinginstr,lendinginstrtype,lendprojectcost,majorsector_percent,mjsector_namecode,mjtheme,mjtheme_namecode,mjthemecode,prodline,prodlinetext,productlinetype,project_abstract,project_name,projectdocs,projectfinancialtype,projectstatusdisplay,regionname,sector,sector1,sector2,sector3,sector4,sector_namecode,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,"Project Information Document,Indigenous People...",C,0,0,P129828,130000000,MINISTRY OF EDUCATION,Investment Project Financing,IN,550000000,"[{'Percent': 46, 'Name': 'Education'}, {'Perce...","[{'code': 'EX', 'name': 'Education'}, {'code':...",[Human development],"[{'code': '8', 'name': 'Human development'}, {...",811,PE,IBRD/IDA,L,{'cdata': 'The development objective of the Se...,Ethiopia General Education Quality Improvement...,"[{'DocDate': '28-AUG-2013', 'EntityID': '09022...",IDA,Active,Africa,"[{'Name': 'Primary education'}, {'Name': 'Seco...","{'Percent': 46, 'Name': 'Primary education'}","{'Percent': 26, 'Name': 'Secondary education'}","{'Percent': 16, 'Name': 'Public administration...","{'Percent': 12, 'Name': 'Tertiary education'}","[{'code': 'EP', 'name': 'Primary education'}, ...","ET,BS,ES,EP",IBRD,Active,N,"{'Percent': 100, 'Name': 'Education for all'}","[{'code': '65', 'name': 'Education for all'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,"Project Information Document,Integrated Safegu...",C,4700000,0,P144674,0,MINISTRY OF FINANCE,Specific Investment Loan,IN,5700000,"[{'Percent': 70, 'Name': 'Public Administratio...","[{'code': 'BX', 'name': 'Public Administration...","[Economic management, Social protection and ri...","[{'code': '1', 'name': 'Economic management'},...",16,RE,Recipient Executed Activities,L,,TN: DTF Social Protection Reforms Support,"[{'DocDate': '29-MAR-2013', 'EntityID': '00033...",OTHER,Active,Middle East and North Africa,[{'Name': 'Public administration- Other social...,"{'Percent': 70, 'Name': 'Public administration...","{'Percent': 30, 'Name': 'General public admini...",,,"[{'code': 'BS', 'name': 'Public administration...","BZ,BS",IBRD,Active,N,"{'Percent': 30, 'Name': 'Other economic manage...","[{'code': '24', 'name': 'Other economic manage...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en
2,{'$oid': '52b213b38594d8a2be17c782'},2014,November,2013-11-01T00:00:00Z,MINISTRY OF FINANCE AND ECONOMIC DEVEL,,Tuvalu!$!TV,TV,Tuvalu,Tuvalu,"Resettlement Plan,Environmental Assessment,Int...",B,0,0,P145310,6060000,MINISTRY OF TRANSPORT AND COMMUNICATIONS,Investment Project Financing,IN,6060000,"[{'Percent': 100, 'Name': 'Transportation'}]","[{'code': 'TX', 'name': 'Transportation'}]","[Trade and integration, Public sector governan...","[{'code': '5', 'name': 'Trade and integration'...",52116,PE,IBRD/IDA,L,,Tuvalu Aviation Investment Project - Additiona...,"[{'DocDate': '21-OCT-2013', 'EntityID': '00033...",IDA,Active,East Asia and Pacific,[{'Name': 'Rural and Inter-Urban Roads and Hig...,"{'Percent': 100, 'Name': 'Rural and Inter-Urba...",,,,"[{'code': 'TI', 'name': 'Rural and Inter-Urban...",TI,IBRD,Active,Y,"{'Percent': 46, 'Name': 'Regional integration'}","[{'code': '47', 'name': 'Regional integration'...",52812547,6060000,6060000,http://www.worldbank.org/projects/P145310?lang=en
3,{'$oid': '52b213b38594d8a2be17c783'},2014,October,2013-10-31T00:00:00Z,MIN. OF PLANNING AND INT'L COOPERATION,,Republic of Yemen!$!RY,RY,Republic of Yemen,"Yemen, Republic of","Procurement Plan,Project Information Document,...",C,1500000,0,P144665,0,LABOR INTENSIVE PUBLIC WORKS PROJECT PMU,Technical Assistance Loan,IN,1500000,"[{'Percent': 100, 'Name': 'Health and other so...","[{'code': 'JX', 'name': 'Health and other soci...","[Social dev/gender/inclusion, Social dev/gende...","[{'code': '7', 'name': 'Social dev/gender/incl...",77,RE,Recipient Executed Activities,L,,Gov't and Civil Society Organization Partnership,"[{'DocDate': '15-MAY-2013', 'EntityID': '00035...",OTHER,Active,Middle East and North Africa,[{'Name': 'Other social services'}],"{'Percent': 100, 'Name': 'Other social services'}",,,,"[{'code': 'JB', 'name': 'Other social services'}]",JB,IBRD,Active,N,"{'Percent': 50, 'Name': 'Participation and civ...","[{'code': '57', 'name': 'Participation and civ...",5957,0,1500000,http://www.worldbank.org/projects/P144665?lang=en
4,{'$oid': '52b213b38594d8a2be17c784'},2014,October,2013-10-31T00:00:00Z,MINISTRY OF FINANCE,2019-04-30T00:00:00Z,Kingdom of Lesotho!$!LS,LS,Kingdom of Lesotho,Lesotho,"Project Information Document,Integrated Safegu...",B,0,0,P144933,13100000,MINISTRY OF TRADE AND INDUSTRY,Investment Project Financing,IN,15000000,"[{'Percent': 50, 'Name': 'Industry and trade'}...","[{'code': 'YX', 'name': 'Industry and trade'},...","[Trade and integration, Financial and private ...","[{'code': '5', 'name': 'Trade and integration'...",54,PE,IBRD/IDA,L,{'cdata': 'The development objective of the Se...,Second Private Sector Competitiveness and Econ...,"[{'DocDate': '06-SEP-2013', 'EntityID': '09022...",IDA,Active,Africa,[{'Name': 'General industry and trade sector'}...,"{'Percent': 50, 'Name': 'General industry and ...","{'Percent': 40, 'Name': 'Other industry'}","{'Percent': 10, 'Name': 'SME Finance'}",,"[{'code': 'YZ', 'name': 'General industry and ...","FH,YW,YZ",IBRD,Active,N,"{'Percent': 30, 'Name': 'Export development an...","[{'code': '45', 'name': 'Export development an...",4145,13100000,13100000,http://www.worldbank.org/projects/P144933/seco...


In [12]:
#View Header Data
df.head()

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Percent': 100, 'Name': 'Education for all'}","[{'code': '65', 'name': 'Education for all'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Percent': 30, 'Name': 'Other economic manage...","[{'code': '24', 'name': 'Other economic manage...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en
2,{'$oid': '52b213b38594d8a2be17c782'},2014,November,2013-11-01T00:00:00Z,MINISTRY OF FINANCE AND ECONOMIC DEVEL,,Tuvalu!$!TV,TV,Tuvalu,Tuvalu,...,TI,IBRD,Active,Y,"{'Percent': 46, 'Name': 'Regional integration'}","[{'code': '47', 'name': 'Regional integration'...",52812547,6060000,6060000,http://www.worldbank.org/projects/P145310?lang=en
3,{'$oid': '52b213b38594d8a2be17c783'},2014,October,2013-10-31T00:00:00Z,MIN. OF PLANNING AND INT'L COOPERATION,,Republic of Yemen!$!RY,RY,Republic of Yemen,"Yemen, Republic of",...,JB,IBRD,Active,N,"{'Percent': 50, 'Name': 'Participation and civ...","[{'code': '57', 'name': 'Participation and civ...",5957,0,1500000,http://www.worldbank.org/projects/P144665?lang=en
4,{'$oid': '52b213b38594d8a2be17c784'},2014,October,2013-10-31T00:00:00Z,MINISTRY OF FINANCE,2019-04-30T00:00:00Z,Kingdom of Lesotho!$!LS,LS,Kingdom of Lesotho,Lesotho,...,"FH,YW,YZ",IBRD,Active,N,"{'Percent': 30, 'Name': 'Export development an...","[{'code': '45', 'name': 'Export development an...",4145,13100000,13100000,http://www.worldbank.org/projects/P144933/seco...


# Project Exercises

## 1. Find the ten countries with most projects

In [13]:
#View Data type of first column _id
type(df._id)

pandas.core.series.Series

In [14]:
#Unpack json dictionary: 
df._id = df._id.apply(pd.Series)

In [15]:
#View Header Data:
df.head()

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,52b213b38594d8a2be17c780,1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Percent': 100, 'Name': 'Education for all'}","[{'code': '65', 'name': 'Education for all'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,52b213b38594d8a2be17c781,2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Percent': 30, 'Name': 'Other economic manage...","[{'code': '24', 'name': 'Other economic manage...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en
2,52b213b38594d8a2be17c782,2014,November,2013-11-01T00:00:00Z,MINISTRY OF FINANCE AND ECONOMIC DEVEL,,Tuvalu!$!TV,TV,Tuvalu,Tuvalu,...,TI,IBRD,Active,Y,"{'Percent': 46, 'Name': 'Regional integration'}","[{'code': '47', 'name': 'Regional integration'...",52812547,6060000,6060000,http://www.worldbank.org/projects/P145310?lang=en
3,52b213b38594d8a2be17c783,2014,October,2013-10-31T00:00:00Z,MIN. OF PLANNING AND INT'L COOPERATION,,Republic of Yemen!$!RY,RY,Republic of Yemen,"Yemen, Republic of",...,JB,IBRD,Active,N,"{'Percent': 50, 'Name': 'Participation and civ...","[{'code': '57', 'name': 'Participation and civ...",5957,0,1500000,http://www.worldbank.org/projects/P144665?lang=en
4,52b213b38594d8a2be17c784,2014,October,2013-10-31T00:00:00Z,MINISTRY OF FINANCE,2019-04-30T00:00:00Z,Kingdom of Lesotho!$!LS,LS,Kingdom of Lesotho,Lesotho,...,"FH,YW,YZ",IBRD,Active,N,"{'Percent': 30, 'Name': 'Export development an...","[{'code': '45', 'name': 'Export development an...",4145,13100000,13100000,http://www.worldbank.org/projects/P144933/seco...


In [16]:
#View all columns:
df.columns

Index(['_id', 'approvalfy', 'board_approval_month', 'boardapprovaldate',
       'borrower', 'closingdate', 'country_namecode', 'countrycode',
       'countryname', 'countryshortname', 'docty', 'envassesmentcategorycode',
       'grantamt', 'ibrdcommamt', 'id', 'idacommamt', 'impagency',
       'lendinginstr', 'lendinginstrtype', 'lendprojectcost',
       'majorsector_percent', 'mjsector_namecode', 'mjtheme',
       'mjtheme_namecode', 'mjthemecode', 'prodline', 'prodlinetext',
       'productlinetype', 'project_abstract', 'project_name', 'projectdocs',
       'projectfinancialtype', 'projectstatusdisplay', 'regionname', 'sector',
       'sector1', 'sector2', 'sector3', 'sector4', 'sector_namecode',
       'sectorcode', 'source', 'status', 'supplementprojectflg', 'theme1',
       'theme_namecode', 'themecode', 'totalamt', 'totalcommamt', 'url'],
      dtype='object')

In [17]:
# Number of unique countries
df.countryname.nunique()

118

In [18]:
#Split country_namecode by '!$!' to get a list of country and namecodes:
df['country_namecode'] = df['country_namecode'].apply(lambda x : x.split('!$!'))

In [19]:
#Apply Group by:
df_gr= df.groupby(['id'], as_index=False)['country_namecode'].sum()
df_gr.head()

Unnamed: 0,id,country_namecode
0,P075941,"[Africa, 3A]"
1,P085621,"[Republic of Chile, CL]"
2,P086592,"[Republic of Kazakhstan, KZ]"
3,P094183,"[Africa, 3A]"
4,P095003,"[Federal Republic of Nigeria, NG]"


In [20]:
#Create two new columns to store country and nmaecode separately:  
df_gr['country'] = df['country_namecode'].apply(lambda x : x[0])
df_gr['namecode'] = df['country_namecode'].apply(lambda y : y[1])

In [21]:
df_gr.head()

Unnamed: 0,id,country_namecode,country,namecode
0,P075941,"[Africa, 3A]",Federal Democratic Republic of Ethiopia,ET
1,P085621,"[Republic of Chile, CL]",Republic of Tunisia,TN
2,P086592,"[Republic of Kazakhstan, KZ]",Tuvalu,TV
3,P094183,"[Africa, 3A]",Republic of Yemen,RY
4,P095003,"[Federal Republic of Nigeria, NG]",Kingdom of Lesotho,LS


## Answer: Ten Countries with most projects

In [22]:
#Group by and count top 10 projects in descending order: 
df_gr.groupby(['namecode', 'country'], as_index = False, axis =0)['id'].count().sort_values(by ='id', ascending = False).head(10)

Unnamed: 0,namecode,country,id
45,ID,Republic of Indonesia,19
28,CN,People's Republic of China,19
110,VN,Socialist Republic of Vietnam,17
46,IN,Republic of India,16
88,RY,Republic of Yemen,13
60,MA,Kingdom of Morocco,12
15,BD,People's Republic of Bangladesh,12
78,NP,Nepal,12
1,3A,Africa,11
73,MZ,Republic of Mozambique,11


## 2. Find the top 10 major project themes (using column 'mjtheme_namecode')

In [23]:
# Read column:
df['mjtheme_namecode'].head()

0    [{'code': '8', 'name': 'Human development'}, {...
1    [{'code': '1', 'name': 'Economic management'},...
2    [{'code': '5', 'name': 'Trade and integration'...
3    [{'code': '7', 'name': 'Social dev/gender/incl...
4    [{'code': '5', 'name': 'Trade and integration'...
Name: mjtheme_namecode, dtype: object

In [24]:
#Data type
type(df['mjtheme_namecode'])

pandas.core.series.Series

In [25]:
#Unpack json dictionary:
df_theme= df['mjtheme_namecode'].apply(pd.Series)

In [26]:
#View data header
df_theme.head()

Unnamed: 0,0,1,2,3,4
0,"{'code': '8', 'name': 'Human development'}","{'code': '11', 'name': ''}",,,
1,"{'code': '1', 'name': 'Economic management'}","{'code': '6', 'name': 'Social protection and r...",,,
2,"{'code': '5', 'name': 'Trade and integration'}","{'code': '2', 'name': 'Public sector governance'}","{'code': '11', 'name': 'Environment and natura...","{'code': '6', 'name': 'Social protection and r...",
3,"{'code': '7', 'name': 'Social dev/gender/inclu...","{'code': '7', 'name': 'Social dev/gender/inclu...",,,
4,"{'code': '5', 'name': 'Trade and integration'}","{'code': '4', 'name': 'Financial and private s...",,,


In [27]:
#Check Null Values in all 5 columns
print('1st column:','\n',df_theme.loc[:,0].isnull().value_counts())
print('2st column:','\n',df_theme.loc[:,1].isnull().value_counts())
print('3st column:','\n',df_theme.loc[:,2].isnull().value_counts())
print('4st column:','\n',df_theme.loc[:,3].isnull().value_counts())
print('5st column:','\n',df_theme.loc[:,4].isnull().value_counts())

1st column: 
 False    500
Name: 0, dtype: int64
2st column: 
 False    491
True       9
Name: 1, dtype: int64
3st column: 
 False    263
True     237
Name: 2, dtype: int64
4st column: 
 True     329
False    171
Name: 3, dtype: int64
5st column: 
 True     426
False     74
Name: 4, dtype: int64


In [28]:
# Normalize first json column
df_theme_0= json_normalize(df_theme[0], 'code', ['name'])

#Normalize 2,3,4 and 5th json columns and drop missing values in each column: 
df_theme_1= json_normalize(df_theme[1].dropna(), 'code', ['name'])
df_theme_2= json_normalize(df_theme[2].dropna(), 'code', ['name'])
df_theme_3= json_normalize(df_theme[3].dropna(), 'code', ['name'])
df_theme_4= json_normalize(df_theme[4].dropna(), 'code', ['name'])

In [29]:
#Concatenate all 5 treated columns vertically: 
df_theme_combine = pd.concat([df_theme_0, df_theme_1, df_theme_2, df_theme_3, df_theme_4], axis ='rows')

In [30]:
#View Header Data:
df_theme_combine.head()

Unnamed: 0,0,name
0,8,Human development
1,1,Economic management
2,5,Trade and integration
3,7,Social dev/gender/inclusion
4,5,Trade and integration


In [31]:
#Rename column names of concatenated table: 
df_theme_combine.columns = ['code', 'name']

In [32]:
#View header data and replace blank names with misisng values(NaN): 
df_theme_combine[df_theme_combine.name == ''] = 'NaN'

In [33]:
#Group by and sorted top ten values in Descending order:
df_theme_sorted= df_theme_combine.groupby(['name'])['code'].count().sort_values(ascending = False).head(10)

## Answer: Top 10 major project themes (using column 'mjtheme_namecode')

In [34]:
#Result 
df_theme_sorted.reset_index()

Unnamed: 0,name,code
0,Environment and natural resources management,446
1,Rural development,404
2,Human development,197
3,Public sector governance,184
4,,163
5,Social protection and risk management,158
6,Financial and private sector development,130
7,Social dev/gender/inclusion,119
8,Trade and integration,72
9,Urban development,47


## 3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in

In [35]:
#Store previous concatenated table under new name df_missing: 
df_missing= pd.concat([df_theme_0, df_theme_1, df_theme_2, df_theme_3, df_theme_4], axis =0 )

In [36]:
#Create pivot table to find corresponding code names: 
import numpy as np
df_pivot= df_theme_combine.pivot_table(index= 'code', values='name', aggfunc=np.min)
df_pivot=df_pivot.reset_index()
df_pivot

Unnamed: 0,code,name
0,0.0,Rural development
1,1.0,Economic management
2,2.0,Public sector governance
3,3.0,Rule of law
4,4.0,Financial and private sector development
5,5.0,Trade and integration
6,6.0,Social protection and risk management
7,7.0,Social dev/gender/inclusion
8,8.0,Human development
9,9.0,Urban development


In [71]:
# Rename column lables:
df_missing.columns = ['code', 'missing_names']

#Filter blank names:
df_assign= (df_missing[df_missing.missing_names == ''])
df_assign= df_assign.groupby(['code']).count().reset_index()

#Join tables (df_assign with df_pivot) to identify missing names by code:
df_fill = df_assign.merge(right= df_pivot, how = 'inner')

## Answer: Dataframe of missing names filled in: 

In [72]:
#Result: 
df_fill

Unnamed: 0,code,missing_names,name
0,0,14,Rural development
1,1,73,Economic management
2,2,15,Public sector governance
3,3,3,Rule of law
4,4,16,Financial and private sector development
5,5,5,Trade and integration
6,6,10,Social protection and risk management
7,7,11,Social dev/gender/inclusion
8,8,13,Human development
9,9,3,Urban development
