# JSON exercise

Download the data from [**here**](https://drive.google.com/file/d/1DGaX5AVfYhmWeb15lI-MzUbSKTYSz9fQ/view?usp=sharing) and answer following questions:
1. Find the 10 countries with most projects
2. What are top 10 sectors with projects?
3. Find the top 10 major project themes (using column 'mjtheme_namecode')
4. In 3. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [1]:
import pandas as pd
import json
from pprint import pprint
from pandas.io.json import json_normalize  
import numpy as np

In [2]:
#load json object
with open('data/world_bank_projects.json') as file:
    nested_json = json.load(file)
#pprint(nested_json)

In [3]:
# Pro pandas tip:
    # If you load a dataframe and there is too many columns that it automatically truncates.
    # Use this code to display all columns when displaying your dataframe in jupyter.
import pandas as pd
pd.set_option('display.max_columns', None)

In [4]:
# Q1 - Find the 10 countries with most projects
df = pd.json_normalize(nested_json)
df[['countryname','project_name']].groupby('countryname').count()['project_name'].reset_index().sort_values(by='project_name',ascending=False).head(10)

Unnamed: 0,countryname,project_name
39,People's Republic of China,19
64,Republic of Indonesia,19
107,Socialist Republic of Vietnam,17
63,Republic of India,16
97,Republic of Yemen,13
38,People's Republic of Bangladesh,12
34,Nepal,12
25,Kingdom of Morocco,12
76,Republic of Mozambique,11
0,Africa,11


In [5]:
# Q2 - What are top 10 sectors with projects?
sectors = json_normalize(nested_json, record_path='sector', meta='project_name')
sectors[['Name','project_name']].groupby('Name').count()['project_name'].reset_index().sort_values(by='project_name',ascending=False).head(10)

  sectors = json_normalize(nested_json, record_path='sector', meta='project_name')


Unnamed: 0,Name,project_name
40,Other social services,106
7,Central government administration,82
61,Sub-national government administration,75
15,"General agriculture, fishing and forestry sector",65
24,Health,63
21,General public administration sector,51
56,Rural and Inter-Urban Roads and Highways,44
23,"General water, sanitation and flood protection...",39
1,Agricultural extension and research,38
45,"Public administration- Agriculture, fishing an...",36


In [6]:
# Q3 - Find the top 10 major project themes (using column 'mjtheme_namecode')
themes = json_normalize(nested_json, record_path='mjtheme_namecode', meta='project_name')
themes[['name','project_name']].groupby('name').count()['project_name'].reset_index().sort_values(by='project_name',ascending=False).head(10)

  themes = json_normalize(nested_json, record_path='mjtheme_namecode', meta='project_name')


Unnamed: 0,name,project_name
2,Environment and natural resources management,223
7,Rural development,202
4,Human development,197
5,Public sector governance,184
9,Social protection and risk management,158
3,Financial and private sector development,130
0,,122
8,Social dev/gender/inclusion,119
10,Trade and integration,72
11,Urban development,47


In [7]:
themes[['code','project_name']].groupby('code').count()['project_name'].reset_index().sort_values(by='project_name',ascending=False).head(10)

Unnamed: 0,code,project_name
2,11,250
1,10,216
9,8,210
3,2,199
7,6,168
5,4,146
8,7,130
6,5,77
10,9,50
0,1,38


In [8]:
# Q4 - In 3. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

# adding new column to where missing names == 0 (length) and named themes have a length
themes = themes.assign(name_len = themes['name'].str.len())
themes

Unnamed: 0,code,name,project_name,name_len
0,8,Human development,Ethiopia General Education Quality Improvement...,17
1,11,,Ethiopia General Education Quality Improvement...,0
2,1,Economic management,TN: DTF Social Protection Reforms Support,19
3,6,Social protection and risk management,TN: DTF Social Protection Reforms Support,37
4,5,Trade and integration,Tuvalu Aviation Investment Project - Additiona...,21
...,...,...,...,...
1494,10,Rural development,Sustainable Management of Agricultural Researc...,17
1495,9,Urban development,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,17
1496,8,Human development,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,17
1497,5,Trade and integration,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,21


In [9]:
# create a df of the missing lines
blanks = themes[themes['name_len'] == 0]
blanks

Unnamed: 0,code,name,project_name,name_len
1,11,,Ethiopia General Education Quality Improvement...,0
13,6,,Additional Financing for Cash Transfers for Or...,0
17,8,,China Renewable Energy Scale-Up Program Phase II,0
19,7,,Rajasthan Road Sector Modernization Project,0
24,2,,Southern Sudan Emergency Food Crisis Response ...,0
...,...,...,...,...
1457,4,,Capacity Builiding for Emerging Infectious Dis...,0
1477,11,,Water Supply and Sanitation Improvements for W...,0
1481,5,,Revision and Alignment of NAP with UNCCD 10-ye...,0
1483,8,,Nepal: Pilot Project for Seismic School Safety...,0


In [10]:
# create a df of the filled names
non_blanks = themes[themes['name_len'] >  0]
non_blanks

Unnamed: 0,code,name,project_name,name_len
0,8,Human development,Ethiopia General Education Quality Improvement...,17
2,1,Economic management,TN: DTF Social Protection Reforms Support,19
3,6,Social protection and risk management,TN: DTF Social Protection Reforms Support,37
4,5,Trade and integration,Tuvalu Aviation Investment Project - Additiona...,21
5,2,Public sector governance,Tuvalu Aviation Investment Project - Additiona...,24
...,...,...,...,...
1494,10,Rural development,Sustainable Management of Agricultural Researc...,17
1495,9,Urban development,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,17
1496,8,Human development,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,17
1497,5,Trade and integration,KENYA: NATIONAL URBAN TRANSPORT IMPROVEMENT PR...,21


In [12]:
full_themes = non_blanks.merge(blanks, on=['code','project_name'], how='outer')
full_themes

Unnamed: 0,code,name_x,project_name,name_len_x,name_y,name_len_y
0,8,Human development,Ethiopia General Education Quality Improvement...,17.0,,
1,1,Economic management,TN: DTF Social Protection Reforms Support,19.0,,
2,6,Social protection and risk management,TN: DTF Social Protection Reforms Support,37.0,,
3,5,Trade and integration,Tuvalu Aviation Investment Project - Additiona...,21.0,,
4,2,Public sector governance,Tuvalu Aviation Investment Project - Additiona...,24.0,,
...,...,...,...,...,...,...
1479,4,,Capacity Builiding for Emerging Infectious Dis...,,,0.0
1480,11,,Water Supply and Sanitation Improvements for W...,,,0.0
1481,5,,Revision and Alignment of NAP with UNCCD 10-ye...,,,0.0
1482,8,,Nepal: Pilot Project for Seismic School Safety...,,,0.0


In [13]:
# return to Q3
full_themes[['code','project_name']].groupby('code').count()['project_name'].reset_index().sort_values(by='project_name',ascending=False).head(10)

Unnamed: 0,code,project_name
2,11,245
1,10,215
9,8,207
3,2,198
7,6,164
5,4,145
8,7,130
6,5,77
10,9,50
0,1,38
