# JSON project


In [50]:
cd /Users/sazzadnasir/Desktop/Springboard/Mini_projects/data_wrangling_json/data

/Users/sazzadnasir/Desktop/Springboard/Mini_projects/data_wrangling_json/data


In [51]:
#Import relevant packages
import pandas as pd
import json
from pandas.io.json import json_normalize

# Load the World Bank projects JSON file into a DataFrame. The bare file name
# is resolved against the current working directory.
json_df = pd.read_json('world_bank_projects.json') 

## Sort country names by frequency of occurrence

In [52]:
# Count how often each country name occurs. value_counts() returns the
# counts in descending order, so the first ten rows are the ten most frequent.
country_counts = json_df['countryname'].value_counts()
print(country_counts.iloc[:10])

People's Republic of China         19
Republic of Indonesia              19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Kingdom of Morocco                 12
Nepal                              12
People's Republic of Bangladesh    12
Republic of Mozambique             11
Africa                             11
Name: countryname, dtype: int64


In [53]:
# Extract the project-theme column. Judging by the first element shown below,
# each entry is a list of {'code', 'name'} dicts and a name may be an empty string.
df_project = json_df['mjtheme_namecode']
df_project[0]  # inspect the first entry

[{'code': '8', 'name': 'Human development'}, {'code': '11', 'name': ''}]

## Sort project themes (including the empty ones)

In [54]:
# NOTE(review): defaultdict and itemgetter are no longer used in this cell;
# the imports are kept in case later cells rely on them -- confirm and drop if not.
from collections import Counter, defaultdict
from operator import itemgetter

# Tally every theme name across all projects. Empty names ('') are counted
# deliberately so the number of unlabeled entries shows up in the ranking.
projectcounts = Counter(
    theme['name'] for themes in df_project for theme in themes
)

# most_common() yields (name, count) pairs ordered by descending count;
# names with equal counts keep their first-seen order.
projectcounts_sorted = projectcounts.most_common()
print(projectcounts_sorted[0:10])

[('Environment and natural resources management', 223), ('Rural development', 202), ('Human development', 197), ('Public sector governance', 184), ('Social protection and risk management', 158), ('Financial and private sector development', 130), ('', 122), ('Social dev/gender/inclusion', 119), ('Trade and integration', 72), ('Urban development', 47)]


## To replace the empty names, first build a lookup table that maps each code to its name

In [55]:
# Build a code -> name table from the entries that do carry a name.
# The first non-empty name seen for a code wins; later ones are ignored.
code_name_lookup = {}
for themes in df_project:
    for theme in themes:
        if len(theme['name']) > 0:
            code_name_lookup.setdefault(theme['code'], theme['name'])

print(code_name_lookup)

{'8': 'Human development', '1': 'Economic management', '6': 'Social protection and risk management', '5': 'Trade and integration', '2': 'Public sector governance', '11': 'Environment and natural resources management', '7': 'Social dev/gender/inclusion', '4': 'Financial and private sector development', '10': 'Rural development', '9': 'Urban development', '3': 'Rule of law'}


## One way to substitute the empty names: define a function `name_sub` and apply it to each row of the pandas Series.

In [48]:
def name_sub(row, lookup=None):
    """Return a copy of one project's theme list with blank names filled in.

    Parameters
    ----------
    row : list of dict
        Theme entries for a single project; each dict has 'code' and 'name' keys.
    lookup : dict, optional
        Mapping of theme code to theme name. When omitted, the notebook-level
        ``code_name_lookup`` is used.

    Returns
    -------
    list of dict
        A new list holding new dicts. Neither ``row`` nor the dicts inside it
        are modified, so the source data stays intact and earlier cells give
        the same result when re-run.
    """
    if lookup is None:
        lookup = code_name_lookup

    filled = []
    for entry in row:
        entry = dict(entry)  # work on a copy; never touch the caller's dict
        if not entry['name']:
            # A code that never appears with a name cannot be resolved;
            # leave the name blank rather than raising KeyError.
            entry['name'] = lookup.get(entry['code'], entry['name'])
        filled.append(entry)
    return filled
            
# Run name_sub on every element of the Series (one theme list per element).
new_project1 = df_project.apply(name_sub)
new_project1[0]  # spot-check the first project's themes

[{'code': '8', 'name': 'Human development'},
 {'code': '11', 'name': 'Environment and natural resources management'}]