## Imports for Python and pandas

In [21]:
import pandas as pd
import json
from pandas.io.json import json_normalize

from collections import Counter

In [70]:
# Load the World Bank projects JSON file into a DataFrame.
worldbank_projects_df = pd.read_json(path_or_buf='data/world_bank_projects.json')


In [23]:
# Build the theme-code -> theme-name lookup from every project's
# 'mjtheme_namecode' entries. Entries whose name is empty are skipped, and a
# later non-empty entry for the same code overwrites an earlier one.
nameCodeArrays = worldbank_projects_df['mjtheme_namecode'].values
projectCodeDict = {
    themeEntry['code']: themeEntry['name']
    for themeEntries in nameCodeArrays
    for themeEntry in themeEntries
    if themeEntry['name']
}

In [27]:
# Flatten the per-project theme lists into one list of entries, then tally how
# often each theme code occurs.
nameCodeEntries = [
    themeEntry
    for themeEntries in nameCodeArrays
    for themeEntry in themeEntries
]

projectCodeCounter = Counter(themeEntry['code'] for themeEntry in nameCodeEntries)
# Theme codes ordered from most to least frequent.
sortedByProjectCodeDesc = [
    themeCode for themeCode, _occurrences in projectCodeCounter.most_common()
]


In [28]:
# 1. Find top 10 countries with most projects
# Count rows per country, rank the counts in descending order and keep the ten
# largest; the country names are the index of the resulting Series.
# `.index.values` is used instead of `Index.get_values()`, which was deprecated
# in pandas 0.25 and removed in pandas 1.0.
(
    worldbank_projects_df[['countryshortname']]
    .groupby(by='countryshortname')
    .size()
    .sort_values(ascending=False)
    .head(n=10)
    .index
    .values
)


array([u'Indonesia', u'China', u'Vietnam', u'India', u'Yemen, Republic of',
       u'Nepal', u'Bangladesh', u'Morocco', u'Mozambique', u'Africa'], dtype=object)

In [29]:
# 2. Find the top 10 major project themes (using column 'mjtheme_namecode').
# Translate the ten most frequent theme codes into their names.
[projectCodeDict[themeCode] for themeCode in sortedByProjectCodeDesc[:10]]

[u'Environment and natural resources management',
 u'Rural development',
 u'Human development',
 u'Public sector governance',
 u'Social protection and risk management',
 u'Financial and private sector development',
 u'Social dev/gender/inclusion',
 u'Trade and integration',
 u'Urban development',
 u'Economic management']

In [36]:
# 3. In 2. above you will notice that some entries have only the code, with
# the name missing. Create a dataframe with the missing names filled in.
# Start by looking at the first project's theme entries.
worldbank_projects_df.loc[0, 'mjtheme_namecode']

[{u'code': u'8', u'name': u'Human development'},
 {u'code': u'11', u'name': u''}]

In [83]:
def fillEmptyProjectNames(nameCodeArray, codeToName=None):
    """Return new theme entries with each name looked up from its code.

    Parameters
    ----------
    nameCodeArray : iterable of dict
        Theme entries, each holding a 'code' and a 'name' key. The 'name'
        value may be an empty string.
    codeToName : dict, optional
        Mapping from theme code to theme name. When omitted, the
        notebook-level ``projectCodeDict`` is used.

    Returns
    -------
    list of dict
        One fresh ``{'code': ..., 'name': ...}`` dict per input entry, in
        input order. The input entries are not modified.
    """
    if codeToName is None:
        # Fall back to the lookup table built earlier in the notebook.
        codeToName = projectCodeDict
    return [
        {
            'code': nameCode['code'],
            # Keep the entry's own name when the code is absent from the
            # mapping instead of raising KeyError.
            'name': codeToName.get(nameCode['code'], nameCode['name']),
        }
        for nameCode in nameCodeArray
    ]

In [90]:
# Work on a copy so the raw frame keeps its original (partly blank) theme names.
worldbank_projects_df_filledIn = worldbank_projects_df.copy()
# Only one column is transformed, so apply the helper to that Series directly
# rather than iterating over whole rows with DataFrame.apply(axis=1).
worldbank_projects_df_filledIn['mjtheme_namecode'] = (
    worldbank_projects_df_filledIn['mjtheme_namecode'].apply(fillEmptyProjectNames)
)

In [95]:

# The original frame is untouched: the second entry's name is still blank.
worldbank_projects_df.loc[0, 'mjtheme_namecode']

[{u'code': u'8', u'name': u'Human development'},
 {u'code': u'11', u'name': u''}]

In [96]:

# The filled-in copy carries a name for every theme code.
worldbank_projects_df_filledIn.loc[0, 'mjtheme_namecode']

[{'code': u'8', 'name': u'Human development'},
 {'code': u'11', 'name': u'Environment and natural resources management'}]