In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

In [2]:
# Load the World Bank projects JSON file into a DataFrame (one row per project record)
world_bank_projects_df = pd.read_json(path_or_buf='data/world_bank_projects.json')

In [3]:
# Question 1: find the 10 countries with the most projects
# value_counts() tallies rows per 'countryname' and sorts the tallies in descending order,
# so the first 10 entries are the 10 most frequent names.
projects_per_country = world_bank_projects_df['countryname'].value_counts()

print("The 10 countries with the most projects are:")
print(projects_per_country.head(10))

The 10 countries with the most projects are:
Republic of Indonesia              19
People's Republic of China         19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Nepal                              12
People's Republic of Bangladesh    12
Kingdom of Morocco                 12
Africa                             11
Republic of Mozambique             11
Name: countryname, dtype: int64


In [4]:
# 'Africa' shows up in the 'countryname' tallies but it is not a country,
# so remove that label before taking the 10 countries with the most projects.
projects_per_country_no_africa = (
    world_bank_projects_df['countryname']
    .value_counts()
    .drop('Africa')
)

print("Excluding Africa, which is not a country, the 10 countries with the most projects are:")
print(projects_per_country_no_africa.head(10))

Excluding Africa, which is not a country, the 10 countries with the most projects are:
Republic of Indonesia              19
People's Republic of China         19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Nepal                              12
People's Republic of Bangladesh    12
Kingdom of Morocco                 12
Republic of Mozambique             11
Federative Republic of Brazil       9
Name: countryname, dtype: int64


In [5]:
# Question 2: Find the top 10 major project themes
# Read the raw JSON records. The context manager closes the file handle as soon as
# parsing is done (a bare open() passed to json.load is never closed explicitly).
with open('data/world_bank_projects.json') as projects_file:
    major_theme = json.load(projects_file)

# Flatten every project's 'mjtheme_namecode' list into one row per (code, name) entry.
# NOTE(review): newer pandas releases expose this function as pd.json_normalize --
# confirm the installed version before changing the import at the top of the notebook.
major_theme_norm = json_normalize(major_theme, 'mjtheme_namecode')

# Bracket access is used for the 'name' column so it cannot be confused with an attribute.
print("The top 10 major project themes are:")
print(major_theme_norm['name'].value_counts().head(10))

The top 10 major project themes are:
Environment and natural resources management    223
Rural development                               202
Human development                               197
Public sector governance                        184
Social protection and risk management           158
Financial and private sector development        130
                                                122
Social dev/gender/inclusion                     119
Trade and integration                            72
Urban development                                47
Name: name, dtype: int64


In [7]:
# The 7th entry in the theme tally above has an empty name.
# Build a boolean mask of the rows whose name is the empty string,
# then pull out the codes recorded on those rows.
blank = major_theme_norm['name'].eq('')
blank_codes = major_theme_norm.loc[blank, 'code']

In [8]:
# Collect the distinct theme names and the distinct theme codes.
# unique() returns values in order of first appearance, so both Series follow
# the row order of major_theme_norm.
unique_names = pd.Series(major_theme_norm['name'].unique())
unique_codes = pd.Series(major_theme_norm['code'].unique())

print(unique_names)
print(unique_codes)

0                                Human development
1                                                 
2                              Economic management
3            Social protection and risk management
4                            Trade and integration
5                         Public sector governance
6     Environment and natural resources management
7                      Social dev/gender/inclusion
8         Financial and private sector development
9                                Rural development
10                               Urban development
11                                     Rule of law
dtype: object
0      8
1     11
2      1
3      6
4      5
5      2
6      7
7      4
8     10
9      9
10     3
dtype: object


In [9]:
# Look at the first 15 rows of major_theme_norm to see which code goes with which name
first_theme_rows = major_theme_norm.iloc[:15]
print(first_theme_rows)

   code                                          name
0     8                             Human development
1    11                                              
2     1                           Economic management
3     6         Social protection and risk management
4     5                         Trade and integration
5     2                      Public sector governance
6    11  Environment and natural resources management
7     6         Social protection and risk management
8     7                   Social dev/gender/inclusion
9     7                   Social dev/gender/inclusion
10    5                         Trade and integration
11    4      Financial and private sector development
12    6         Social protection and risk management
13    6                                              
14    2                      Public sector governance


In [12]:
# Build code_reference: one code per entry of unique_names, in the same order.
# unique_codes is one entry shorter than unique_names because code '11' is recorded
# under two different names in major_theme_norm (the blank name and
# 'Environment and natural resources management'). Positions 0-5 of unique_codes
# already line up with unique_names; a second '11' is inserted at position 6 and
# the remaining codes follow.
part1 = unique_codes.iloc[0:6].copy()  # explicit copy: unique_codes itself stays untouched
part1.loc[6] = '11'                    # codes are stored as strings, so the new entry is too
part2 = unique_codes.iloc[6:11]

# ignore_index takes a bool; the string 'TRUE' only worked because any non-empty string is truthy.
code_reference = pd.concat([part1, part2], ignore_index=True)

# The codes are paired with unique_names purely by position, so the lengths must match.
assert len(code_reference) == len(unique_names), "code_reference is not aligned with unique_names"

In [13]:
# Make a reference data frame to pair corresponding codes and themes.
# `reference` holds the two Series as rows (row 0 = codes, row 1 = names);
# it is built once and transposed, rather than constructing the same frame twice.
reference = pd.DataFrame([code_reference, unique_names])

# After the transpose each row is one (code, name) pair.
reference_df = reference.transpose()

In [14]:
# Get rid of the blank entry in the reference data frame.
# The blank row is selected by its empty name rather than by a hard-coded index label:
# dropping label 1 and then resetting the index would remove a real theme if this
# cell were ever executed a second time.
reference_df = reference_df.copy()  # never relabel a frame another variable may share
reference_df.columns = ['code', 'name']
reference_df = reference_df[reference_df['name'] != ''].reset_index(drop=True)

In [15]:
# Merge reference_df with major_theme_norm to fill in missing names.
# Each row of major_theme_norm looks up its code in reference_df:
#   name_original -> the name as recorded in the data (possibly blank)
#   name_filled   -> the name reference_df associates with that code
# The join key is given with `on` only. Combining `on` with `right_index=True` is a
# contradictory key specification that current pandas rejects with a MergeError.
# A left merge keeps the rows of major_theme_norm in their original order, so no
# re-sorting is needed afterwards.
complete = pd.merge(
    major_theme_norm,
    reference_df,
    on='code',
    how='left',
    suffixes=('_original', '_filled'),
)

# Every code in reference_df must be unique, otherwise the merge would duplicate rows.
assert len(complete) == len(major_theme_norm), "reference_df contains duplicate codes"

In [17]:
# Tally the filled-in theme names and report the 10 most frequent ones
filled_theme_counts = complete['name_filled'].value_counts()

print("The top 10 major project themes -- excluding blanks -- are:")
print(filled_theme_counts.head(10))

The top 10 major project themes -- excluding blanks -- are:
Environment and natural resources management    250
Rural development                               216
Human development                               210
Public sector governance                        199
Social protection and risk management           168
Financial and private sector development        146
Social dev/gender/inclusion                     130
Trade and integration                            77
Urban development                                50
Economic management                              38
Name: name_filled, dtype: int64
