In [49]:
import pandas as pd
import os
import pickle
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import numpy as np

In [50]:
world_bank_projects = pd.read_pickle('world_bank_projects/world_bank_projects_json.pkl')
loan_features = pd.read_pickle('extracted_features.pkl')

In [51]:
wb_project_names = world_bank_projects['project_name']
doc_project_names = loan_features['Project Name']

We will use fuzzy string matching to pair project names between the two tables. Let's explore a couple of algorithms. The first one compares substrings.

In [52]:
similarities = np.zeros((len(wb_project_names),len(doc_project_names)))
for i in range(len(wb_project_names)):
    if i % 100 == 0:
        print('computing similarities for world bank project name', i)
    for j in range(len(doc_project_names)):
        similarities[i,j] = fuzz.partial_ratio(wb_project_names.iloc[i],doc_project_names.iloc[j])

computing similarities for world bank project name 0
computing similarities for world bank project name 100
computing similarities for world bank project name 200
computing similarities for world bank project name 300
computing similarities for world bank project name 400
computing similarities for world bank project name 500
computing similarities for world bank project name 600
computing similarities for world bank project name 700
computing similarities for world bank project name 800
computing similarities for world bank project name 900
computing similarities for world bank project name 1000
computing similarities for world bank project name 1100
computing similarities for world bank project name 1200
computing similarities for world bank project name 1300
computing similarities for world bank project name 1400
computing similarities for world bank project name 1500
computing similarities for world bank project name 1600
computing similarities for world bank project name 1700
comp

Let's take a look at what the matches end up being.

In [53]:
matches = similarities.argmax(axis=0)
count = 0
for i in range(len(doc_project_names)):
    sim = similarities[matches[i],i]
    if sim > 95:
        count += 1
        print("doc project:\t", doc_project_names.iloc[i])
        print("best match:\t",wb_project_names.iloc[matches[i]])
        print("score:\t", sim)

nterprises Project
score:	 100.0
doc project:	 Water Supply Project
best match:	 Additional Financing for the Greater Beirut Water Supply Project
score:	 100.0
doc project:	 Second Pollution Abatement Project
best match:	 Second Pollution Abatement Project
score:	 100.0
doc project:	 Second State Statistical System Development Project
best match:	 Second State Statistical System Development Project
score:	 100.0
doc project:	 Structural Adjustment Loan
best match:	 CO: Programmatic Labor Reform and Social Structural Adjustment Loan
score:	 100.0
doc project:	 Transport Project
best match:	 Meghalaya Integrated Transport Project
score:	 100.0
doc project:	 Financial Sector Adjustment Loan
best match:	 Programmatic Financial Sector Adjustment Loan I
score:	 100.0
doc project:	 Programmatic Public Sector Development Policy Loan
best match:	 Development Policy Loan
score:	 100.0
doc project:	 Absheron Rehabilitation Program II:  
Integrated Solid Waste Management Project
best match:	 Integ

Hmm, the substring matching means that some generic project names get matched to something more specific.

In [54]:
count

2388

This is a stricter version, which runs faster and just computes the Levenshtein distance between two strings, dividing by the length.

In [55]:
strict_similarities = np.zeros((len(wb_project_names),len(doc_project_names)))
for i in range(len(wb_project_names)):
    if i % 100 == 0:
        print('computing similarities for world bank project name', i)
    for j in range(len(doc_project_names)):
        strict_similarities[i,j] = fuzz.ratio(wb_project_names.iloc[i],doc_project_names.iloc[j])

computing similarities for world bank project name 0
computing similarities for world bank project name 100
computing similarities for world bank project name 200
computing similarities for world bank project name 300
computing similarities for world bank project name 400
computing similarities for world bank project name 500
computing similarities for world bank project name 600
computing similarities for world bank project name 700
computing similarities for world bank project name 800
computing similarities for world bank project name 900
computing similarities for world bank project name 1000
computing similarities for world bank project name 1100
computing similarities for world bank project name 1200
computing similarities for world bank project name 1300
computing similarities for world bank project name 1400
computing similarities for world bank project name 1500
computing similarities for world bank project name 1600
computing similarities for world bank project name 1700
comp

In [56]:
count = 0
for i in range(len(doc_project_names)):
    sim = strict_similarities[matches[i],i]
    if sim > 90:
        count += 1
        if sim < 100:
            print("doc project:\t", doc_project_names.iloc[i])
            print("best match:\t",wb_project_names.iloc[matches[i]])
            print("score:\t", sim)

doc project:	 Pilot Fisheries Development Project
best match:	 Pilot Fisheries Developent Project
score:	 99.0
doc project:	 Qinghai Xining Water Environment Management Project
best match:	 China-Qinghai Xining Water Environment Management Project
score:	 94.0
doc project:	 National Transmission Modernization - I Project
best match:	 National Transmission Modernization I Project
score:	 98.0
doc project:	 Tree Crop Smallholder Development Project
best match:	 Tree Crop Smallholder Development Project (TCSDP)
score:	 91.0
doc project:	 Irrigated Agriculture Intensification Project
best match:	 Tunisia Irrigated Agriculture Intensification Project
score:	 92.0
doc project:	 Third Economic Management and Competitiveness Development Policy Operation
best match:	 Economic Management and Competitiveness Development Policy Operation
score:	 96.0
doc project:	 Municipal Finance Project
best match:	 Municipal Finance Project (02)
score:	 91.0
doc project:	 Power Sector Rehabilitation and Modern

In [57]:
count

1173

Let's use the strict matching, since we can be more confident that the projects will be correctly matched. Also, some project names are not unique, so we'll leave them out from both directions.

In [58]:
wb_value_counts = wb_project_names.value_counts()
doc_value_counts = doc_project_names.value_counts()
project_ids = []
def get_project_id(k):
    if strict_similarities[matches[k],k] > 90:
        project_name = doc_project_names.iloc[k]
        match_name = wb_project_names.iloc[matches[k]]
        if wb_value_counts[match_name] == 1 and doc_value_counts[project_name] == 1:
            return world_bank_projects.id.iloc[matches[k]]
        else:
            return None
    else:
        return None

loan_features['wb_project_id'] = list(map(get_project_id, range(len(loan_features))))

In [59]:
loan_features['wb_project_id'].describe()

count         814
unique        783
top       P035173
freq            2
Name: wb_project_id, dtype: object

It looks like there are still some pairs of loans that map to the same project, even with the strict matching and requiring that they be unique. But they should at least have the same type of project, so the resulting sector should be correct still.

In [60]:
loan_features['wb_project_id'].value_counts()

P035173    2
P115874    2
P149528    2
P150308    2
P081776    2
          ..
P149322    1
P035160    1
P077856    1
P156837    1
P157136    1
Name: wb_project_id, Length: 783, dtype: int64

In [61]:
loan_features.loc[loan_features['wb_project_id'] == 'P035173']

Unnamed: 0,year,month,day,id,name,countries,Total Amount,Project Name,wb_project_id
195,2001,june,13,595171468252295741,conformed-copy--l4603--second-powergrid-system...,"[India, United States]",450000000,Second Powergrid System Development Project,P035173
587,1993,march,23,287341468251988014,conformed-copy--l3577--powergrid-system-develo...,"[India, United States]",350000000,Powergrid System Development Project,P035173


In [62]:
world_bank_projects.loc['P035173']

id                                                                    P035173
regionname                                                         South Asia
countryname                                               [Republic of India]
lendinginstr                                         Specific Investment Loan
projectstatusdisplay                                                   Closed
status                                                                 Closed
project_name                      Second Powergrid System Development Project
boardapprovaldate                                   2001-05-03 00:00:00+00:00
ibrdcommamt                                                       450000000.0
idacommamt                                                                  0
totalamt                                                          450,000,000
grantamt                                                                    0
countryshortname                                                

In [63]:
loan_features.loc[loan_features['wb_project_id'] == 'P115874','Project Name']

40      Second Power Sector Reform Development Policy ...
1679    Third Power Sector Reform Development Policy O...
Name: Project Name, dtype: object

In [64]:
world_bank_projects.loc['P115874']

id                                                                    P115874
regionname                                              East Asia and Pacific
countryname                                   [Socialist Republic of Vietnam]
lendinginstr                                       Development Policy Lending
projectstatusdisplay                                                   Closed
status                                                                 Closed
project_name                Vietnam Power Sector Reform Development Policy...
boardapprovaldate                                   2010-04-06 00:00:00+00:00
ibrdcommamt                                                       200000000.0
idacommamt                                                        111,800,000
totalamt                                                          311,800,000
grantamt                                                                    0
countryshortname                                                

In [65]:
loan_features.loc[loan_features['wb_project_id'] == 'P149528']

Unnamed: 0,year,month,day,id,name,countries,Total Amount,Project Name,wb_project_id
983,2008,may,28,434411468016174206,"loan-agreement,-l7542-cn-conformed.txt","[China, Hong Kong, United States]",38400000,Gansu Cultural and Natural Heritage \n Protect...,P149528
1578,2017,july,7,171461502376530613,official-documents-loan-agreement-for-loan-867...,"[China, United States]",100000000,Second Gansu Cultural and Natural Heritage Pro...,P149528


In [66]:
world_bank_projects.loc['P149528']

id                                                                    P149528
regionname                                              East Asia and Pacific
countryname                                      [People's Republic of China]
lendinginstr                                     Investment Project Financing
projectstatusdisplay                                                   Active
status                                                                 Active
project_name                CH-Second Gansu Cultural and Natural Heritage ...
boardapprovaldate                                   2017-02-24 00:00:00+00:00
ibrdcommamt                                                       100000000.0
idacommamt                                                                  0
totalamt                                                          100,000,000
grantamt                                                                    0
countryshortname                                                

They mostly look like the same project with "Second" or "Third" put on.

In [67]:
loan_features.to_pickle('extracted_features_with_project_id.pkl')