In [20]:
import glob
import io
import pickle
import re
import time
from pathlib import Path

import pandas as pd
import pycountry

In [15]:
# Load every cleaned loan document and split its file name, which has the form
# "<year>_<month>_<day>_<id>_<name>", into metadata fields.
files = []
for filepath in glob.iglob('world_bank_loans_txt_clean/*.txt'):
    path = Path(filepath)
    file_txt = path.read_text()
    # Path.name drops the directory part whatever the separator or nesting
    # depth, unlike splitting the path on '/' and taking element 1.
    file_name = path.name
    # maxsplit=4 keeps any further underscores inside the trailing name field
    # instead of raising ValueError when unpacking.
    year, month, day, id_, name = file_name.split('_', 4)
    files.append([year, month, day, id_, name, file_txt])

In [16]:
# Build one row per document from the parsed file-name fields and the raw text.
df = pd.DataFrame(files, columns = ['year', 'month', 'day', 'id', 'name', 'file_content'])
# Bare last expression: the notebook renders the frame.
df

Unnamed: 0,year,month,day,id,name,file_content
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,World Bank Document\n\nCONFORMED COPY\n\n ...
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,World Bank Document\n\nCONFORMED COPY\n\nLOAN ...
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,World Bank Document\n\nLOAN NUMBER 3415 CHA \n...
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",World Bank Document\n\nMENTS\n\nLOAN NUMBER 82...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,World Bank Document\n\nCONFORMED COPY\n\nLOAN ...
...,...,...,...,...,...,...
3190,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,World Bank Document\n\nCONFORMED COPY\n\n ...
3191,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,World Bank Document\n\nDOC r'1 3 2013OFFICIAL ...
3192,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",World Bank Document\n\nCONFORMED COPY \n\nLOAN...
3193,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,World Bank Document\n\nCONFORMED COPY \n\nLOAN...


# Extracting country names

In [18]:
def get_country(x):
    """Return the pycountry country names that occur in the text ``x``.

    Args:
        x: Document text to scan.

    Returns:
        list[str]: Names from ``pycountry.countries`` that appear in ``x`` as
        whole words, in pycountry's iteration order. Empty when none match.
    """
    ans = []
    for country in pycountry.countries:
        name = country.name
        # Cheap substring test first; the regex only runs on candidates.
        if name not in x:
            continue
        # Require that the name is not glued to other word characters, so a
        # name embedded in a longer word is not reported (e.g. "Niger" inside
        # "Nigeria", "Dominica" inside "Dominican"). Lookarounds are used
        # instead of \b because a name may itself end in punctuation.
        if re.search(r'(?<!\w)' + re.escape(name) + r'(?!\w)', x):
            ans.append(name)
    return ans
# Tag every document with the country names found in its text.
df['countries'] = df['file_content'].apply(get_country)

# Extracting sections

In [19]:
# Section headings searched for in each agreement, in document order:
# the agreement heading, ARTICLE I-XI, then SCHEDULE 1-8.
sec_names = (
    ['LOAN AGREEMENT']
    + ['ARTICLE ' + numeral
       for numeral in ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI')]
    + ['SCHEDULE ' + str(number) for number in range(1, 9)]
)

In [54]:
def get_sections(x, names=None):
    """Locate the known section headings in a document and return their spans.

    Headings are looked up in the given order, each one only at or after the
    position of the previously located heading. For every heading several
    spellings are tried in turn (followed by newline, space or '-', with the
    words run together, or separated by two spaces); the first spelling found
    in the remaining text wins. Headings that cannot be found are skipped.

    Args:
        x: Full document text.
        names: Optional list of heading names, in document order. Defaults to
            the module-level ``sec_names``.

    Returns:
        dict: Maps 'TITLE' and each located heading to a ``(start, end)`` pair
        of character offsets into ``x``. 'TITLE' starts at 0, every section
        ends where the next located one begins, and the last ends at
        ``len(x)``.
    """
    if names is None:
        names = sec_names
    keys = ['TITLE']
    inds = [0]
    start, stop = 0, len(x)
    for sec in names:
        words = sec.split(' ')
        squeezed = ''.join(words)    # e.g. "ARTICLEII"
        doubled = '  '.join(words)   # e.g. "ARTICLE  II"
        variants = [sec + '\n', sec + ' ', sec + '-',
                    squeezed + ' ', squeezed + '\n',
                    doubled + ' ', doubled + '\n']
        for name in variants:
            # Search only from the previous heading onwards. Testing
            # `name in x` on the whole text would accept a spelling that
            # occurs only earlier and record find()'s -1 as an offset.
            index = x.find(name, start, stop)
            if index == -1:
                continue
            keys.append(sec)
            inds.append(index)
            start = index
            break
    # Pair each start offset with the next one; the last section runs to the end.
    return {key: (lo, hi) for key, lo, hi in zip(keys, inds, inds[1:] + [stop])}

# Compute the heading -> (start, end) span mapping for every document.
df['sections'] = df['file_content'].apply(get_sections)
# Disabled check: total number of sections found across all documents.
# df['sections'].apply(lambda x: len(x)).sum()

In [55]:
def get_section(df, section):
    """Return the text of one named section of a single document row.

    Args:
        df: One row (despite the name), indexable by 'sections' (a mapping of
            heading -> (start, stop) offsets) and 'file_content' (the text).
        section: Heading name to extract, e.g. 'TITLE'.

    Returns:
        The slice ``file_content[start:stop]``, or None when the section is
        not present or the row cannot be sliced.
    """
    try:
        start, stop = df['sections'][section]
        return df['file_content'][start:stop]
    except Exception:
        # Best effort: a missing or malformed section yields None. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so a long apply() can still be stopped.
        return None

# Slice the title block and ARTICLE II out of every document, row by row;
# the section name is forwarded to get_section through `args`.
title = df.apply(get_section, axis=1, args=('TITLE',))
df.loc[title.index, 'TITLE'] = title

df['ARTICLE II'] = df.apply(get_section, axis=1, args=('ARTICLE II',))

In [25]:
def get_project_name(x):
    """Return the text inside the first parenthesised group of ``x``.

    Args:
        x: Title text, or a non-string placeholder (None/NaN) when no title
            was extracted.

    Returns:
        str | None: The characters between the first '(' and the first ')'
        that follows it; None when ``x`` is not a string or has no such pair.
    """
    if not isinstance(x, str):
        return None
    # Compare against -1 explicitly: index 0 is a valid position and must not
    # be mistaken for "not found".
    start = x.find('(')
    if start == -1:
        return None
    # Only accept a closing parenthesis that comes after the opening one.
    end = x.find(')', start + 1)
    if end == -1:
        return None
    return x[start + 1:end]
# Derive the project name from the parenthesised text in each title block.
df['Project Name'] = df['TITLE'].apply(get_project_name)

# Extracting Amounts in a naive way
Get the amounts simply by finding the text inside parentheses `()`.

In [119]:
def get_amount(x):
    """Naively collect amount-like parenthesised groups from ARTICLE II text.

    Only the text before the first occurrence of '2.02' is scanned; when that
    marker is absent the whole text is scanned. A parenthesised group is kept
    when it is longer than one character and contains the digit '0'.

    NOTE: a later cell in this notebook defines another function with the same
    name, which replaces this one once that cell has run.

    Args:
        x: ARTICLE II text, or None/empty when the section was not found.

    Returns:
        str: The kept groups (without parentheses) joined by single spaces, or
        '' when there is no input or nothing qualifies.
    """
    if not x:
        return ''
    cutoff = x.find('2.02')
    # find() returns -1 when the marker is missing; slicing with -1 would
    # silently drop the last character (often the closing parenthesis).
    if cutoff != -1:
        x = x[:cutoff]
    ans = []
    while '(' in x and ')' in x:
        i1, i2 = x.find('('), x.find(')')
        inner = x[i1 + 1:i2]
        # Skip one-character groups such as "(a)" and groups without a zero.
        if i2 - i1 != 2 and '0' in inner:
            ans.append(inner)
        # Continue after this closing parenthesis.
        x = x[i2 + 1:]
    return ' '.join(ans)

# Naive amount extraction from each document's ARTICLE II text.
df['Amount'] = df['ARTICLE II'].apply(get_amount)

In [116]:
# Count rows whose Amount is missing.
# NOTE(review): get_amount as defined in this notebook returns '' (never
# None/NaN), so isna() would not count those rows; the figure shown presumably
# comes from an earlier version of the function - confirm.
df.Amount.isna().sum()

433

In [2]:
# Reload a previously saved frame, replacing the `df` built above.
# NOTE(review): no cell shown here writes 'world_bank.pickle', and the loaded
# frame has columns (e.g. 'LOAN AGREEMENT', 'Total Amount') that are not
# created above - presumably saved by an earlier session; confirm provenance.
df = pd.read_pickle('world_bank.pickle')
df.head()

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name,ARTICLE II,Amount
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",300000000,Kolubara B Thermal Power and Lignite Mine Project,ARTICLE II\n\n The...,"$300,000,000"
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 239), 'LOAN AGREEMENT': (239, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 31, 200...",110000000,Railway Restructuring Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"EUR 110,000,000"
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"[China, United States]","{'TITLE': (0, 228), 'LOAN AGREEMENT': (228, 21...",LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"LOAN AGREEMENT \n\nAGREEMENT, dated Literal 4b...",60100000,Beijing Environmental Project,ARTICLE II \n\nThe Loan \n\nSection 2.01. The ...,"$45,000,000"
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"[China, United States]","{'TITLE': (0, 267), 'LOAN AGREEMENT': (267, 48...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"LOAN AGREEMENT\n\nAgreement dated ^46 , 2013, ...",150000000,Liaoning Coastal Economic Zone Urban Infrastru...,ARTICLE II - PROJECT\n\n3.01. The Borrower dec...,
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 364), 'LOAN AGREEMENT': (364, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",55000000,Third Highway Sector Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"$55,000,000"


## Extracting prices using Google Named Entity Analysis

In [3]:
import os
# Set the service-account key file used for Google Cloud authentication.
# NOTE(review): the key file name is hard-coded and relative to the working
# directory, and it overrides any value already set in the environment;
# consider configuring it outside the notebook.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'flash-surge-313319-fb0596b2676b.json'

In [35]:
from google.cloud import language_v1
client = language_v1.LanguageServiceClient()

def get_price(text_content):
    """Return the PRICE entities found in the text that precedes '2.02'.

    The text is truncated shortly before the first occurrence of '2.02' and
    sent to the Google Cloud Natural Language entity analysis as English
    plain text.

    Args:
        text_content: ARTICLE II text, or None/empty when the section was not
            found.

    Returns:
        list[str] | None: Names of the entities whose type is PRICE, in the
        order returned by the API (possibly empty). None when there is no
        input, no '2.02' marker, or no text left before the marker.
    """
    if not text_content:
        return None
    cut = text_content.find('2.02')
    if cut == -1:
        return None
    # The offset of 3 presumably trims the characters leading into the "2.02"
    # marker - TODO confirm. Clamp at 0 so that a marker within the first
    # three characters does not become a negative slice end, which would keep
    # almost the whole text instead of none of it.
    text_content = text_content[:max(cut - 3, 0)]
    if not text_content:
        # Nothing precedes the marker; skip the API call.
        return None

    # Available document types: PLAIN_TEXT, HTML.
    # The language is optional; when omitted the API detects it. Supported
    # languages: https://cloud.google.com/natural-language/docs/languages
    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }

    # Available encoding values: NONE, UTF8, UTF16, UTF32.
    response = client.analyze_entities(
        request={'document': document, 'encoding_type': language_v1.EncodingType.UTF8}
    )

    # Keep only the entities the API classified as PRICE.
    ans = [
        entity.name
        for entity in response.entities
        if language_v1.Entity.Type(entity.type_).name == 'PRICE'
    ]
    # Pause after each request - presumably to stay under the API rate limit.
    time.sleep(0.25)
    return ans
    

In [37]:
# Get prices using Google named-entity analysis (get_price is called once per row).
df['Amount_G'] = df['ARTICLE II'].apply(get_price)

In [38]:
# Preview the first rows, now including the Amount_G column.
df.head()

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name,ARTICLE II,Amount,Amount_G
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",300000000,Kolubara B Thermal Power and Lignite Mine Project,ARTICLE II\n\n The...,"$300,000,000","[three hundred million dollars, $300,000,000]"
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 239), 'LOAN AGREEMENT': (239, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 31, 200...",110000000,Railway Restructuring Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"EUR 110,000,000","[one hundred \nand ten million Euro, EUR 110,0..."
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"[China, United States]","{'TITLE': (0, 228), 'LOAN AGREEMENT': (228, 21...",LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"LOAN AGREEMENT \n\nAGREEMENT, dated Literal 4b...",60100000,Beijing Environmental Project,ARTICLE II \n\nThe Loan \n\nSection 2.01. The ...,"$45,000,000","[forty-five million dollars, $45,000,000]"
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"[China, United States]","{'TITLE': (0, 267), 'LOAN AGREEMENT': (267, 48...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"LOAN AGREEMENT\n\nAgreement dated ^46 , 2013, ...",150000000,Liaoning Coastal Economic Zone Urban Infrastru...,ARTICLE II - PROJECT\n\n3.01. The Borrower dec...,,
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 364), 'LOAN AGREEMENT': (364, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",55000000,Third Highway Sector Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"$55,000,000","[fifty-five million dollars, $55,000,000]"


In [39]:
# 443 files do not have a well-formed ARTICLE II (get_price returned None for them)
df['Amount_G'].isna().sum()

443

In [46]:
# 14 files have an ARTICLE II in which no price was found (empty entity list),
# implying an inaccurate section extraction
(df['Amount_G'].dropna().apply(len) == 0).sum()

14

In [49]:
def get_len(x):
    """Return ``len(x)``, or None when ``x`` is falsy (None or empty)."""
    return len(x) if x else None

In [50]:
# Number of PRICE entities per document (None where the list is missing or empty).
df['price_count'] = df['Amount_G'].apply(get_len)

In [74]:
# Inspect the entity list of the row labelled 6.
df.loc[6, 'Amount_G']

['two hundred fifty million dollars', '$250,000,000']

In [80]:
# Rows with exactly one PRICE entity: at least 191 contracts are using other currencies
df[df['price_count'] == 1].shape

(191, 16)

In [89]:
# Print the ARTICLE II text of the fifth document that has a single PRICE entity.
print(df[df['price_count'] == 1]['ARTICLE II'].iloc[4])

ARTICLE II - LOAN 

2.01. The Bank agrees to lend to the Borrower, on the terms and conditions set forth or 
referred to in this Agreement, the amount of Euro eighteen million one hundred 
thousand (EUR 18,100,000) (“Loan”), as such amount may be converted from 
time to time through a Currency Conversion in accordance with the provisions of 
Section 2.07 of this Agreement to assist in financing the project described in 
Schedule 1 to this Agreement (“Project”). 

2.02. The Borrower may withdraw the proceeds of the Loan in accordance with 

Section IV of Schedule 2 to this Agreement. 

2.03. The Front-end Fee payable by the Borrower shall be equal to one quarter of one 
percent (0.25%) of the Loan amount.   

2.04. The interest payable by the Borrower for each Interest Period shall be at a rate 

equal to LIBOR for the Loan Currency plus the Fixed Spread; provided however, 
that upon a Conversion of all or any portion of the principal amount of the Loan, 
the interest payable by the Bor

In [90]:
def get_amount(x):
    """Pick the amount entry (or entries) out of a list of PRICE entity names.

    NOTE: this redefines ``get_amount``; an earlier cell in this notebook
    defines a different function under the same name.

    Args:
        x: List of PRICE entity names, or None/empty.

    Returns:
        None for missing or empty input; the only entry for a one-element
        list; ``[x[1], x[3]]`` for a four-element list; otherwise ``x[1]``.
    """
    if not x:
        return None
    count = len(x)
    if count == 1:
        return x[0]
    if count == 4:
        # loan has two parts
        return [x[1], x[3]]
    # Two entries, or any other count: take the second entry.
    return x[1]

In [91]:
# Reduce each PRICE entity list to the selected amount entry (or pair of entries).
df['Amount_G_num'] = df['Amount_G'].apply(get_amount)

In [92]:
# Preview the first rows, now including price_count and Amount_G_num.
df.head()

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name,ARTICLE II,Amount,Amount_G,price_count,Amount_G_num
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",300000000,Kolubara B Thermal Power and Lignite Mine Project,ARTICLE II\n\n The...,"$300,000,000","[three hundred million dollars, $300,000,000]",2.0,"$300,000,000"
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 239), 'LOAN AGREEMENT': (239, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 31, 200...",110000000,Railway Restructuring Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"EUR 110,000,000","[one hundred \nand ten million Euro, EUR 110,0...",2.0,"EUR 110,000,000"
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"[China, United States]","{'TITLE': (0, 228), 'LOAN AGREEMENT': (228, 21...",LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"LOAN AGREEMENT \n\nAGREEMENT, dated Literal 4b...",60100000,Beijing Environmental Project,ARTICLE II \n\nThe Loan \n\nSection 2.01. The ...,"$45,000,000","[forty-five million dollars, $45,000,000]",2.0,"$45,000,000"
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"[China, United States]","{'TITLE': (0, 267), 'LOAN AGREEMENT': (267, 48...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"LOAN AGREEMENT\n\nAgreement dated ^46 , 2013, ...",150000000,Liaoning Coastal Economic Zone Urban Infrastru...,ARTICLE II - PROJECT\n\n3.01. The Borrower dec...,,,,
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 364), 'LOAN AGREEMENT': (364, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",55000000,Third Highway Sector Project,ARTICLE II\n\nThe Loan\n\nSection 2.01. The Ba...,"$55,000,000","[fifty-five million dollars, $55,000,000]",2.0,"$55,000,000"


In [93]:
# Persist only the identifying columns, the ARTICLE II text and the extracted amounts.
df.loc[:, [
    'year', 'month', 'day', 'id', 'name',
    'ARTICLE II', 'Amount_G', 'Amount_G_num',
]].to_pickle('world_bank_amounts.pickle')