In [1]:
import io

from pathlib import Path
import pandas as pd
import glob
import pickle
import pycountry

In [2]:
files = []
for filepath in glob.iglob('world_bank_loans_txt/*.txt'):
    file_txt = Path(filepath).read_text()
    file_name = filepath.split('/')[1]
    year, month, day, id_, name = file_name.split('_')
    files.append([year, month, day, id_, name, file_txt])

In [3]:
df = pd.DataFrame(files, columns = ['year', 'month', 'day', 'id', 'name', 'file_content'])
df

Unnamed: 0,year,month,day,id,name,file_content
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA Loan Agreement (Beiji...
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
...,...,...,...,...,...,...
3200,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
3201,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,"OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ..."
3202,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
3203,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...


In [4]:
def remove_header(x):
    for i in range(len(x)-1):
        if x[i].isupper() and x[i+1].isupper():
            return x[i:]
    return x
df['file_content'] = df['file_content'].apply(remove_header)

In [5]:
def get_country(x):
    ans = []
    for country in pycountry.countries:
        if country.name in x:
            ans.append(country.name)
    return ans
df['countries'] = df['file_content'].apply(lambda x: get_country(x))

In [6]:
sec_names = ['LOAN AGREEMENT', 'ARTICLE I', 'ARTICLE II', 'ARTICLE III', 'ARTICLE IV', 'ARTICLE V', 'ARTICLE VI',
'ARTICLE VII', 'ARTICLE VIII','ARTICLE IX','ARTICLE X','ARTICLE XI', 'SCHEDULE 1', 'SCHEDULE 2', 'SCHEDULE 3', 
             'SCHEDULE 4', 'SCHEDULE 5','SCHEDULE 6','SCHEDULE 7','SCHEDULE 8']

In [7]:
def get_sections(x):
    keys = ['TITLE']
    inds = [0]
    start, stop = 0, len(x)
    for sec in sec_names:
        ss = sec.split(' ')
        for name in [sec + '\n', sec + ' ', ss[0] + ss[1] + ' ',ss[0] + ss[1] + '\n', ss[0] + '  ' + ss[1] + ' ',ss[0] + '  ' + ss[1] + '\n']:
            if name in x:
#                 print(name)
                index = x.find(name, start, stop)
#                 print(index)
                keys.append(sec)
                inds.append(index)
                start = index
                break
#     print(keys, inds)   
    res = {}
    for i, key in enumerate(keys[:-1]):
        res[key] = (inds[i], inds[i+1])
    res[keys[-1]] = (inds[-1], stop)
    return res

df['sections'] = df['file_content'].apply(get_sections)
df['sections'].apply(lambda x: len(x)).sum()

36851

In [8]:
def get_section(df, section):
    try:
        start, stop = df['sections'][section]

        return df['file_content'][start:stop]
    except:
        return 

title = df.apply(lambda x: get_section(x, 'TITLE'),axis=1)
df.loc[title.index,'TITLE'] =  title

df['LOAN AGREEMENT'] = df.apply(lambda x:get_section(x,'LOAN AGREEMENT'),axis=1)

In [41]:
find_all_numbers('123_45_abd_5')

123

In [33]:
s = '588'
find_number(0,len(s),s)

In [60]:
def find_number(start, end, x):
    try:
        total = ''
        found = False
        for i in range(start, end):
            if x[i].isdigit():
                found = True
                total = total + x[i]
            else:
                if found and (x[i] != ',' or i == end):
                    return int(total), i
    except:
        return

def find_max_numbers(x):

    res = 0
    start, end = 0, len(x)
    while start < end:
        if x[start].isdigit():
            find_num = find_number(start, end, x)
            if find_num:
                number, start = find_num
                res = max(res, number)
            else:
                start += 1
        else:
            start += 1
    return res
    

def find_total_amount(x):
    start, end = None, None
    
    if x and 'TOTAL' in x:
        start, end = x.find('TOTAL'), len(x)  
    elif x and '$' in x:
        start, end = x.find('$'), len(x)

        
    if start and end:
        number = find_number(start, end, x)
        if number:
            return number[0]
    else:
        return 
    
df['Total Amount'] = df['file_content'].apply(find_total_amount)        

In [61]:
df['Total Amount'].isnull().sum()

49

In [43]:
df['Total Amount max'] = df['file_content'].apply(find_max_numbers)  

In [65]:
(df['Total Amount max'] - df['Total Amount'] < 10).sum()

1811

In [11]:
# def find_borrow(x):
#     if not x:
#         return 
#     poss = ['(the Borrower)', '(the  Borrower)', '(the\nBorrower)','(the \nBorrower)', '“the  Borrower”', '("Borrower")', '(“Borrower”)']
#     for bor in poss:
#         if bor in x.replace('\n\n','\n'):
#             return True
        
# df['file_content'].apply(find_borrow).sum()

In [12]:
i = 139
print(df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n'))
df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n')

LOAN  AGREEMENT
Agreement  dated  5  tan bo  AD 
,  2015,  between  REPUBLIC  OF  POLAND
INTERNATIONAL  BANK  FOR  RECONSTRUCTION  AND
("Borrower") 
DEVELOPMENT  ("Bank").
and 
the 
WHEREAS  the  Borrower  intends  to  contract  from  the  Council  of Europe  Development
Bank (Co-financier  CEB)  a  loan  in  an amount  equal  to three  hundred  million Euro  (6300,000,000)
(the  CEB  Cofinancing)  to  assist  in  financing  part  of the  Project  on  the  terms  and  conditions  set
forth,  respectively,  in  an agreement to be  entered  into between  the Borrower  and the CEB (the CEB
Cofinancing  Agreement).
NOW  THEREFORE  the Borrower  and the  Bank hereby  agree  as  follows:



'LOAN  AGREEMENT\nAgreement  dated  5  tan bo  AD \n,  2015,  between  REPUBLIC  OF  POLAND\nINTERNATIONAL  BANK  FOR  RECONSTRUCTION  AND\n("Borrower") \nDEVELOPMENT  ("Bank").\nand \nthe \nWHEREAS  the  Borrower  intends  to  contract  from  the  Council  of Europe  Development\nBank (Co-financier  CEB)  a  loan  in  an amount  equal  to three  hundred  million Euro  (6300,000,000)\n(the  CEB  Cofinancing)  to  assist  in  financing  part  of the  Project  on  the  terms  and  conditions  set\nforth,  respectively,  in  an agreement to be  entered  into between  the Borrower  and the CEB (the CEB\nCofinancing  Agreement).\nNOW  THEREFORE  the Borrower  and the  Bank hereby  agree  as  follows:\n'

In [13]:
def get_project_name(x):
    start, end = None, None
    if '(' in x:
        start = x.find('(')
    if ')' in x:
        end = x.find(')')
    if start and end:
        return x[start+1:end]
df['Project Name'] = df['TITLE'].apply(get_project_name)     

In [62]:
df[]

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name,Total Amount max
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",300000000.0,Kolubara B Thermal Power and Lignite Mine Project,300000000
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 217), 'LOAN AGREEMENT': (217, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,LOAN AGREEMENT\n\nLOAN NUMBER 7054 POL\n\nAGRE...,10890000.0,Railway Restructuring Project,110000000
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA Loan Agreement (Beijing E...,"[China, United States]","{'TITLE': (0, 219), 'LOAN AGREEMENT': (219, 20...",LOAN NUMBER 3415 CHA Loan Agreement (Beijing E...,"LOAN AGREEMENT AGREEMENT, dated Literal 4b , 1...",45000000.0,Beijing Environmental Project,60100000
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreeme...,[China],"{'TITLE': (0, 283), 'LOAN AGREEMENT': (283, 51...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreeme...,"LOAN AGREEMENT\n\nAgreement dated \n\n, 201...",150000000.0,Liaoning Coastal Economic Zone Urban Infras...,211300004029664
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 360), 'LOAN AGREEMENT': (360, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",400000.0,Third Highway Sector Project,55000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,CONFORMED COPY\n\n ...,"[Kazakhstan, United States]","{'TITLE': (0, 533), 'LOAN AGREEMENT': (533, 11...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Octob...",38000000.0,Technical Assistance Project,38000000
3201,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,"OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ...",[Chile],"{'TITLE': (0, 497), 'LOAN AGREEMENT': (497, 80...","OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ...","LOAN AGREEMENT\n\nAgreement, as of the Sig...",0.0,Tertiary Education Finance for Results Proj...,40000000
3202,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",CONFORMED COPY \n \n\nLOAN NUMBER 7496-EC \n\n...,"[Ecuador, United States]","{'TITLE': (0, 285), 'LOAN AGREEMENT': (285, 67...",CONFORMED COPY \n \n\nLOAN NUMBER 7496-EC \n\n...,LOAN AGREEMENT \n \n\nAgreement dated April ...,4400000.0,Chimborazo Development Investment Project (PIDD,15300000
3203,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"[India, United States]","{'TITLE': (0, 258), 'LOAN AGREEMENT': (258, 28...",CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"LOAN AGREEMENT \n\nAGREEMENT dated June 8, ...",94500000.0,Andhra Pradesh Community-Based Tank Management...,94500000


In [16]:
df[(df['year'] == '1991') & (df['month'] == 'may') & (df['day'] == '7')]

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 360), 'LOAN AGREEMENT': (360, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",400000.0,Third Highway Sector Project
350,1991,may,7,732861468304780251,conformed-copy--l3235--third-highway-sector-pr...,CONFORMED COPY\n\n ...,[United States],"{'TITLE': (0, 771), 'LOAN AGREEMENT': (771, 19...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated May 7...",25000000.0,Third Highway Sector Project
1220,1991,may,7,954661468300588680,conformed-copy--l3233--third-highway-sector-pr...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 726), 'LOAN AGREEMENT': (726, 19...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated May 7...",55000000.0,Third Highway Sector Project
1534,1991,may,7,276491468107079011,conformed-copy--l3231--third-highway-sector-pr...,CONFORMED COPY\n\n ...,"[Croatia, United States]","{'TITLE': (0, 674), 'LOAN AGREEMENT': (674, 18...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated May 7...",75000000.0,Third Highway Sector Project
1806,1991,may,7,742371468164648104,conformed-copy--l3232--third-highway-sector-pr...,CONFORMED COPY\n\n ...,[United States],"{'TITLE': (0, 728), 'LOAN AGREEMENT': (728, 19...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated May 7...",22000000.0,Third Highway Sector Project
1972,1991,may,7,374751468306855715,conformed-copy--l3234--third-highway-sector-pr...,CONFORMED COPY\n\n ...,"[Slovenia, United States]","{'TITLE': (0, 673), 'LOAN AGREEMENT': (673, 18...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated May 7...",60000000.0,Third Highway Sector Project


In [66]:
print(df.loc[4,'file_content'])

CONFORMED COPY

LOAN NUMBER 3230 YU

Loan Agreement

(Third Highway Sector Project)

INTERNATIONAL BANK FOR RECONSTRUCTION
AND DEVELOPMENT

SOCIAL FUND FOR ARTERIAL AND REGIONAL
ROADS OF BOSNIA AND HERZEGOVINA

among

and

and

FEDERAL ASSOCIATION OF REPUBLICAN AND
PROVINCIAL ROAD ORGANIZATIONS OF
YUGOSLAVIA, BELGRADE

Dated May 7, 1991

LOAN NUMBER 3230 Y

LOAN AGREEMENT

AGREEMENT, dated May 7, 1991, among INTERNATIONAL BANK FOR

RECONSTRUCTION AND DEVELOPMENT (the Bank) and SOCIAL FUND FOR
ARTERIAL AND REGIONAL ROADS OF BOSNIA AND HERZEGOVINA (the
Borrower) and FEDERAL ASSOCIATION OF REPUBLICAN AND PROVINCIAL
ROAD ORGANIZATIONS OF YUGOSLAVIA, BELGRADE (FARP).
 

WHEREAS (A) the Socialist Federal Republic of Yugoslavia

(the Guarantor), the Borrower and FARP, having been satisfied as
to the feasibility and priority of the Project described in
Schedule 2 to this Agreement, have requested the Bank to assist in
the financing of the Project;

(B)

by an agreement (the Guarantee Agreement