In [1]:
import io

from pathlib import Path
import pandas as pd
import glob
import pickle
import pycountry

In [2]:
files = []
for filepath in glob.iglob('world_bank_loans_txt/*.txt'):
    file_txt = Path(filepath).read_text()
    file_name = filepath.split('/')[1]
    year, month, day, id_, name = file_name.split('_')
    files.append([year, month, day, id_, name, file_txt])

In [3]:
df = pd.DataFrame(files, columns = ['year', 'month', 'day', 'id', 'name', 'file_content'])
df

Unnamed: 0,year,month,day,id,name,file_content
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA Loan Agreement (Beiji...
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
...,...,...,...,...,...,...
3194,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
3195,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,"OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ..."
3196,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...
3197,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,d\ne\nz\ni\nr\no\nh\nt\nu\nA\n \ne\nr\nu\ns\no...


In [4]:
def remove_header(x):
    for i in range(len(x)-1):
        if x[i].isupper() and x[i+1].isupper():
            return x[i:]
    return x
df['file_content'] = df['file_content'].apply(remove_header)

In [5]:
def get_country(x):
    ans = []
    for country in pycountry.countries:
        if country.name in x:
            ans.append(country.name)
    return ans
df['countries'] = df['file_content'].apply(lambda x: get_country(x))

In [6]:
sec_names = ['LOAN AGREEMENT', 'ARTICLE I', 'ARTICLE II', 'ARTICLE III', 'ARTICLE IV', 'ARTICLE V', 'ARTICLE VI',
'ARTICLE VII', 'ARTICLE VIII','ARTICLE IX','ARTICLE X','ARTICLE XI', 'SCHEDULE 1', 'SCHEDULE 2', 'SCHEDULE 3', 
             'SCHEDULE 4', 'SCHEDULE 5','SCHEDULE 6','SCHEDULE 7','SCHEDULE 8']

In [7]:
def get_sections(x):
    keys = ['TITLE']
    inds = [0]
    start, stop = 0, len(x)
    for sec in sec_names:
        ss = sec.split(' ')
        for name in [sec + '\n', sec + ' ', ss[0] + ss[1] + ' ',ss[0] + ss[1] + '\n', ss[0] + '  ' + ss[1] + ' ',ss[0] + '  ' + ss[1] + '\n']:
            if name in x:
#                 print(name)
                index = x.find(name, start, stop)
#                 print(index)
                keys.append(sec)
                inds.append(index)
                start = index
                break
#     print(keys, inds)   
    res = {}
    for i, key in enumerate(keys[:-1]):
        res[key] = (inds[i], inds[i+1])
    res[keys[-1]] = (inds[-1], stop)
    return res

df['sections'] = df['file_content'].apply(get_sections)
# df['sections'].apply(lambda x: len(x)).sum()

In [8]:
def get_section(df, section):
    try:
        start, stop = df['sections'][section]

        return df['file_content'][start:stop]
    except:
        return 

title = df.apply(lambda x: get_section(x, 'TITLE'),axis=1)
df.loc[title.index,'TITLE'] =  title

df['LOAN AGREEMENT'] = df.apply(lambda x:get_section(x,'LOAN AGREEMENT'),axis=1)

In [9]:
def find_number(start, end, x):
    try:
        total = ''
        found = False
        for i in range(start, end):
            if x[i].isdigit():
                found = True
                total = total + x[i]
            else:
                if found and (x[i] != ',' or i == end):
#                     if ',' in x[start:i]:
                    return int(total), i
    except:
        return

def find_max_numbers(x):

    res = 0
    start, end = 0, len(x)
    while start < end:
        if x[start].isdigit():
            find_num = find_number(start, end, x)
            if find_num and ',' in x[start:find_num[1]]:
                number, start = find_num
                res = max(res, number)
            else:
                start += 1
        else:
            start += 1
    return res
    

# def find_total_amount(x):
#     start, end = None, None
    
#     if x and 'TOTAL' in x:
#         start, end = x.find('TOTAL'), len(x)  
#     elif x and '$' in x:
#         start, end = x.find('$'), len(x)

        
#     if start and end:
#         number = find_number(start, end, x)
#         if number:
#             return number[0]
#     else:
#         return 
    
# df['Total Amount'] = df['file_content'].apply(find_total_amount)        

In [10]:
# def find_borrow(x):
#     if not x:
#         return 
#     poss = ['(the Borrower)', '(the  Borrower)', '(the\nBorrower)','(the \nBorrower)', '“the  Borrower”', '("Borrower")', '(“Borrower”)']
#     for bor in poss:
#         if bor in x.replace('\n\n','\n'):
#             return True
        
# df['file_content'].apply(find_borrow).sum()

In [11]:
i = 139
print(df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n'))
df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n')

LOAN AGREEMENT 
 
 
AGREEMENT dated as of the Signature Date between REPÚBLICA ORIENTAL DEL 
URUGUAY  (“Borrower”)  and  INTERNATIONAL  BANK  FOR  RECONSTRUCTION  AND 
DEVELOPMENT  (“Bank”)  for  the  purpose  of  providing  additional  financing  for  the  Original 
Project (as defined in the Appendix to this Agreement).  The Borrower and the Bank hereby agree 
as follows: 
 
 



'LOAN AGREEMENT \n \n \nAGREEMENT dated as of the Signature Date between REPÚBLICA ORIENTAL DEL \nURUGUAY  (“Borrower”)  and  INTERNATIONAL  BANK  FOR  RECONSTRUCTION  AND \nDEVELOPMENT  (“Bank”)  for  the  purpose  of  providing  additional  financing  for  the  Original \nProject (as defined in the Appendix to this Agreement).  The Borrower and the Bank hereby agree \nas follows: \n \n \n'

In [12]:
def get_project_name(x):
    start, end = None, None
    if '(' in x:
        start = x.find('(')
    if ')' in x:
        end = x.find(')')
    if start and end:
        return x[start+1:end]
df['Project Name'] = df['TITLE'].apply(get_project_name)     

In [13]:
df

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Project Name
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",Kolubara B Thermal Power and Lignite Mine Project
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 217), 'LOAN AGREEMENT': (217, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,LOAN AGREEMENT\n\nLOAN NUMBER 7054 POL\n\nAGRE...,Railway Restructuring Project
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA Loan Agreement (Beijing E...,"[China, United States]","{'TITLE': (0, 219), 'LOAN AGREEMENT': (219, 20...",LOAN NUMBER 3415 CHA Loan Agreement (Beijing E...,"LOAN AGREEMENT AGREEMENT, dated Literal 4b , 1...",Beijing Environmental Project
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreeme...,[China],"{'TITLE': (0, 283), 'LOAN AGREEMENT': (283, 51...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreeme...,"LOAN AGREEMENT\n\nAgreement dated \n\n, 201...",Liaoning Coastal Economic Zone Urban Infras...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 360), 'LOAN AGREEMENT': (360, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",Third Highway Sector Project
...,...,...,...,...,...,...,...,...,...,...,...
3194,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,CONFORMED COPY\n\n ...,"[Kazakhstan, United States]","{'TITLE': (0, 533), 'LOAN AGREEMENT': (533, 11...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Octob...",Technical Assistance Project
3195,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,"OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ...",[Chile],"{'TITLE': (0, 497), 'LOAN AGREEMENT': (497, 80...","OFFICIAL \nDOC \nDOCUMENTS\n\nN, 321\nr'1 3 ...","LOAN AGREEMENT\n\nAgreement, as of the Sig...",Tertiary Education Finance for Results Proj...
3196,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",CONFORMED COPY \n \n\nLOAN NUMBER 7496-EC \n\n...,"[Ecuador, United States]","{'TITLE': (0, 285), 'LOAN AGREEMENT': (285, 67...",CONFORMED COPY \n \n\nLOAN NUMBER 7496-EC \n\n...,LOAN AGREEMENT \n \n\nAgreement dated April ...,Chimborazo Development Investment Project (PIDD
3197,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"[India, United States]","{'TITLE': (0, 258), 'LOAN AGREEMENT': (258, 28...",CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"LOAN AGREEMENT \n\nAGREEMENT dated June 8, ...",Andhra Pradesh Community-Based Tank Management...


In [14]:
print(df.loc[3,'file_content'][31000:])

  June  14,  2009  and  to  Business  License  No.  211300004029664  dated  May
25,  2009,  issued  by  Chaoyang  Municipality  Industry  and  Commerce  Administrative
Bureau,  and  any successor  thereto.

"Consultant  Guidelines"  means 
the  "Guidelines:  Selection  and  Employment  of
Consultants  under  I3RD  Loans  and  IDA  Credits  and  Grants  by  World  Bank  Borrowers"
dated January  2011.

"Displaced  Persons"  means persons who,  on  account  of the execution  of the Project,  have
experienced  or  would  experience  direct  economic  and  social  impacts  caused  by:  (a) the
involuntary  taking  of land,  resulting  in:  (i)  relocation or  loss  of shelter;  (ii)  loss  of assets
or  access to  assets;  or  (iii)  loss  of income  sources  or  means  of livelihood,  whether or  not
such  persons  must move  to another  location;  or  (b)  the involuntary  restriction  of access  to
impacts  on  the
legally  designated  parks  and  protected  areas,  resulting  in  adver