In [2]:
import io

from pathlib import Path
import pandas as pd
import glob
import pickle
import pycountry

In [3]:
files = []
for filepath in glob.iglob('world_bank_loans_txt_clean/*.txt'):
    file_txt = Path(filepath).read_text()
    file_name = filepath.split('/')[1]
    year, month, day, id_, name = file_name.split('_')
    files.append([year, month, day, id_, name, file_txt])

In [4]:
df = pd.DataFrame(files, columns = ['year', 'month', 'day', 'id', 'name', 'file_content'])
df

Unnamed: 0,year,month,day,id,name,file_content
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,World Bank Document\n\nCONFORMED COPY\n\n ...
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,World Bank Document\n\nCONFORMED COPY\n\nLOAN ...
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,World Bank Document\n\nLOAN NUMBER 3415 CHA \n...
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",World Bank Document\n\nMENTS\n\nLOAN NUMBER 82...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,World Bank Document\n\nCONFORMED COPY\n\nLOAN ...
...,...,...,...,...,...,...
3190,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,World Bank Document\n\nCONFORMED COPY\n\n ...
3191,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,World Bank Document\n\nDOC r'1 3 2013OFFICIAL ...
3192,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",World Bank Document\n\nCONFORMED COPY \n\nLOAN...
3193,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,World Bank Document\n\nCONFORMED COPY \n\nLOAN...


In [5]:
def remove_header(x):
    for i in range(len(x)-1):
        if x[i].isupper() and x[i+1].isupper():
            return x[i:]
    return x
df['file_content'] = df['file_content'].apply(remove_header)

In [6]:
def get_country(x):
    ans = []
    for country in pycountry.countries:
        if country.name in x:
            ans.append(country.name)
    return ans
df['countries'] = df['file_content'].apply(lambda x: get_country(x))

In [7]:
sec_names = ['LOAN AGREEMENT', 'ARTICLE I', 'ARTICLE II', 'ARTICLE III', 'ARTICLE IV', 'ARTICLE V', 'ARTICLE VI',
'ARTICLE VII', 'ARTICLE VIII','ARTICLE IX','ARTICLE X','ARTICLE XI', 'SCHEDULE 1', 'SCHEDULE 2', 'SCHEDULE 3', 
             'SCHEDULE 4', 'SCHEDULE 5','SCHEDULE 6','SCHEDULE 7','SCHEDULE 8']

In [8]:
def get_sections(x):
    keys = ['TITLE']
    inds = [0]
    start, stop = 0, len(x)
    for sec in sec_names:
        ss = sec.split(' ')
        for name in [sec + '\n', sec + ' ', ss[0] + ss[1] + ' ',ss[0] + ss[1] + '\n', ss[0] + '  ' + ss[1] + ' ',ss[0] + '  ' + ss[1] + '\n']:
            if name in x:
#                 print(name)
                index = x.find(name, start, stop)
#                 print(index)
                keys.append(sec)
                inds.append(index)
                start = index
                break
#     print(keys, inds)   
    res = {}
    for i, key in enumerate(keys[:-1]):
        res[key] = (inds[i], inds[i+1])
    res[keys[-1]] = (inds[-1], stop)
    return res

df['sections'] = df['file_content'].apply(get_sections)
# df['sections'].apply(lambda x: len(x)).sum()

In [9]:
def get_section(df, section):
    try:
        start, stop = df['sections'][section]

        return df['file_content'][start:stop]
    except:
        return 

title = df.apply(lambda x: get_section(x, 'TITLE'),axis=1)
df.loc[title.index,'TITLE'] =  title

df['LOAN AGREEMENT'] = df.apply(lambda x:get_section(x,'LOAN AGREEMENT'),axis=1)

In [10]:
def find_number(start, end, x):
    try:
        total = ''
        found = False
        for i in range(start, end):
            if x[i].isdigit():
                found = True
                total = total + x[i]
            else:
                if found and (x[i] != ',' or i == end):
#                     if ',' in x[start:i]:
                    return int(total), i
    except:
        return

def find_max_amount(x):

    res = 0
    start, end = 0, len(x)
    while start < end:
        if x[start].isdigit():
            find_num = find_number(start, end, x)
            if find_num and ',' in x[start:find_num[1]]:
                number, start = find_num
                res = max(res, number)
            else:
                start += 1
        else:
            start += 1
    return res
    
df['Total Amount'] = df['file_content'].apply(find_max_amount)   

# def find_total_amount(x):
#     start, end = None, None
    
#     if x and 'TOTAL' in x:
#         start, end = x.find('TOTAL'), len(x)  
#     elif x and '$' in x:
#         start, end = x.find('$'), len(x)

        
#     if start and end:
#         number = find_number(start, end, x)
#         if number:
#             return number[0]
#     else:
#         return 
    
# df['Total Amount'] = df['file_content'].apply(find_total_amount)        

In [11]:
# def find_borrow(x):
#     if not x:
#         return 
#     poss = ['(the Borrower)', '(the  Borrower)', '(the\nBorrower)','(the \nBorrower)', '“the  Borrower”', '("Borrower")', '(“Borrower”)']
#     for bor in poss:
#         if bor in x.replace('\n\n','\n'):
#             return True
        
# df['file_content'].apply(find_borrow).sum()

In [12]:
i = 139
print(df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n'))
df.loc[i,'LOAN AGREEMENT'].replace('\n\n','\n')

LOAN AGREEMENT 
AGREEMENT dated as of the Signature Date between REPÚBLICA ORIENTAL DEL 
URUGUAY (“Borrower”) and INTERNATIONAL BANK FOR RECONSTRUCTION AND 
DEVELOPMENT (“Bank”) for the purpose of providing additional financing for the Original 
Project (as defined in the Appendix to this Agreement).  The Borrower and the Bank hereby agree 
as follows: 



'LOAN AGREEMENT \nAGREEMENT dated as of the Signature Date between REPÚBLICA ORIENTAL DEL \nURUGUAY (“Borrower”) and INTERNATIONAL BANK FOR RECONSTRUCTION AND \nDEVELOPMENT (“Bank”) for the purpose of providing additional financing for the Original \nProject (as defined in the Appendix to this Agreement).  The Borrower and the Bank hereby agree \nas follows: \n'

In [13]:
def get_project_name(x):
    start, end = None, None
    if '(' in x:
        start = x.find('(')
    if ')' in x:
        end = x.find(')')
    if start and end:
        return x[start+1:end]
df['Project Name'] = df['TITLE'].apply(get_project_name)     

In [14]:
df

Unnamed: 0,year,month,day,id,name,file_content,countries,sections,TITLE,LOAN AGREEMENT,Total Amount,Project Name
0,1991,september,12,558671468103155868,conformed-copy--l3361--kolubara-b-thermal-powe...,CONFORMED COPY\n\n ...,"[Serbia, United States]","{'TITLE': (0, 506), 'LOAN AGREEMENT': (506, 25...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Septe...",300000000,Kolubara B Thermal Power and Lignite Mine Project
1,2001,may,31,473681468094784501,conformed-copy--l7054--railway-restructuring-p...,CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"[Poland, United States]","{'TITLE': (0, 239), 'LOAN AGREEMENT': (239, 15...",CONFORMED COPY\n\nLOAN NUMBER 7054 POL\n\nLoan...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 31, 200...",110000000,Railway Restructuring Project
2,1991,november,19,892131468028134392,china--beijing-environmental-project-:-loan-34...,LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"[China, United States]","{'TITLE': (0, 228), 'LOAN AGREEMENT': (228, 21...",LOAN NUMBER 3415 CHA \n\nLoan Agreement \n\n(B...,"LOAN AGREEMENT \n\nAGREEMENT, dated Literal 4b...",60100000,Beijing Environmental Project
3,2013,may,20,945021468224685176,"official-documents-loan-agreement,-l8236-cn-cl...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"[China, United States]","{'TITLE': (0, 267), 'LOAN AGREEMENT': (267, 48...",MENTS\n\nLOAN NUMBER 8236-CN\n\nLoan Agreement...,"LOAN AGREEMENT\n\nAgreement dated ^46 , 2013, ...",150000000,Liaoning Coastal Economic Zone Urban Infrastru...
4,1991,may,7,107521468303622178,conformed-copy--l3230--third-highway-sector-pr...,CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"[Bosnia and Herzegovina, United States]","{'TITLE': (0, 364), 'LOAN AGREEMENT': (364, 14...",CONFORMED COPY\n\nLOAN NUMBER 3230 YU\n\nLoan ...,"LOAN AGREEMENT\n\nAGREEMENT, dated May 7, 1991...",55000000,Third Highway Sector Project
...,...,...,...,...,...,...,...,...,...,...,...,...
3190,1993,october,15,268921468048527743,conformed-copy--l3642--technical-assistance-pr...,CONFORMED COPY\n\n ...,"[Kazakhstan, United States]","{'TITLE': (0, 533), 'LOAN AGREEMENT': (533, 11...",CONFORMED COPY\n\n ...,"LOAN AGREEMENT\n\n AGREEMENT, dated Octob...",38000000,Technical Assistance Project
3191,2013,march,21,584611468237295236,official-documents-loan-agreement-for-loan-812...,"DOC r'1 3 2013OFFICIAL \nN, 321\n\nDOCUMENTS\n...","[Chile, United States]","{'TITLE': (0, 235), 'LOAN AGREEMENT': (235, 50...","DOC r'1 3 2013OFFICIAL \nN, 321\n\nDOCUMENTS\n...","LOAN AGREEMENT\n\nAgreement, as of the Signatu...",40000000,Tertiary Education Finance for Results Project...
3192,2008,april,18,448831468262773500,"loan-agreement,-l7496-ec-conformed.txt",CONFORMED COPY \n\nLOAN NUMBER 7496-EC \n\nLoa...,"[Ecuador, United States]","{'TITLE': (0, 262), 'LOAN AGREEMENT': (262, 63...",CONFORMED COPY \n\nLOAN NUMBER 7496-EC \n\nLoa...,"LOAN AGREEMENT \n\nAgreement dated April 18, 2...",15300000,Chimborazo Development Investment Project (PIDD
3193,2007,june,8,642541468034864651,loan-agreement-l4857-in--andhra-pradesh-commun...,CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"[India, United States]","{'TITLE': (0, 242), 'LOAN AGREEMENT': (242, 27...",CONFORMED COPY \n\nLOAN NUMBER 4857-IN \n\nLoa...,"LOAN AGREEMENT \n\n AGREEMENT dated June 8, 20...",94500000,Andhra Pradesh Community-Based Tank Management...


In [15]:
print(df.loc[3,'file_content'][31000:])


Non-consulting Services under IBRD Loans and IDA Credits and Grants by World Bank
Borrowers" dated January 2011.

13. "Procurement Plan" means the Borrower's procurement plan for the Project, dated
November 26, 2012, and referred to in paragraph 1.18 of the Procurement Guidelines and
paragraph 1.25 of the Consultant Guidelines, as the same shall be updated from time to
time in accordance with the provisions of said paragraphs.

14. "Project Agreement" means the agreement between the Bank and the Project
Implementing Entity -of even date herewith, as the same may be amended from time to
time; and such term includes all schedules and agreements supplemental to the Project
Agreement.

15. "Project Cities" means the Borrower's counties/cities/district of: (a) Panjin City; (b)
Donggang City and Kuandian County (under the jurisdiction of Dandong City); (c)
Longcheng District and Lingyuan City (under the jurisdiction of Chaoyang City); and (d)
Suizhong County (under the jurisdiction of Hu Lu

In [16]:
df.countries.describe()

count                3195
unique                352
top       [United States]
freq                  215
Name: countries, dtype: object

In [17]:
df.countries.apply(len).describe()

count    3195.000000
mean        2.195618
std         0.821734
min         0.000000
25%         2.000000
50%         2.000000
75%         2.000000
max        12.000000
Name: countries, dtype: float64

In [18]:
df.columns

Index(['year', 'month', 'day', 'id', 'name', 'file_content', 'countries',
       'sections', 'TITLE', 'LOAN AGREEMENT', 'Total Amount', 'Project Name'],
      dtype='object')

In [19]:
df_features = df[['year','month','day','id','name','countries','Total Amount','Project Name']]
df_features.to_pickle("extracted_features.pkl")