In [149]:
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import PDFPageAggregator
from io import StringIO
import os
import codecs
import re 
import camelot
import pandas as pd
import numpy as np

sectors = ['Livelihoods and basic needs', 'Protection, Gender & Inclusion', 'Health', 'WASH', 'Migration']

# map sectors to naming convention used in "IFRC GO Matrix"
sector_map = {'Livelihoods and basic needs': 'Livelihoods and basic needs',
              'WASH': 'WASH',
              'Health': 'Health',
              'Protection, Gender & Inclusion': 'PGI',
              'Migration': 'Migration'}

In [150]:
def ExtractTxtAndTables():
    for country in os.listdir('Approved Plans'):
#         # Test only on Cook Islands
#         if country != 'Cook Islands':
#             continue

        for file in os.listdir('Approved Plans/'+country):

            if file.startswith('MDR') and file.endswith('pdf'):
                output_string = StringIO()
                with open('Approved Plans/'+country+'/'+file, 'rb') as fh:
                    parser = PDFParser(fh)
                    doc = PDFDocument(parser)
                    rsrcmgr = PDFResourceManager()
                    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    landscapepages = list()
                    for ix, page in enumerate(PDFPage.create_pages(doc)):
                        # only pages with landscape orientation
                        if page.cropbox[2] < 800:
                            continue
                        interpreter.process_page(page)
                        # count landscape page 
                        landscapepages.append(ix+1)
                    
                    landscapepages = str(landscapepages)
                    landscapepages = landscapepages[1:-1]
                    
                    # find all tables on the landscape pages and create excel tables
                    foundtables = camelot.read_pdf('Approved Plans/'+country+'/'+file, pages=landscapepages)
                    for i in range(len(foundtables)):
                        foundtables[i].to_excel('Approved Plans/'+country+'/'+'Table_test'+str(i)+'.xlsx')
                        
                    file1 = codecs.open('Approved Plans/'+country+'/'+file.split('.')[0]+'.txt', "w", "utf-8")  # write mode
                    file1.write(output_string.getvalue())
                    file1.close()

In [151]:
def ParseTablesExtractActivities():
    
    for country in os.listdir('Approved Plans'):
#         # Test only on Cook Islands
#         if country != 'Palau':
#             continue

        # merge all tables
        table_all = pd.DataFrame()
        for table in [x for x in os.listdir('Approved Plans/'+country) if x.endswith('xlsx')]:
                table = pd.read_excel('Approved Plans/'+country+'/'+table)
                table_all = pd.concat([table_all, table])
        table_all = table_all.rename(columns={0: 'code', 1: 'text'})
        table_all = table_all.reset_index()
        
        if table_all.empty:
            print(country, 'missing tables')
            continue

        # specify sector in each table entry
        table_all['sector'] = ''
        sector_active = ''
        for ix, row in table_all.iterrows():
            for sector in sectors:
                search = sector + ' Outcome'
                try:
                    if search.lower() in row.text.lower() and 'P&B' in row.code:
                        sector_active = sector
                    else:
                        continue
                except:
                    pass
            table_all.at[ix, 'sector'] = sector_active
        # print results
        # for sector in table_all.sector.unique():
        #     print(table_all[table_all.sector == sector].head())

        # get activities
        table_activities = table_all.dropna(subset=['code'])
        table_activities = table_activities[table_activities.code.str.contains('AP')]
        
        # if some activities don't have a sector, assign nearest
        if any(table_activities.sector == ''):
            print('ERROR ({}): sector not found for some activities, skipping country'.format(country))
            continue

        # load output_data
        output_data = pd.read_excel('matrix_filled.xlsx')

        # fill output_data
        for ix, row in table_activities.iterrows():
            data_entry = pd.Series({'Country': country,
                                    sector_map[row['sector']]: row['text']
                                   })
            output_data = output_data.append(data_entry, ignore_index=True)

        # save output data
        output_data.to_excel('matrix_filled.xlsx')

In [152]:
def ParseTxtExtractExtraInfoSector():
    
    sectors_txt = sectors
    sectors_txt.append('Health and WASH') # some plans (e.g. Philippines) merge the two sectors
    
    for country in os.listdir('Approved Plans'):
#         # Test only on Cook Islands
#         if country != 'Cook Islands':
#             continue

        for file in os.listdir('Approved Plans/'+country):

            if file.startswith('MDR') and file.endswith('txt'):
                
                # read txt file
                file1 = codecs.open('Approved Plans/' + country + '/' + file.split('.')[0] + '.txt', "r",
                                    "utf-8")  # write mode
                text = file1.read()
                
                # get detailed operational plan
                text = re.split(r"(.*)Detailed Operational Plan", text, re.MULTILINE | re.DOTALL)[-1]
                
                # loop over sectors and extract extra info
                info_sectors = {}
                
                for sector in sectors_txt:
                    if sector == 'Protection, Gender & Inclusion':
                        sector_re = 'Protection, Gender and Inclusion'
                    elif sector == 'WASH':
                        sector_re = 'Water, sanitation and hygiene'
                    else:
                        sector_re = sector
                        
                    text_sector = re.split(sector_re+' \nPeople', text, re.MULTILINE | re.DOTALL)
                    
                    if len(text_sector)==1:
                        continue
                    text_sector = text_sector[1]
                    
                    for line in re.split(r"\n", text_sector, re.MULTILINE | re.DOTALL)[:10]:
                        if 'targeted:' in line:
                            peop_target = re.findall(r"[0-9,]+", line)[0]
                        if 'Requirements (CHF):' in line:
                            budget = re.findall(r"[0-9,]+", line)[0]
                            
                    if sector != 'Health and WASH':
                        info_sectors[sector] = {'Number of targeted (if applicable)': peop_target, 'Funding': budget}
                    else:
                        info_sectors['Health'] = {'Number of targeted (if applicable)': peop_target, 'Funding': budget}
                        info_sectors['WASH'] = {'Number of targeted (if applicable)': peop_target, 'Funding': budget}
                    
        # load output_data
        output_data = pd.read_excel('matrix_filled.xlsx')

        # fill output_data
        for sector, infos in info_sectors.items():
            for field, number in infos.items():
                output_data.at[output_data[output_data['Country']==country][~output_data[sector_map[sector]].isna()].index, field] = number

        # save output data
        output_data.to_excel('matrix_filled.xlsx')

In [153]:
# prepare dataframe
output_data = pd.read_excel('Matrix for data analysis from GO.xlsx')
output_data.drop(output_data.index, inplace=True)
output_data.to_excel('matrix_filled.xlsx')

# do the magic
# ExtractTxtAndTables()
ParseTablesExtractActivities()
ParseTxtExtractExtraInfoSector()

East Asia missing tables
Pacific missing tables
ERROR (Palau): sector not found for some activities, skipping country
Sth Asia missing tables
Sth East Asia missing tables
Livelihoods and basic needs {'Number of targeted (if applicable)': '504,000', 'Funding': '7,185,679'}
Protection, Gender & Inclusion {'Number of targeted (if applicable)': '150,000', 'Funding': '298,200'}
Health {'Number of targeted (if applicable)': '2,042,500', 'Funding': '4,500,001'}
WASH {'Number of targeted (if applicable)': '190,000', 'Funding': '529,724'}




Health {'Number of targeted (if applicable)': '500,000', 'Funding': '4,809,929'}
Migration {'Number of targeted (if applicable)': '100,000', 'Funding': '182,617'}
Protection, Gender & Inclusion {'Number of targeted (if applicable)': '6,000', 'Funding': '533'}
Health {'Number of targeted (if applicable)': '6,000', 'Funding': '15,342'}
Health {'Number of targeted (if applicable)': '2,823,747', 'Funding': '581,998'}
WASH {'Number of targeted (if applicable)': '2,823,747', 'Funding': '581,998'}
Health {'Number of targeted (if applicable)': '2,823,747', 'Funding': '581,998'}
WASH {'Number of targeted (if applicable)': '2,823,747', 'Funding': '581,998'}


IndexError: list index out of range

In [None]:
# move to output, which is ignored by git, so that we are data responsible
os.rename('matrix_filled.xlsx', 'Output/matrix_filled.xlsx')