In [2]:
import os
import pdfplumber
import tabula
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm

ModuleNotFoundError: No module named 'tabula'

### Import PDF, extract data, and save to CSV

This section walks through the PDF extraction process on a single file (`19-001.pdf`) to show how each step works.

In [5]:
# open the sample PDF with pdfplumber
# NOTE(review): the handle is never explicitly closed in this notebook; it must
# stay open at least until the page object `p0` has been read in the next cell
pdf = pdfplumber.open("19-001.pdf")

# grab the page at index 11, i.e. the 12th page of the PDF (Python counts from 0)
p0 = pdf.pages[11]

In [6]:
# extract the table detected on the selected page; the result is a list of
# rows, each row a list of cell strings (None where a cell is missing)
table = p0.extract_table()
# display the raw rows for inspection before they are turned into a dataframe
table

[['Name of Lender/Source',
  None,
  'Term (months)',
  'Interest Rate',
  'Amount of Funds'],
 ['1)', 'Rabobank Construction Loan', '24', '5.49%', '$15,000,000'],
 ['2)', 'City of Bakersfield - HOME', '660', '3.00%', '$1,500,000'],
 ['3)', 'Deferred Costs', 'N/A', 'N/A', '$1,825,976'],
 ['4)', 'PNC Real Estate Tax Credit Equity', 'N/A', 'N/A', '$2,242,591'],
 ['5)', '', '', '', ''],
 ['6)', '', '', '', ''],
 ['7)', '', '', '', ''],
 ['8)', '', '', '', ''],
 ['9)', '', '', '', ''],
 ['10)', '', '', '', ''],
 ['11)', '', '', '', ''],
 ['12)', '', '', '', ''],
 ['Total Funds For Construction:', None, None, None, '$20,568,567']]

In [7]:
# turn the extracted rows into a pandas dataframe:
# the first row supplies the column labels, every following row is a record
column_names = table[0]
data_rows = table[1:]
df = pd.DataFrame(data=data_rows, columns=column_names)
df

Unnamed: 0,Name of Lender/Source,NaN,Term (months),Interest Rate,Amount of Funds
0,1),Rabobank Construction Loan,24.0,5.49%,"$15,000,000"
1,2),City of Bakersfield - HOME,660.0,3.00%,"$1,500,000"
2,3),Deferred Costs,,,"$1,825,976"
3,4),PNC Real Estate Tax Credit Equity,,,"$2,242,591"
4,5),,,,
5,6),,,,
6,7),,,,
7,8),,,,
8,9),,,,
9,10),,,,


In [8]:
# drop the column that only holds the row numbers ("1)", "2)", ...); the
# unnamed (null) column that follows it then sits at position 0
to_drop = ["Name of Lender/Source"]
df = df.drop(columns=to_drop)

# rename that first column to "Source" by assigning a complete new label list.
# Writing into `df.columns.values[0]` mutates the backing array of an Index,
# which pandas treats as immutable, so that form is fragile across versions.
df.columns = ["Source", *df.columns[1:]]
df

Unnamed: 0,Source,Term (months),Interest Rate,Amount of Funds
0,Rabobank Construction Loan,24.0,5.49%,"$15,000,000"
1,City of Bakersfield - HOME,660.0,3.00%,"$1,500,000"
2,Deferred Costs,,,"$1,825,976"
3,PNC Real Estate Tax Credit Equity,,,"$2,242,591"
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [9]:
# save cleaned dataframe to a csv file; index=False keeps the meaningless
# 0..n row index out of the file, matching the CSVs written by the batch loop
df.to_csv('19-001.csv', index=False)

### Extract all PDF files found with glob

In [10]:
# define filepath to folder with all the PDFs
# NOTE(review): this is a machine-specific absolute path; adjust it per user
# filepath = '/Users/Carolina/Box Sync/Terner Center_Summer 2019/LIHTC Construction Cost Brief/LIHTC Data Entry Project/2019 TCAC Applications/'
filepath = '/Users/ethan/Box Sync/LIHTC Construction Cost Brief/LIHTC Data Entry Project/2019 TCAC Applications/'

# collect every PDF directly inside the folder.
# - `recursive=True` was dropped: it only has an effect with a "**" pattern.
# - the result is sorted because glob returns files in arbitrary order, and
#   later cells rely on the order (e.g. slicing the end of the list).
files = sorted(glob.glob(os.path.join(filepath, "*.pdf")))

In [11]:
# sanity check: list the bare file name of every PDF picked up by the glob
for pdf_name in map(os.path.basename, files):
    print(pdf_name)

19-001.pdf
19-002.pdf
19-003.pdf
19-004.pdf
19-005.pdf
19-006.pdf
19-007.pdf
19-008.pdf
19-009.pdf
19-010.pdf
19-011.pdf
19-012.pdf
19-013.pdf
19-014.pdf
19-015.pdf
19-016.pdf
19-017.pdf
19-018.pdf
19-019.pdf
19-020.pdf
19-021.pdf
19-022.pdf
19-023.pdf
19-024.pdf
19-025.pdf
19-026.pdf
19-027.pdf
19-028.pdf
19-029.pdf
19-030.pdf
19-031.pdf
19-032.pdf
19-033.pdf
19-034.pdf
19-035.pdf
19-036.pdf
19-037.pdf
19-038.pdf
19-039.pdf
19-040.pdf
19-041.pdf
19-042.pdf
19-043.pdf
19-044.pdf
19-045.pdf
19-046.pdf
19-047.pdf
19-048.pdf
19-049.pdf
19-050.pdf
19-051.pdf
19-052.pdf
19-053.pdf
19-054.pdf
19-055.pdf
19-056.pdf
19-057.pdf
19-058.pdf
19-059.pdf
19-060.pdf
19-061.pdf
19-062.pdf
19-063.pdf
19-064.pdf
19-065.pdf
19-066.pdf
19-067.pdf
19-068.pdf
19-069.pdf
19-070.pdf
19-071.pdf
19-072.pdf
19-073.pdf
19-074.pdf
19-075.pdf
19-076.pdf
19-077.pdf
19-078.pdf
19-079.pdf
19-080.pdf
19-081.pdf
19-082.pdf
19-083.pdf
19-084.pdf
19-085.pdf
19-086.pdf
19-087.pdf
19-088.pdf
19-089.pdf
19-090.pdf
19-091.pdf

In [29]:
# Batch extraction: for each PDF, pull the financing table from its page,
# clean it, write one CSV per PDF, and finally write one combined CSV.

# sub-folder (next to the PDFs) that receives every CSV written by this cell
output_subdir = "Extract_PDF_to_CSV"

extracted_frames = []  # one cleaned dataframe per successfully parsed PDF
failed_pdfs = []       # names of the PDFs whose table could not be read
current_path = None    # folder of the most recently visited PDF (None if no files)

# loop through filepaths globbed in above cell (tqdm used for progress bar in loop)
# NOTE(review): only the last four entries of `files` are processed; this looks
# like a leftover test limit - confirm before relying on the combined CSV
for file in tqdm(files[-4:]):
    # get current file path and name (name without the ".pdf" extension)
    current_path = os.path.dirname(file)
    current_pdf = os.path.splitext(os.path.basename(file))[0]
    print("Now extracting: "+current_pdf)

    # open file using pdfplumber; the handle is only needed for the extraction
    with pdfplumber.open(file) as my_pdf:
        # the table is on page index 11 (the 12th page, Python counts from 0),
        # unless the application number is 700+, then it is on page index 12
        page = 11 if int(current_pdf[3:])<700 else 12
        table = my_pdf.pages[page].extract_table()

    # extract_table() yields nothing when no table is detected on the page
    if not table:
        print("Error reading in: " + current_pdf)
        failed_pdfs.append(current_pdf)
        continue

    # save table as a pandas dataframe (first row = header, rest = records)
    df = pd.DataFrame(table[1:], columns=table[0])

    # drop extra column, rename null column in index 0 as "Source"
    try:
        df = df.drop(columns=["Name of Lender/Source"])
    except KeyError:
        # unexpected table layout: record the failure and move on to the next
        # PDF instead of aborting the whole batch
        print("Error reading in: " + current_pdf)
        failed_pdfs.append(current_pdf)
        continue
    # assign a full label list rather than writing into `df.columns.values`,
    # which would mutate the backing array of an immutable Index
    df.columns = ["Source", *df.columns[1:]]

    # save cleaned dataframe to a csv file (with same name as the initial PDF)
    output_dir = os.path.join(current_path, output_subdir)
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, current_pdf + '.csv'), index=False)

    # EXTRA STEPS:
    # 1) Drop null records, add column with PDF filename
    #    (built as a chain so no in-place edit goes through a column
    #    selection, which does not reliably update the parent dataframe)
    df = (
        df.assign(Source=df['Source'].replace('', np.nan))
          .dropna()
          .assign(PDF_Filename=current_pdf)
    )

    # 2) collect the dataframe; everything is concatenated once after the loop
    #    (DataFrame.append no longer exists in pandas 2.x)
    extracted_frames.append(df)

# combine all per-PDF dataframes (empty dataframe when nothing was extracted)
df_all = pd.concat(extracted_frames, ignore_index=True) if extracted_frames else pd.DataFrame()

# 3) Save df_all to file
if current_path is None:
    # `files` was empty, so there is no folder to write into
    print("No PDF files found - nothing to save")
else:
    print("Saving: all_extracted_files.csv")
    output_dir = os.path.join(current_path, output_subdir)
    os.makedirs(output_dir, exist_ok=True)
    df_all.to_csv(os.path.join(output_dir, 'all_extracted_files.csv'), index=False)

# report PDFs that were skipped so they can be inspected manually
if failed_pdfs:
    print("Could not extract: " + ", ".join(failed_pdfs))

# Done
print("DONE")

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Now extracting: 19-700
Now extracting: 19-701
Now extracting: 19-702
Now extracting: 19-703



### Done