In [3]:
import os
import pdfplumber
#import tabula
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm

### Import PDF, extract data, and save to CSV

This section is a test to show how the process for PDF extraction works

In [69]:
# open PDF
pdf = pdfplumber.open("pdfs/19-079.pdf")

# save the 11th page (actually the 12, Python counts from 0)
p0 = pdf.pages[11]

In [78]:
pwd

'/Users/underriner/Desktop/work/ca_lihtc_scraper'

In [70]:
# extract the PDF page
table = p0.extract_table()
table

[['Name of Lender/Source',
  None,
  'Term (months)',
  'Interest Rate',
  'Amount of Funds'],
 ['1)', 'Bonneville Multifamily Capital', '24', '5.500%', '$3,314,248'],
 ['2)', 'USDA Section 515 Loan Assumption', '600', '1.000%', '$1,154,605'],
 ['3)', "Investor's Equity", '', '', '$1,670,677'],
 ['4)', 'Deferred Reserves', '', '', '$694,564'],
 ['5)', 'Deferred Developer Fee', '', '', '$690,374'],
 ['6)', '', '', '', ''],
 ['7)', '', '', '', ''],
 ['8)', '', '', '', ''],
 ['9)', '', '', '', ''],
 ['10)', '', '', '', ''],
 ['11)', '', '', '', ''],
 ['12)', '', '', '', ''],
 ['Total Funds For Construction:', None, None, None, '$7,524,468']]

In [71]:
# save table as a pandas dataframe
df = pd.DataFrame(table[1:], columns=table[0])
df

Unnamed: 0,Name of Lender/Source,NaN,Term (months),Interest Rate,Amount of Funds
0,1),Bonneville Multifamily Capital,24.0,5.500%,"$3,314,248"
1,2),USDA Section 515 Loan Assumption,600.0,1.000%,"$1,154,605"
2,3),Investor's Equity,,,"$1,670,677"
3,4),Deferred Reserves,,,"$694,564"
4,5),Deferred Developer Fee,,,"$690,374"
5,6),,,,
6,7),,,,
7,8),,,,
8,9),,,,
9,10),,,,


In [72]:
# drop extra column, rename null column in index 0 as "Source"
to_drop = ["Name of Lender/Source"]
df.drop(to_drop, inplace=True, axis=1)
df.columns.values[0] = "Source"
df

Unnamed: 0,Source,Term (months),Interest Rate,Amount of Funds
0,Bonneville Multifamily Capital,24.0,5.500%,"$3,314,248"
1,USDA Section 515 Loan Assumption,600.0,1.000%,"$1,154,605"
2,Investor's Equity,,,"$1,670,677"
3,Deferred Reserves,,,"$694,564"
4,Deferred Developer Fee,,,"$690,374"
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [9]:
# save cleaned dataframe to a csv file
#df.to_csv('19-001.csv')

### Glob extract PDF files

In [73]:
# define filepath to folder with all the PDFs
# filepath = '/Users/Carolina/Box Sync/Terner Center_Summer 2019/LIHTC Construction Cost Brief/LIHTC Data Entry Project/2019 TCAC Applications/'
filepath = './pdfs/'

# glob extract files
files = glob.glob(filepath+"*.pdf", recursive = True)

In [77]:
# test to verify that files were properly read in
#for file in files: 
    # print(file)
#    print(os.path.basename(file))

In [40]:
count = 0
df_all = pd.DataFrame()

# loop through filepaths globbed in above cell (tqdm used for progress bar in loop)
for file in tqdm(files):
    # get current file path and name
    current_path = os.getcwd()
    current_pdf = os.path.basename(file)[:-4]
    print("Now extracting: "+current_pdf)
    
    # open file using pdfplumber
    with pdfplumber.open(file) as my_pdf:
        
        # extract the 11th page (actually the 12, Python counts from 0)
        # unless the table has as number 700+, then it's the 12th page NOTE: This was true for 2019, diff numbers in different years
        page = 11 if int(current_pdf[3:])<700 else 12
        p0 = my_pdf.pages[page]
        table = p0.extract_table()
        
        # save table as a pandas dataframe
        df = pd.DataFrame(table[1:], columns=table[0])
        
        # drop extra column, rename null column in index 0 as "Source"
        try:
            to_drop = ["Name of Lender/Source"]
            df.drop(to_drop, inplace=True, axis=1)
            df.columns.values[0]="Source"
        except KeyError:
            print("Error reading in: " + current_pdf)
            break
        
        # save cleaned dataframe to a csv file (with same name as the initial PDF)
        df.to_csv(current_path + "/processed_data/" + current_pdf + '.csv', index=False)
        
        # EXTRA STEPS:
        # 1) Drop null records, add column with PDF filename
        df['Source'].replace('', np.nan, inplace=True)
        df.dropna(inplace=True)
        df['PDF_Filename'] = current_pdf
        
        # 2) append to dataframe with all values
        if count==0:
            df_all = df.copy()
            count+=1
        else:
            df_all = df_all.append(df)

# 3) Save df_all to file
print("Saving: all_extracted_files.csv")
df_all.to_csv(current_path + '/processed_data/all_extracted_files.csv', index=False)

# Done
print("DONE")

#Note: will need to spot check - some will get read in incorreclty 

  0%|          | 0/73 [00:00<?, ?it/s]

Now extracting: 19-075
Now extracting: 19-061
Now extracting: 19-129
Now extracting: 19-101
Now extracting: 19-115
Now extracting: 19-114
Now extracting: 19-100
Now extracting: 19-128
Now extracting: 19-060
Now extracting: 19-074
Now extracting: 19-089
Now extracting: 19-062
Now extracting: 19-076
Now extracting: 19-116
Now extracting: 19-102
Now extracting: 19-103
Now extracting: 19-117
Now extracting: 19-077
Now extracting: 19-063
Now extracting: 19-088
Now extracting: 19-098
Now extracting: 19-067
Now extracting: 19-073
Now extracting: 19-113
Now extracting: 19-107
Now extracting: 19-106
Now extracting: 19-112
Now extracting: 19-072
Now extracting: 19-066
Now extracting: 19-099
Now extracting: 19-070
Now extracting: 19-064
Now extracting: 19-104
Now extracting: 19-110
Now extracting: 19-111
Now extracting: 19-105
Now extracting: 19-059
Now extracting: 19-065
Now extracting: 19-071
Now extracting: 19-097
Now extracting: 19-083
Now extracting: 19-068
Now extracting: 19-108
Now extract