In [1]:
import os
import pdfplumber
import glob
import re
import pandas as pd
import numpy as np

### Extract affordable housing financing data from Virginia PDFs

In [None]:
#filepath to PDFs
filepath = 'financials/Virgina 2019 Applications/'
# create a list of filepaths 
files = glob.glob(filepath+"*.pdf", recursive = True)

In [160]:
def extract_from_pdf(files):
    """
    Inputs:
        Files (list): List of full filepaths to the PDFs for Virginia. 
    
    This function go through each PDF to search for the first page to mention SOURCE
    OF FUNDS to extract the related data. To speed up the process to not look through entire 
    PDF here I just look through pages 34-41 which seems to be where these table live. 
    It then seperates out the four different charts construction_financing, permanent_financing,
    grants and subsidized_financing into distinct units and calls the populate table function.
    """
    cols = ['Source', 'Term (Months)', 'Interest Rate'," Amount of Funds","Funding Type","File Name"]
    final_df = []
    for file in files:
        try:
            pdf = pdfplumber.open(file)
            for i in range(34, 41):
                if "SOURCES OF FUNDS" in pdf.pages[i].extract_text(x_tolerance=3, y_tolerance=3).replace('\xa0', ' '):
                    page1 = pdf.pages[i].extract_tables()
                    page2 = pdf.pages[i+1].extract_table()
                    break
            construction_financing = page1[0]
            permanent_financing = page1[1]
            grants = page1[2]
            if len(page1) > 3: #this check is needed because sometimes the charts split across pages 
                subsidized_financing = page1[3] + page2
            else:
                subsidized_financing = page2

            big_rows = populate_table(construction_financing,permanent_financing,grants,subsidized_financing,file)
            final_df += big_rows   
        except Exception as e:
            print(file, e)
    return pd.DataFrame(final_df, columns=cols)
    
def populate_table(constuction_financing,permanent_financing,grants,subsidized_financing,file):
    """
    Inputs:
        constuction_financing (list): Extracted tables in list form 
        permanent_financing (list): Extracted tables in list form 
        grants (list): Extracted tables in list form 
        subsidized_financing (list): Extracted tables in list form 
        file (str): filename 
        
    This file takes the extracted financing tables and enumerates over them to index in and 
    pulls out the relevent data. 
    """
    
    
    file_name = re.split("/", file)[-1] #regex to pull the filename from the filepath
    file_name = file_name[0:-7]
    
    rows = []
    constuction_financing = constuction_financing[1:len(constuction_financing)-1] #drop col name row and totals row 
    for i, row in enumerate(constuction_financing):
        if len(constuction_financing[i][1]) > 1: #check if row exists
            new_row = [row[1], 'n/a', 'n/a', row[4],"Construction Financing",file_name]
            rows.append(new_row)
        else:
            continue 
    
    permanent_financing = permanent_financing[1:len(permanent_financing)-1]
    for i, row in enumerate(permanent_financing):  
            if len(permanent_financing[i][-1]) > 1:
                new_row = [row[1], float(row[8])*12, row[6], row[4],"Permanent Financing",file_name]
                rows.append(new_row)
            else:
                continue
                
    grants = grants[1:len(grants)-1]
    for i, row in enumerate(grants):
        if grants[i][1]:
            new_row = [row[1], 'n/a', 'n/a', row[4], "Grants",file_name]
            rows.append(new_row)
 
            
    subsidized_financing = subsidized_financing[1:len(subsidized_financing)-1]        
    for i, row in enumerate(subsidized_financing):
        if subsidized_financing[i][1]:
            new_row = [row[1], 'n/a', 'n/a', row[-1],"Subsidized Financing",file_name]
            rows.append(new_row)
        else:
            continue
            
    return rows
    

These six files below have errors that seem not worth the time to fix. The "Cannot convert None to Decimal."
error seems like a known problem for the pdfplumber package that a patch has not yet been released for
and the two other edge cases are not worth the time, I'll add these six manually to the file. 

In [162]:
df = extract_from_pdf(files)

Virgina 2019 Applications/Friendship-Courtnew.pdf list index out of range
Virgina 2019 Applications/Holley-Pointenew.pdf Cannot convert None to Decimal.
Virgina 2019 Applications/Brady-Squarenew.pdf could not convert string to float: 'l,.._'
Virgina 2019 Applications/Luray-Village.pdf 'NoneType' object has no attribute 'replace'
Virgina 2019 Applications/Joyner-Greenenew.pdf Cannot convert None to Decimal.
Virgina 2019 Applications/Churc-Hill-Northnew.pdf Cannot convert None to Decimal.


In [163]:
#write out the good files to csv
df.to_csv('most_files_virginia.csv', index=False)