In [3]:
import os
import pdfplumber
import glob
import re
import pandas as pd
import numpy as np

### Extract affordable housing financing data from Virginia PDFs

In [4]:
#filepath to PDFs
# NOTE(review): "Virgina" is presumably the directory's actual spelling on disk -- confirm before correcting it.
filepath = 'financials/Virgina 2019 Applications/'
# Create a list of filepaths for every PDF directly inside `filepath`.
# The pattern contains no "**", so glob's recursive flag had no effect and is omitted.
# glob returns matches in arbitrary order; sorting keeps the processing order
# (and the row order of the scraped output) reproducible between runs.
files = sorted(glob.glob(filepath + "*.pdf"))

In [5]:
def extract_from_pdf(files, page_range=(34, 41)):
    """
    Scrape the financing tables from each Virginia application PDF.

    Inputs:
        files (list): List of full filepaths to the PDFs for Virginia.
        page_range (tuple): (start, stop) indices into pdf.pages to search, with
            stop exclusive. The default (34, 41) scans indices 34 through 40,
            which seems to be where these tables live; scanning only this window
            avoids reading the entire PDF.

    Returns:
        pandas.DataFrame with the columns listed in `cols`, one row per row
        returned by populate_table. Files that fail for any reason are reported
        with print() and contribute no rows.

    For each PDF the first page in the window whose text contains
    "SOURCES OF FUNDS" is located. The tables on that page and on the following
    page are extracted and separated into construction_financing,
    permanent_financing, grants and subsidized_financing, which are then passed
    to populate_table.
    """
    # NOTE(review): " Amount of Funds" carries a leading space. It is kept as-is
    # because it defines the output column name; a later cell refers to
    # "Amount of Funds" without the space -- TODO reconcile.
    cols = ['Source', 'Term (Months)', 'Interest Rate'," Amount of Funds","Funding Type","File Name"]
    final_df = []
    first, stop = page_range
    for file in files:
        try:
            # Reset per file so a PDF without the heading can never reuse the
            # tables extracted from the previous PDF.
            page1 = page2 = None
            # The context manager closes the PDF even when extraction fails.
            with pdfplumber.open(file) as pdf:
                for i in range(first, min(stop, len(pdf.pages))):
                    # extract_text can return None for a page without text.
                    text = pdf.pages[i].extract_text(x_tolerance=3, y_tolerance=3) or ""
                    if "SOURCES OF FUNDS" in text.replace('\xa0', ' '):
                        page1 = pdf.pages[i].extract_tables()
                        page2 = pdf.pages[i+1].extract_tables()
                        break
            if page1 is None:
                raise ValueError('"SOURCES OF FUNDS" not found on page indices %d-%d' % (first, stop - 1))

            construction_financing = page1[0]
            permanent_financing = page1[1]
            grants = page1[2]
            if len(page1) > 3: #this check is needed because sometimes the charts split across pages
                subsidized_financing = page1[3] + page2[0]
            else:
                subsidized_financing = page2[0]

            final_df += populate_table(construction_financing,permanent_financing,grants,subsidized_financing,file)
        except Exception as e:
            # Deliberate best effort: report the file and carry on with the rest.
            print(file, e)
    return pd.DataFrame(final_df, columns=cols)
    
def populate_table(constuction_financing,permanent_financing,grants,subsidized_financing,file):
    """
    Flatten the four extracted financing tables into output rows.

    Inputs:
        constuction_financing (list): Extracted table in list-of-rows form
        permanent_financing (list): Extracted table in list-of-rows form
        grants (list): Extracted table in list-of-rows form
        subsidized_financing (list): Extracted table in list-of-rows form
        file (str): path of the PDF the tables came from

    Returns:
        list of 6-item lists:
        [source, term in months, interest rate, amount, funding type, file name].
        Term and interest rate are 'n/a' for every table except permanent financing.

    The first row (column names) and last row (totals) of every table are
    dropped. Empty cells (None or blank) mark a row as absent and the row is
    skipped rather than raising.
    """

    def _has_text(cell):
        # None-safe version of the original `len(cell) > 1` row-exists check.
        return cell is not None and len(cell) > 1

    # Split on both separators so paths produced on Windows work as well.
    file_name = re.split(r"[\\/]", file)[-1]
    # Drop the last 7 characters of the file name.
    # NOTE(review): presumably a fixed-length suffix plus ".pdf" -- confirm.
    file_name = file_name[0:-7]

    rows = []

    for row in constuction_financing[1:-1]: #drop col name row and totals row
        if _has_text(row[1]): #check if row exists
            rows.append([row[1], 'n/a', 'n/a', row[4],"Construction Financing",file_name])

    for row in permanent_financing[1:-1]:
        # This table is keyed on its last column rather than on column 1.
        if _has_text(row[-1]):
            # row[8] is multiplied by 12 to fill "Term (Months)".
            # NOTE(review): presumably a term in years -- confirm.
            rows.append([row[1], float(row[8])*12, row[6], row[4],"Permanent Financing",file_name])

    for row in grants[1:-1]:
        if row[1]:
            rows.append([row[1], 'n/a', 'n/a', row[4], "Grants",file_name])

    for row in subsidized_financing[1:-1]:
        if row[1]:
            # The amount is taken from the last column of this table.
            rows.append([row[1], 'n/a', 'n/a', row[-1],"Subsidized Financing",file_name])

    return rows
    

The six files listed below fail to parse, and the errors do not seem worth the time to fix. The "Cannot convert None to Decimal."
error appears to be a known problem in the pdfplumber package for which no patch has been released yet,
and the other two edge cases are not worth the effort either, so I'll add these six files to the output manually.

In [6]:
# Scrape every PDF in `files`; files that raise are reported with print() inside extract_from_pdf and skipped.
df = extract_from_pdf(files)

In [7]:
#write out the good files to csv
df.to_csv('most_files_virginia.csv', index=False)

# "Virginia_scraped_financing_info.csv" is loaded by the next cell; loading it here as well only overwrote the scraped `df` this cell had just written.

In [8]:
#Load the financing data that the funded-projects filter (applied in a later cell) works on.
#NOTE(review): this is a different file from 'most_files_virginia.csv' written above; presumably it is that export with the failing applications added by hand -- confirm.
df = pd.read_csv("Virginia_scraped_financing_info.csv")

In [12]:
#Now need to filter for only the projects that got funded
#I just manually grabbed from this list: not worth the time to code it up as just using 2019 https://www.vhda.com/BusinessPartners/MFDevelopers/LIHTCProgram/LowIncome%20Housing%20Tax%20Credit%20Program/2019%20final%20Rankings-Board%20approved.pdf

#set of failed applications (each name listed once; repeats add nothing to an isin() lookup)
drop_me = [
    "Bickerstaff-Crossin-2019",
    "White Marsh",
    "Church Hill North",
    "Carrier Point I",
    'Mountain Laurel Manor III',
    'Cecelia House',
    'Steeplechase Manor',
    'Mountain Laurel Manor',
    'Luray Village',
    'The Heights at Brady Square',
    'Bickerstaff Crossing',
    'Hanover Apartments',
    'Birch Island',
    'King William Manor',
    'Courthouse Lane',
    'Holley Pointe',
    'Coile',
    'Grande Oak',
    'Oakland Pointe',
    'Riverside Station',
    'Windsor Court',
    'Magnolia Place',
    'Joyner Greene',
    'Marvin Gardens',
    '1218-Park_New',
    'Hanover',
    'Birch Island Apartments',
    'Windsor Court II',
]

#all the file names had "new" at the end of the name, removing that.
#The slice is positional, so it is applied to a new frame instead of overwriting df["File Name"]:
#re-running this cell against an already-trimmed column would cut three more characters every time.
trimmed = df.assign(**{"File Name": df["File Name"].str[:-3]})

drop_me = pd.Series(drop_me)
#so now, need to add dashes to the drop me names so that things match:
#replace regular and non-breaking spaces alike with a dash so that names can be matched to make a filter
drop_me = drop_me.str.replace(r"[ \xa0]", "-", regex=True)
#one edge case of dash in wrong place (Series.append was removed in pandas 2.0, hence pd.concat)
drop_me = pd.concat([drop_me, pd.Series(["Mountain-Laurel-ManorIII"])], ignore_index=True)

#filter out the non 9 percent accepted applications; .copy() gives df2 its own data so later edits do not warn
df2 = trimmed[~trimmed["File Name"].isin(drop_me)].copy()
#Rows with a missing File Name are kept, as before: the former
#`df2["File Name"] = df2["File Name"].dropna()` re-aligned on the index and changed nothing.

In [13]:
#Rows sharing both file name and amount of funds are treated as duplicates of one another.
#Among such duplicates only the "Permanent Financing" rows are kept, because those carry the
#interest rate information; rows that are not duplicated at all are always kept.
shares_name_and_amount = df2.duplicated(["File Name", "Amount of Funds"], keep=False)
is_permanent_financing = df2["Funding Type"].eq("Permanent Financing")
df2 = df2[~shares_name_and_amount | is_permanent_financing]

In [14]:
#double check against list
#sorted unique file names left in df2, ignoring missing values
np.sort(df2["File Name"].dropna().unique())

array(['', '1218', 'A', 'Bickerstaff-Cro', 'Bir', 'Blai', 'Bra', 'Bro',
       'Carlton-', 'Carrie', 'Cece', 'Church-H', 'Courth', 'Cr', 'Cresc',
       'Daffodi', 'Fairlington-Pre', 'Friends', 'G', 'Gro', 'Hol', 'Joyn',
       'Kilmarnoc', 'King-Will', 'Knig', 'Lassit', 'Lau', 'Lura', 'Magno',
       'Marke', 'Marvi', 'Maury-Ri', 'Mountain-Lau', 'Mountain-Laurel',
       'Mt.', 'Oakla', 'Pop', 'Post-', 'Powhata', 'Riversid', 'Sens',
       'South-Fir', 'Sprat', 'Steeplech', 'Wh', 'Wind'], dtype=object)

In [15]:
#display the filtered frame for a visual check
df2

Unnamed: 0,Source,Term (Months),Interest Rate,Amount of Funds,Funding Type,File Name
0,TBD,,,"$5,500,000",Construction Financing,King-Will
1,VHDA REACH,420.0,2.95%,"$800,000",Permanent Financing,King-Will
2,VHDA Taxable,420.0,5.50%,"$2,100,000",Permanent Financing,King-Will
3,First Citizens Bank,,,"$6,350,000",Construction Financing,Blai
4,VHDA REACH,360.0,2.95%,"$1,000,000",Permanent Financing,Blai
...,...,...,...,...,...,...
248,Construction Loan,,,"$7,530,000",Construction Financing,Church-H
249,REACH,420.0,2.95%,"$2,000,000",Permanent Financing,
250,Taxable REACH,420.0,5.50%,"$1,000,000",Permanent Financing,
251,RRHA Seller Note,480.0,0%,"$480,000",Permanent Financing,


In [None]:
#df2.to_csv("Virginia_approved_9%_applications.csv")
#df2 = pd.read_csv("Virginia_approved_9%_applications.csv")