In [3]:
import os
import pdfplumber
import glob
import re
import pandas as pd
import numpy as np

### Extract affordable housing financing data from Virginia PDFs

In [7]:
#filepath to PDFs
filepath = 'financials/Virgina 2019 Applications/'
# create a list of filepaths 
files = glob.glob(filepath+"*.pdf", recursive = True)

In [8]:
files

['financials/Virgina 2019 Applications/King-William-Manornew.pdf',
 'financials/Virgina 2019 Applications/Blairs-Schoolnew.pdf',
 'financials/Virgina 2019 Applications/Fairlington-Presbyteriannew.pdf',
 'financials/Virgina 2019 Applications/Mountain-Laurel-ManorIIInew.pdf',
 'financials/Virgina 2019 Applications/Laurel-Ridge.pdf',
 'financials/Virgina 2019 Applications/Groom-Schoolnew.pdf',
 'financials/Virgina 2019 Applications/Crescent-Hallsnew.pdf',
 'financials/Virgina 2019 Applications/Coilenew.pdf',
 'financials/Virgina 2019 Applications/Senseny-Placenew.pdf',
 'financials/Virgina 2019 Applications/Friendship-Courtnew.pdf',
 'financials/Virgina 2019 Applications/Post-West-Ninenew.pdf',
 'financials/Virgina 2019 Applications/Mt.-Sterlingnew.pdf',
 'financials/Virgina 2019 Applications/Lassiter-Courtsnew.pdf',
 'financials/Virgina 2019 Applications/Knightsbridgenew.pdf',
 'financials/Virgina 2019 Applications/Birch-Islandnew.pdf',
 'financials/Virgina 2019 Applications/Maury-River-

In [14]:
test = "1218-Park_Newnew.pdf"

In [15]:
pdf = pdfplumber.open(test)

In [16]:
pdf

<pdfplumber.pdf.PDF at 0x11ad6c438>

In [17]:
page2 = pdf.pages[37].extract_tables()

In [23]:
page2[3]

[['Taxable\xa0Bonds', None, '$400,000'],
 ['Section\xa0220', None, '$0'],
 ['Section\xa0221(d)(3)', None, '$0'],
 ['Section\xa0221(d)(4)', None, '$0'],
 ['Section\xa0236', None, '$0'],
 ['Section\xa0223(f)', None, '$0'],
 ['Other:', '', '$0'],
 [None, '', '']]

In [160]:
def extract_from_pdf(files):
    """
    Inputs:
        Files (list): List of full filepaths to the PDFs for Virginia. 
    
    This function go through each PDF to search for the first page to mention SOURCE
    OF FUNDS to extract the related data. To speed up the process to not look through entire 
    PDF here I just look through pages 34-41 which seems to be where these table live. 
    It then seperates out the four different charts construction_financing, permanent_financing,
    grants and subsidized_financing into distinct units and calls the populate table function.
    """
    cols = ['Source', 'Term (Months)', 'Interest Rate'," Amount of Funds","Funding Type","File Name"]
    final_df = []
    for file in files:
        try:
            pdf = pdfplumber.open(file)
            for i in range(34, 41):
                if "SOURCES OF FUNDS" in pdf.pages[i].extract_text(x_tolerance=3, y_tolerance=3).replace('\xa0', ' '):
                    page1 = pdf.pages[i].extract_tables()
                    page2 = pdf.pages[i+1].extract_tables()
                    break
            construction_financing = page1[0]
            permanent_financing = page1[1]
            grants = page1[2]
            if len(page1) > 3: #this check is needed because sometimes the charts split across pages 
                subsidized_financing = page1[3] + page2[0]
                below_market_Loans
            else:
                subsidized_financing = page2[0]
                

            big_rows = populate_table(construction_financing,permanent_financing,grants,subsidized_financing,file)
            final_df += big_rows   
        except Exception as e:
            print(file, e)
    return pd.DataFrame(final_df, columns=cols)
    
def populate_table(constuction_financing,permanent_financing,grants,subsidized_financing,file):
    """
    Inputs:
        constuction_financing (list): Extracted tables in list form 
        permanent_financing (list): Extracted tables in list form 
        grants (list): Extracted tables in list form 
        subsidized_financing (list): Extracted tables in list form 
        file (str): filename 
        
    This file takes the extracted financing tables and enumerates over them to index in and 
    pulls out the relevent data. 
    """
    
    
    file_name = re.split("/", file)[-1] #regex to pull the filename from the filepath
    file_name = file_name[0:-7]
    
    rows = []
    constuction_financing = constuction_financing[1:len(constuction_financing)-1] #drop col name row and totals row 
    for i, row in enumerate(constuction_financing):
        if len(constuction_financing[i][1]) > 1: #check if row exists
            new_row = [row[1], 'n/a', 'n/a', row[4],"Construction Financing",file_name]
            rows.append(new_row)
        else:
            continue 
    
    permanent_financing = permanent_financing[1:len(permanent_financing)-1]
    for i, row in enumerate(permanent_financing):  
            if len(permanent_financing[i][-1]) > 1:
                new_row = [row[1], float(row[8])*12, row[6], row[4],"Permanent Financing",file_name]
                rows.append(new_row)
            else:
                continue
                
    grants = grants[1:len(grants)-1]
    for i, row in enumerate(grants):
        if grants[i][1]:
            new_row = [row[1], 'n/a', 'n/a', row[4], "Grants",file_name]
            rows.append(new_row)
 
            
    subsidized_financing = subsidized_financing[1:len(subsidized_financing)-1]        
    for i, row in enumerate(subsidized_financing):
        if subsidized_financing[i][1]:
            new_row = [row[1], 'n/a', 'n/a', row[-1],"Subsidized Financing",file_name]
            rows.append(new_row)
        else:
            continue
            
    return rows
    

These six files below have errors that seem not worth the time to fix. The "Cannot convert None to Decimal."
error seems like a known problem for the pdfplumber package that a patch has not yet been released for
and the two other edge cases are not worth the time, I'll add these six manually to the file. 

In [162]:
df = extract_from_pdf(files)

Virgina 2019 Applications/Friendship-Courtnew.pdf list index out of range
Virgina 2019 Applications/Holley-Pointenew.pdf Cannot convert None to Decimal.
Virgina 2019 Applications/Brady-Squarenew.pdf could not convert string to float: 'l,.._'
Virgina 2019 Applications/Luray-Village.pdf 'NoneType' object has no attribute 'replace'
Virgina 2019 Applications/Joyner-Greenenew.pdf Cannot convert None to Decimal.
Virgina 2019 Applications/Churc-Hill-Northnew.pdf Cannot convert None to Decimal.


In [163]:
#write out the good files to csv
df.to_csv('most_files_virginia.csv', index=False)

In [172]:
#Now need to filter for only the projects that got funded
#I just manually grabbed from this list, not worth the time to code it up https://www.vhda.com/BusinessPartners/MFDevelopers/LIHTCProgram/LowIncome%20Housing%20Tax%20Credit%20Program/2019%20final%20Rankings-Board%20approved.pdf
df = pd.read_csv("Virginia_scraped_financing_info.csv")

In [173]:
#list of failed, some duplicates so call a set  
drop_me = ["Bickerstaff-Crossin-2019",
           "White Marsh",
"Church Hill North",
"Carrier Point I",
'Mountain Laurel Manor III',
'Cecelia House',
'Steeplechase Manor',
'Mountain Laurel Manor',
'Luray Village',
'The Heights at Brady Square',
'Bickerstaff Crossing',
'Hanover Apartments',
'Birch Island',
'King William Manor',
'Courthouse Lane',
'Holley Pointe',
'Coile',
'Grande Oak',
'Oakland Pointe',
'Riverside Station',
'Windsor Court',
'Magnolia Place',
'Joyner Greene',
'Marvin Gardens',
'1218-Park_New',
'Steeplechase Manor',
'Hanover',
'Mountain Laurel Manor III',
'Grande Oak',
'Cecelia House',
'Birch Island Apartments',
'Oakland Pointe',
'Bickerstaff Crossing',
'Windsor Court II',
'Mountain Laurel Manor',
'Riverside Station',
'King William Manor',
'Courthouse Lane',
'Luray Village',
'Magnolia Place']
set(drop_me)

df["File Name"] = df["File Name"].str[:-3]#all the file names had "new" at the end of the name, removing that
drop_me = pd.Series(drop_me)
#so now, need to add dashes to the drop me file so that things match 
drop_me = drop_me.str.replace('\xa0', '-') #replace spaces with a dash so that names can be matched to make a tilter 
drop_me = drop_me.append(pd.Series("Mountain-Laurel-ManorIII"))#one edge case of dash in wrong palce 
df2 = df[~df["File Name"].isin(drop_me)]#filter out the non 9 percent accepted applications 
df2["File Name"] = df2["File Name"].dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [175]:
#this filters out duplicates that have the same file name and the same amount of funds associated, 
#and if this is the case, it keeps the funding type permanent finacing, as that contains more info on interest amount
df2 = df2[~df2.duplicated(["File Name", "Amount of Funds"], keep=False) | df2["Funding Type"].eq("Permanent Financing")]

In [177]:
#double check against list 
np.sort(df2["File Name"].dropna().unique())

array(['Arrowbrook', 'Blairs-School', 'Brady-Square', 'Brook-Villas',
       'Carlton-Views-III', 'Cool-Lane', 'Crescent-Halls', 'Cross-Creek',
       'Daffodil-Gardens', 'Fairlington-Presbyterian', 'Friendship-Court',
       'Groom-School', 'Holley-Point', 'Kilmarnock-Village',
       'Knightsbridge', 'Lassiter-Courts', 'Laurel-Ridge',
       'Market-Heights', 'Maury-River-Place', 'Mt.-Sterling', 'PABP',
       'Poplar-Creek', 'Post-West-Nine', 'Powhatan-Terrace',
       'Senseny-Place', 'South-First-Street', 'Spratley-House'],
      dtype=object)

In [178]:
df2.to_csv("Virginia_approved_9%_applications.csv")

In [190]:
df2 = pd.read_csv("Virginia_approved_9%_applications.csv")

In [194]:
df2["Source"] = df2["Source"].str.replace('\xa0', ' ')
df2["Source"] = df2["Source"].str.replace(' - ', ' ')
df2["Source"] = df2["Source"].str.replace('-', ' ')

In [200]:
df2["Source"].value_counts()

VHDA REACH             10
TBD                     9
VHDA SPARC              5
DHCD HOME               4
VHDA                    4
                       ..
Merchants Bank          1
BB&T                    1
VHDA SIP                1
Local Fdtn              1
Sponsor Loan (FHLB)     1
Name: Source, Length: 79, dtype: int64

In [197]:
df2["Source"].unique()

array(['First Citizens Bank', 'VHDA REACH', 'VHDA TAXABLE',
       'Donation of Property', 'First Mortgage', 'Alexandria City Loan',
       'DHCD HOME', 'Town of South Hill', 'TBD', 'VHDA SPARC',
       'DHCD (AHTF)', 'Sponsor Loan (FHLB)',
       'Sponsor Loan (City of Charlottesville)', 'Sponsor Loan (CCF)',
       'VHDA Const. Loan', 'VHDA Taxable', 'VHTF/Sponsor Loan',
       'Construction & Bridge Loan Equity LP', 'VHDA Debt', 'AHIF',
       'American Legion Seller No', 'Bridge Loan', 'Tax Credit Equity',
       'VHDA Perm', 'Loudoun County HTF', 'Permit Fee Waiver',
       'Access National Bank', 'DHCD National Trust Fund',
       'Sponsor Loan (Leashold Interest Seller N',
       'Sponsor Loan (City HOME Funds)', 'Sponsor Loan (RH Factor)',
       'Sponsor Loan (PH Operating Reserves)',
       'Sponsor Loan (NNRHA Development Resources)', 'Housing',
       'Sponsor Subsidy Financing', 'RD 515', 'VHDA',
       'VHDA REACH Loan/MUMI', 'DHCD NHTF', 'DHCD VHTF',
       'Henrico CDBG

In [None]:
bank_loan = ['First Citizens Bank','FHLB of Atlanta','RRHA Seller Note',
             "Bank of America", "First Mortgage",'Merchants Bank', 'Merchants Capital',
             'Merchants Bank', 'Merchants Capital','Access National Bank',
       'Valhalla Mortgage, LLC','Construction & Bridge Loan Equity LP','BB&T','Bridge Loan', "M&T Bank",]
deferred_dev = []
state_credit = ['MUMI','VHDA REACH Loan/MUMI',] 
federal_credit = ['USDA PRLF']
state_grant = ['VHDA REACH','REACH','Taxable REACH','VHDA Const. Loan','DHCD HOME','VHDA Perm','VHDA','DHCD National Trust Fund',
               'VHDA SPARC','DHCD (AHTF)','VHDA Const. Loan','DHCD Planning Grant', 'DHCD HOME/VHTF','DHCD NHTF', 'DHCD VHTF','FHTF','VHDA Debt','VHDA Taxable', 'VHTF/Sponsor Loan',] # VHDA was created by the state but is a public private partnership I think? https://www.vhda.com/BusinessPartners/GovandNon-Profits/CommunityOutreach/Pages/Grant-Programs.aspx financing seems explicltly below market rate https://www.vhda.com/BusinessPartners/MFDevelopers/MFFinancing/Pages/MFFinancing.aspx
local_grant = ['Henrico CDBG/ HOME', 'Richmond CDBG/ HOME/',
       'Chesterfield CDBG/ HOME','Sponsor Loan (Leashold Interest Seller N',
       'Sponsor Loan (City HOME Funds)', 'AHIF','Sponsor Loan (RH Factor)','Sponsor Subsidy Financing',
       'Sponsor Loan (PH Operating Reserves)',"American Legion Seller No",
       'Sponsor Loan (NNRHA Development Resources)','Town of South Hill', 'Alexandria City Loan','Sponsor Loan (NNRHA Resources)','Sponsor Loan','Sponsor Loan (Seller Note)',
             'Sponsor Loan (City of Charlottesville)','Loudoun County HTF', 'Permit Fee Waiver','Local Fdtn','Charlottesville Grants', 'Sponsor Loan (CCF)",] # https://www.hudexchange.info/grantees/richmond-ca/?program=2 
public_private_partnership = ['VHDA TAXABLE','VHDA Taxable Bonds', 'VHDA SPARC',
                              'VHDA Const. Loan', 'VHDA Taxable','VHDA SIP','VHDA Match','Bon Secours/ VCU Health', ] 
federal_home_loan_bank = ['FHLB','Sponsor Loan (FHLB)','FHLBA', 'FHLB AHP',"TCB Loan (FHLB, DHCD, other Resources)"]               
fed_grant = ["USDA RD",'RD 515']
land_donation = ['Donated Land','Bay Aging ‐ land donation','Donation of Property',
                 'Donated Land/ Building','Below‐Market Ground Lease']

In [None]:
conditions = [df["Financing Type"].isin(bank_loan),
    df["Financing Type"].isin(deferred_dev),
    df["Financing Type"].isin(state_credit),
    df["Financing Type"].isin(federal_credit),
    df["Financing Type"].isin(state_grant),
    df["Financing Type"].isin(local_grant),
    df["Financing Type"].isin(public_private_partnership),
    df["Financing Type"].isin(fed_grant),
             df["Financing Type"].isin(federal_home_loan_bank),
             df["Financing Type"].isin(land_donation)]

In [None]:
outputs = ['Bank Loan',"Deferred Developer Fee","State Housing Credit Equity",
    "Federal Housing Credit Equity","State Grant","Local Grant","Public Private Partnership","Federal Home Loan Bank",
           "Federal Grant","Land Donation"]

In [None]:
df["Financing_sub_cat"] = np.select(conditions, outputs, 'Other')
