I think it's worth a scan back through the original data sets to understand which ones actually offered the placed in service variable

In [716]:
import pandas as pd
import numpy as np
import re
import matplotlib as plt
import seaborn as sns
import warnings
import altair as alt
import cpi
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 400)

In [717]:
#Load in HUD data from https://lihtc.huduser.gov/
HUD_lihtc_filepath = 'LIHTCPUB.csv'
hud = pd.read_csv(HUD_lihtc_filepath)

In [718]:
syndicator_data_concat = pd.read_csv("syndicator_data_concat.csv")

#### Some quick checks on year placed in service reliability (can skip to matching sections) 

Below I am checking for differences between year placed in service 
and construction completion date in the syndicator data.

In [719]:
len(con_complete)

3592

In [720]:
con_complete= syndicator_data_concat[["concomdate","yr_pis","df"]]
con_complete=con_complete.dropna()
con_complete["concomdate"] = "20" + con_complete["concomdate"].str[-2:]
con_complete["diff"] = con_complete["yr_pis"].astype(int) - con_complete["concomdate"].astype(int) 
con_complete["diff_sign"] = np.sign(con_complete["diff"])
con_complete["diff_sign"].value_counts()

 0    3254
-1     320
 1      18
Name: diff_sign, dtype: int64

The above shows that in 320 out of 3592 (8.9 percent) cases we have a construction completion date after a placed in service date. This could be plausible, if a building was started to be rented out before it was totally completed, but could also point to data errors. 

In [721]:
con_complete_no_zeros = con_complete[(con_complete['diff_sign'] != 0)&(con_complete['diff_sign'] != 1)]
con_complete_no_zeros[["df","diff_sign"]].groupby("df").count()

Unnamed: 0_level_0,diff_sign
df,Unnamed: 1_level_1
a,21
c,13
e,9
f,277


The vast majority of these circumstances came from a dataset where placed in service and construction completion date were given to us by the syndicator. As you can note below, the raw data from F contains both columns. 

In [722]:
f = pd.read_csv('datasets/Syndicator F Data Set.csv',skiprows=1)#unaltered Syndicator data 
f.columns

Index(['Deal Id', 'Total Project Cost', 'Total Hard Cost', '# of Sources',
       'Debt Inst Type Name', 'Financing Type', 'Hard vs Soft', 'Amount',
       'LOI Date', 'Close Date', 'Constr Start', 'Constr End', 'PIS',
       'Stabilization Date', 'Total Units', 'City', 'State', 'Zip',
       'Target Population'],
      dtype='object')

As a further robustness test for the PIS data, as it will be key to matching, I want to examine where it falls in relationship to the stabilizaiton date. Theoretically, the placed in service date should always procede the stabilitzation date. 

In [723]:
len(stab_date_check)

4730

In [724]:
stab_date_check = syndicator_data_concat[["yr_pis","stabdate","df"]].dropna()
stab_date_check["stabdate"] = "20" + stab_date_check["stabdate"].str[-2:]
stab_date_check["diff"]= stab_date_check["yr_pis"].astype(int) - stab_date_check["stabdate"].astype(int)
stab_date_check["diff_sign"] = np.sign(stab_date_check["diff"])
stab_date_check["diff_sign"].value_counts()

-1    2168
 0    1953
 1     609
Name: diff_sign, dtype: int64

It does not do so 12.8 percent of the time.

In [725]:
stab_date_check[["df","diff_sign"]].groupby("df").describe()

Unnamed: 0_level_0,diff_sign,diff_sign,diff_sign,diff_sign,diff_sign,diff_sign,diff_sign,diff_sign
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
df,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,79.0,-0.86,0.38,-1.0,-1.0,-1.0,-1.0,1.0
b,275.0,-0.64,0.52,-1.0,-1.0,-1.0,0.0,1.0
c,997.0,-0.65,0.52,-1.0,-1.0,-1.0,0.0,1.0
d,1175.0,-0.29,0.51,-1.0,-1.0,0.0,0.0,1.0
e,448.0,-0.33,0.52,-1.0,-1.0,0.0,0.0,1.0
f,983.0,0.33,0.8,-1.0,0.0,1.0,1.0,1.0
g,154.0,-0.86,0.4,-1.0,-1.0,-1.0,-1.0,1.0
h,306.0,-0.84,0.37,-1.0,-1.0,-1.0,-1.0,0.0
i,227.0,-0.22,0.47,-1.0,0.0,0.0,0.0,1.0
j,86.0,-0.77,0.42,-1.0,-1.0,-1.0,-1.0,0.0


This problem affects every syndicator dataset but J and K.

In [726]:
stab_date_check_no_zeros = stab_date_check[(stab_date_check['diff_sign'] != 0)&(con_complete['diff_sign'] != 1)]
stab_date_check_no_zeros[["df","diff_sign"]].groupby("df").count()

Unnamed: 0_level_0,diff_sign
df,Unnamed: 1_level_1
a,69
b,188
c,689
e,171
f,710
g,139
h,257
i,60


B,C,E,G,H,I were the datasets where construction completion date was substituted for PIS date. These datsets do seem to do somewhat worse on this check (especially c), but notably, F, which has the most discrepencies, is not one. 

#### By showing this I am just trying to get a sense of how skeptical we should be of the year placed in service date to see how much wiggle room we should give matching on year. I will artfully translate this moderate skeptism into "2 years wiggle room".

In [727]:
#this is just a quick check to see average distance between construction start date and year placed in service. Its ~1.5
#and since we arent using months, for the later analysis rounding up to 2 seems fine

syndicator_data_concat_con = syndicator_data_concat.dropna(subset=["yr_pis","con_stdate"])

def fixdate(x):
    if x:
        if "/" in x:
            year = x[-2:]
            if year[0] == "9":
                return "19" + year
            else:
                return "20" + year 
        else:
            return x
syndicator_data_concat_con["con_stdate"] = syndicator_data_concat_con.con_stdate.apply(lambda x: fixdate(x))
syndicator_data_concat_con["gap"] = abs(syndicator_data_concat_con["yr_pis"].astype(int) - syndicator_data_concat_con["con_stdate"].astype(float))
syndicator_data_concat_con["gap"].mean()

1.4892672094744634

### Matching 

In [728]:
#take subset of cols we care about
syndicator_data_concat.rename(columns={'df':'syndicator'}, inplace=True)#rename col 
df = syndicator_data_concat[['id', 'syndicator', 'state', 'city', 'zipcode', 'yr_pis',
                             'units_n', 'units_li', 'units_ot', 'sources_n', 'tpop',"totalcost","sources_n"]]

In [729]:
#create new unique indentifiers for each syndicator 
df.rename(columns={'id':'syn_id'}, inplace=True)
df['id'] = np.arange(df.shape[0])

In [730]:
#subset HUD lihtc data for datasets we might match on plus cols we want 
hudf = hud[['hud_id', 'project', 'proj_cty', 'proj_st', 'proj_zip', 'yr_pis', 'n_units', 'li_units', "type",
"credit","bond"]]

In [731]:
#check on data availability of city and zipcode for syndicator data 
#we can see that we wont be able to match A & G on location. Farther down i add on G b/c it has construction type as a 
#column. A is just left out of this analysis as I don't think its possible to match on, but it also only has 93 entries.
df.groupby('syndicator').apply(lambda x: x.notnull().mean())[['city', 'zipcode']]

Unnamed: 0_level_0,city,zipcode
syndicator,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,0.0
b,0.0,1.0
c,1.0,1.0
d,1.0,1.0
e,1.0,0.0
f,0.42,1.0
g,0.0,0.0
h,1.0,0.0
i,1.0,1.0
j,1.0,1.0


In [732]:
#match on zipcode 
zip_match = pd.merge(df.dropna(subset=['zipcode']), hudf.dropna(subset=['proj_zip']), left_on='zipcode', right_on='proj_zip')

In [733]:
#do some cleaning of city name 
hudf.loc[hudf.proj_cty.notnull(), 'city'] = hudf.loc[hudf.proj_cty.notnull(), 'proj_cty'].apply(lambda x: x.lower())
df.loc[df.city.notnull(), 'city'] = df.loc[df.city.notnull(), 'city'].apply(lambda x: x.lower())

In [734]:
#match on city 
city_match = pd.merge(df.dropna(subset=['city']), hudf.dropna(subset=['city']), left_on=['city', 'state'],
                     right_on=['city', 'proj_st'])

In [735]:
# yr_pis match within two year for both city and zip group 
zip_match_yr = zip_match[abs(zip_match.yr_pis_x - zip_match.yr_pis_y)<=2]
city_match_yr = city_match[abs(city_match.yr_pis_x == city_match.yr_pis_y)<=2]

The gap in the whole of the HUD data between the number of lihtc units and total units is about 5 percent. Which seems 
as good as a margin of error as any to use for number of units. 

In [736]:
#filter by +/-5% number of units 
zip_match_yr = zip_match_yr[(abs(zip_match_yr.units_n/zip_match_yr.n_units) >= .95)&(abs(zip_match_yr.units_n/zip_match_yr.n_units) <= 1.05)] 
city_match_yr = city_match_yr[(abs(city_match_yr.units_n/city_match_yr.n_units) >= .95)&(abs(city_match_yr.units_n/city_match_yr.n_units) <= 1.05)] 

In [737]:
#label type of match 
matches = zip_match_yr[~zip_match_yr.id.duplicated()]#[['id', 'hud_id']]
matches['match_type'] = 'zip+yr'

c1 = city_match_yr[~city_match_yr.id.duplicated()]#[['id', 'hud_id']]
c1['match_type'] = 'city+yr'

In [738]:
#add both types of matches together 
matches = matches.append(c1[~(c1.id.isin(matches.id))])

In [739]:
matches

Unnamed: 0,syn_id,syndicator,state,city,zipcode,yr_pis_x,units_n,units_li,units_ot,sources_n,tpop,totalcost,sources_n.1,id,hud_id,project,proj_cty,proj_st,proj_zip,yr_pis_y,n_units,li_units,type,credit,bond,match_type
10,33663,b,ME,,04426,2015.00,24.00,24.00,0.00,3.0,Elderly,"$5,333,498",3.0,94,MEA20150006,NORTH VIEW APARTMENTS,DOVER FOXCROFT,ME,04426,2015,24.00,24.00,1.00,1.00,1.00,zip+yr
11,33709,b,OK,,73134,2013.00,44.00,44.00,0.00,2.0,Elderly,"$6,849,238",2.0,95,OKA20130004,CROSS CREEK LANDING,OKLAHOMA CITY,OK,73134,2013,44.00,,1.00,2.00,2.00,zip+yr
13,33768,b,NC,,27896,2015.00,72.00,72.00,0.00,3.0,Family,"$7,183,151",3.0,96,NCA20150020,RAVENSWOOD APARTMENTS,WILSON,NC,27896,2015,72.00,72.00,1.00,2.00,2.00,zip+yr
25,34280,b,NJ,,07208,2014.00,84.00,83.00,1.00,4.0,Family,"$20,987,345",4.0,98,NJA20120412,WESTMINSTER HEIGHTS,ELIZABETH,NJ,07208,2012,84.00,83.00,1.00,,2.00,zip+yr
33,34320,b,NC,,28412,2015.00,60.00,60.00,0.00,5.0,Family,"$6,965,402",5.0,99,NCA20150016,LOCKWOOD VILLAGE APARTMENTS,WILMINGTON,NC,28412,2015,60.00,60.00,1.00,2.00,2.00,zip+yr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332935,197,i,AZ,somerton,85350,2009.00,33.00,,,6,Family,"$6,132,156",6,6471,AZA00000195,TIERRA DEL CIELO APTS,SOMERTON,AZ,,8888,32.00,26.00,,,,city+yr
332943,202,i,MT,st. regis,59866,2012.00,8.00,,,2,Family,"$1,264,070",2,6476,MTA20120030,TWO RIVERS PLACE,ST. REGIS,MT,59866-0000,2012,8.00,8.00,1.00,2.00,2.00,city+yr
332944,208,i,PR,vieques,765,2006.00,28.00,,,3,Elderly,"$2,773,353",3,6482,PRA20060105,VIEQUES ELDERLY APTS,VIEQUES,PR,,2006,28.00,28.00,3.00,3.00,2.00,city+yr
332949,211,i,PR,villalba,766,2005.00,57.00,,,3,Family,"$6,412,457",3,6485,PRA20050090,VILLALBA HOUSING SE,VILLALBA,PR,,2005,57.00,57.00,1.00,3.00,2.00,city+yr


In [740]:
#_x is syndicator data 
matches = matches.rename(columns={"yr_pis_x":"yr_pis"})

In [741]:
#want to filter for credit == 2, which means a 70% subsidy, aka 9& percent credit project. 

In [742]:
matches = matches[matches.credit == 2]

In [743]:
# there are 733 nulls, which we are throwing out entirely 
matches.credit.isna().sum()

0

In [744]:
# ~30 percent of these projects are not new constufction 
matches.type.value_counts()

1.00    1077
2.00     320
3.00      30
Name: type, dtype: int64

In [745]:
#this is code ot read in data from syndicator G, leaving in case we decide to do so later 
#As mentioned above, G does not have enough location data to use, but I will filter for new construction and append to our dataset
#read in data (these are the syndicator data post Carson's initial pre-processing)
#g = pd.read_csv("/Users/quinnunderriner/Desktop/Work/syndictor_lihtc/syndicator_g.csv")
#g = g[g.con_type == "New Construction"]
#g = g.rename(columns={"df":"syndicator"})
#append new construction G onto matches
#matches = matches.append(g)

In [746]:
matches.head()

Unnamed: 0,syn_id,syndicator,state,city,zipcode,yr_pis,units_n,units_li,units_ot,sources_n,tpop,totalcost,sources_n.1,id,hud_id,project,proj_cty,proj_st,proj_zip,yr_pis_y,n_units,li_units,type,credit,bond,match_type
11,33709,b,OK,,73134,2013.0,44.0,44.0,0.0,2.0,Elderly,"$6,849,238",2.0,95,OKA20130004,CROSS CREEK LANDING,OKLAHOMA CITY,OK,73134,2013,44.0,,1.0,2.0,2.0,zip+yr
13,33768,b,NC,,27896,2015.0,72.0,72.0,0.0,3.0,Family,"$7,183,151",3.0,96,NCA20150020,RAVENSWOOD APARTMENTS,WILSON,NC,27896,2015,72.0,72.0,1.0,2.0,2.0,zip+yr
33,34320,b,NC,,28412,2015.0,60.0,60.0,0.0,5.0,Family,"$6,965,402",5.0,99,NCA20150016,LOCKWOOD VILLAGE APARTMENTS,WILMINGTON,NC,28412,2015,60.0,60.0,1.0,2.0,2.0,zip+yr
53,34378,b,VA,,23881,2016.0,32.0,32.0,0.0,3.0,Family,"$2,669,292",3.0,101,VAA20160027,SURRY VILLAGE II,SPRING GROVE,VA,23881,2016,32.0,32.0,2.0,2.0,2.0,zip+yr
62,34401,b,VA,,22314,2014.0,54.0,54.0,0.0,3.0,Mixed,"$16,007,355",3.0,102,VAA20150013,JAMES BLAND V,ALEXANDRIA,VA,22314,2015,54.0,54.0,1.0,2.0,2.0,zip+yr


### Quick analysis and graphing 

In [747]:
#  make this code less ugly when more awake 
def adjust_for_CPI(date,house_cost):
    if date == 2000:
        return cpi.inflate(house_cost, 2000)
    if date == 2001:
        return cpi.inflate(house_cost, 2001)
    if date  == 2002:
        return cpi.inflate(house_cost, 2002)
    if date == 2003:
        return cpi.inflate(house_cost, 2003)
    if date == 2004:
        return cpi.inflate(house_cost, 2004)    
    if date == 2005:
        return cpi.inflate(house_cost, 2005)
    if date == 2006:
        return cpi.inflate(house_cost, 2006)
    if date == 2007:
        return cpi.inflate(house_cost, 2007)
    if date == 2008:
        return cpi.inflate(house_cost, 2008)
    if date == 2009:
        return cpi.inflate(house_cost, 2009)
    if date == 2010:
        return cpi.inflate(house_cost, 2010)
    if date == 2011:
        return cpi.inflate(house_cost, 2011)
    if date == 2012:
        return cpi.inflate(house_cost, 2012)
    if date == 2013:
        return cpi.inflate(house_cost, 2013)
    if date == 2014:
        return cpi.inflate(house_cost, 2014)
    if date == 2015:
        return cpi.inflate(house_cost, 2015)
    if date == 2016:
        return cpi.inflate(house_cost, 2016)
    if date == 2017:
        return cpi.inflate(house_cost, 2017)
    if date == 2018:
        return cpi.inflate(house_cost, 2018)
    if date == 2019:
        return cpi.inflate(house_cost, 2019)
    if date == 2020:
        return house_cost

In [748]:
def clean_up_for_charts(df):
    #make yr_pis an int, and substract two years for con_yr per discussion 
    df["Year"] = df["yr_pis_y"].astype(int) - 2
    #df = df.drop(columns={"yr_pis"})

    df = df.dropna(subset=["totalcost","Year","units_n"])
    df['totalcost'] = df['totalcost'].replace({'\$': '', ',': ''}, regex=True)
    df['totalcost'] = df['totalcost'].astype(int)
    #find average cost per unit, adjusted for cpi 
    
    df['totalcost_adj'] = df[["Year","totalcost"]].apply(lambda x: adjust_for_CPI(*x), axis=1)
    
    df["Cost Per Unit"] = df["totalcost_adj"] / df["units_n"].astype(int)

    #clean up name 
    df = df.rename(columns = {"sources_n":"Number of Sources","tpop":"Target Population"})
    return df

In [749]:
matches = clean_up_for_charts(matches)
matches = matches.dropna(subset=["Cost Per Unit"])

In [750]:
matches["Number of Sources"] = matches["Number of Sources"].astype(float)

In [751]:
grouped_match = matches.groupby(["Year"]).mean().reset_index()
#need to sort by year and make it a string to not have a comma (like 2,004) in the year name
grouped_match = grouped_match.sort_values("Year")
grouped_match.Year = grouped_match.Year.astype(str)
#make sure no duplicate cols for altair
grouped_match = grouped_match.loc[:,~grouped_match.columns.duplicated()]


In [757]:
alt.Chart(grouped_match).mark_line().encode(
    x='Year',
    y='Cost Per Unit').configure(background="#ffffff").configure_legend().properties(
    title={
      "text": ["Syndicator Cost Per Unit Over Time (CPI Adjusted 2020)"]})


In [756]:
alt.Chart(grouped_match).mark_line().encode(
    x='Year',
    y='Number of Sources'
).configure(background="#ffffff").configure_legend().properties(
    title={
      "text": ["Syndicator Projects: Number of Sources Over Time"]})