In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib as plt
import seaborn as sns
import warnings
import altair as alt
import cpi
%matplotlib inline
# Notebook-wide settings: hide library warnings and make wide/long frames readable.
warnings.filterwarnings(action='ignore')
# Render floats with two decimals instead of scientific notation.
pd.options.display.float_format = lambda value: '%.2f' % value
# Show every column, and up to 400 rows, when a frame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 400

In [None]:
# Load the HUD LIHTC project database (downloaded from https://lihtc.huduser.gov/).
# The CSV is expected to sit next to the notebook.
HUD_lihtc_filepath = 'LIHTCPUB.csv'
hud = pd.read_csv(HUD_lihtc_filepath)

In [None]:
# Load the combined syndicator data (one row per project; the "df" column names the source dataset).
syndicator_data_concat = pd.read_csv("syndicator_data_concat.csv")

#### Some quick checks on year placed in service reliability (can skip to matching sections) 

Below I am checking for differences between year placed in service 
and construction completion date in the syndicator data.

In [None]:
# Compare the placed-in-service year with the construction-completion year, per project.
con_complete = syndicator_data_concat[["concomdate", "yr_pis", "df"]].dropna()
# concomdate ends in a two-digit year. Two-digit years starting with 8 or 9 belong to the
# 1900s; prefixing every value with "20" would turn e.g. 1999 into 2099 and make the
# project look as if construction finished a century after it was placed in service.
# NOTE(review): counts quoted in the surrounding text were produced with the old "20"
# prefix and should be re-checked after this change.
con_complete["concomdate"] = (
    con_complete["concomdate"].str[-2:].str[0].isin(["8", "9"]).map({True: "19", False: "20"})
    + con_complete["concomdate"].str[-2:]
)
# diff < 0 means construction completed in a later year than the placed-in-service year.
con_complete["diff"] = con_complete["yr_pis"].astype(int) - con_complete["concomdate"].astype(int)
con_complete["diff_sign"] = np.sign(con_complete["diff"])
con_complete["diff_sign"].value_counts()

The above shows that in 320 out of 3592 cases (8.9 percent) the construction completion date falls after the placed in service date. This is plausible if a building began renting before it was fully complete, but it could also point to data errors. 

In [None]:
# Keep only the rows whose sign is neither 0 nor 1 (i.e. completion year later than the
# placed-in-service year) and count them per source dataset.
con_complete_no_zeros = con_complete[~con_complete["diff_sign"].isin([0, 1])]
con_complete_no_zeros.groupby("df")[["diff_sign"]].count()

The vast majority of these circumstances came from a dataset where placed in service and construction completion date were given to us by the syndicator. As you can note below, the raw data from F contains both columns. 

In [None]:
# Read the raw Syndicator F file to confirm which date columns it supplies.
# skiprows=1 skips the first line of the file, so the header is taken from the second line.
f = pd.read_csv('datasets/Syndicator F Data Set.csv',skiprows=1)#unaltered Syndicator data 
f.columns

As a further robustness test for the PIS data, which will be key to matching, I want to examine where it falls in relation to the stabilization date. Theoretically, the placed in service date should always precede the stabilization date. 

In [None]:
# Compare the placed-in-service year with the stabilization year, per project.
stab_date_check = syndicator_data_concat[["yr_pis", "stabdate", "df"]].dropna()
# stabdate ends in a two-digit year. Two-digit years starting with 8 or 9 belong to the
# 1900s; prefixing every value with "20" would turn e.g. 1999 into 2099.
# NOTE(review): percentages quoted in the surrounding text were produced with the old
# "20" prefix and should be re-checked after this change.
stab_date_check["stabdate"] = (
    stab_date_check["stabdate"].str[-2:].str[0].isin(["8", "9"]).map({True: "19", False: "20"})
    + stab_date_check["stabdate"].str[-2:]
)
# diff > 0 means the project was placed in service in a later year than it stabilized.
stab_date_check["diff"] = stab_date_check["yr_pis"].astype(int) - stab_date_check["stabdate"].astype(int)
stab_date_check["diff_sign"] = np.sign(stab_date_check["diff"])
stab_date_check["diff_sign"].value_counts()

It does not do so 12.8 percent of the time.

In [None]:
# Summary statistics of the sign of (PIS year - stabilization year), per source dataset.
stab_date_check.groupby("df")[["diff_sign"]].describe()

This problem affects every syndicator dataset but J and K.

In [None]:
# Count, per source dataset, the rows whose sign is neither 0 nor 1.
# Both conditions must come from stab_date_check itself: building the second mask from
# con_complete (a different frame with a different index) silently filtered on the
# construction-completion check instead.
# NOTE(review): with diff = yr_pis - stabdate, this keeps diff_sign == -1 (PIS year before
# the stabilization year). If the goal is to count violations (PIS after stabilization),
# the filter should keep diff_sign == 1 instead - confirm the intended direction.
stab_date_check_no_zeros = stab_date_check[(stab_date_check['diff_sign'] != 0)&(stab_date_check['diff_sign'] != 1)]
stab_date_check_no_zeros[["df","diff_sign"]].groupby("df").count()

B, C, E, G, H and I were the datasets where the construction completion date was substituted for the PIS date. These datasets do seem to do somewhat worse on this check (especially C), but notably F, which has the most discrepancies, is not one of them.  

#### By showing this I am just trying to get a sense of how skeptical we should be of the year placed in service date to see how much wiggle room we should give matching on year. I will artfully translate this moderate skepticism into "2 years wiggle room". 

In [None]:
# Quick check of the average distance between construction start date and year placed in service (~1.5 years).
# Since we aren't using months, rounding up to 2 for the later analysis seems fine.

# Only rows with both a placed-in-service year and a construction start date can be compared.
syndicator_data_concat_con = syndicator_data_concat.dropna(subset=["yr_pis","con_stdate"])

def fixdate(x):
    """Turn a slash-separated date string into a four-digit year string.

    The last two characters of a value containing "/" are read as a two-digit
    year. Years starting with 8 or 9 are placed in the 1900s (e.g. "6/30/89" ->
    "1989"); everything else is placed in the 2000s. Previously only 9x years were
    treated as 19xx, so 198x dates came out as 208x.

    Values without a "/" (already a bare year) and non-string values (numbers,
    NaN) are returned unchanged instead of raising.
    """
    if not isinstance(x, str) or "/" not in x:
        return x
    year = x[-2:]
    century = "19" if year[0] in ("8", "9") else "20"
    return century + year
# Normalise the start dates to years, then take the absolute number of years between
# construction start and placed in service; the cell displays the mean gap.
syndicator_data_concat_con["con_stdate"] = syndicator_data_concat_con["con_stdate"].map(fixdate)
syndicator_data_concat_con["gap"] = (
    syndicator_data_concat_con["yr_pis"].astype(int)
    - syndicator_data_concat_con["con_stdate"].astype(float)
).abs()
syndicator_data_concat_con["gap"].mean()

In [None]:
# Full distribution (count, mean, quartiles, extremes) of the start-to-PIS gap in years.
syndicator_data_concat_con["gap"].describe()

### Matching 

In [None]:
# Take the subset of columns we care about.
# The source-dataset label column "df" is renamed to "syndicator" on the full frame.
syndicator_data_concat = syndicator_data_concat.rename(columns={'df': 'syndicator'})
# 'sources_n' is listed once only: selecting it twice produced two identically named
# columns. The explicit copy keeps later column assignments on df from writing to a
# slice of syndicator_data_concat.
df = syndicator_data_concat[['id', 'syndicator', 'state', 'city', 'zipcode', 'yr_pis',
                             'units_n', 'units_li', 'units_ot', 'sources_n', 'tpop',
                             'totalcost', 'con_stdate']].copy()

In [None]:
# Peek at the first rows of the full syndicator frame.
syndicator_data_concat.head()

In [None]:
# List the distinct raw construction-start values to see which formats occur.
df["con_stdate"].unique()

In [None]:
# Python type of the construction-start value stored under index label 0.
type(df["con_stdate"][0])

In [None]:
# Create a new unique identifier for every syndicator row; the syndicator's own id is
# kept as "syn_id".
# The rename is guarded so re-running the cell does not rename the freshly generated
# "id" as well, which would leave two "syn_id" columns.
if 'syn_id' not in df.columns:
    df = df.rename(columns={'id': 'syn_id'})
df['id'] = np.arange(df.shape[0])


In [None]:
# Subset the HUD LIHTC data to the columns we might match on plus the columns we want to keep.
# .copy() makes hudf an independent frame, so the later write of a 'city' column
# targets hudf itself rather than a slice of hud.
hudf = hud[['hud_id', 'project', 'proj_cty', 'proj_st', 'proj_zip', 'yr_pis',
            'n_units', 'li_units', 'type', 'credit', 'bond']].copy()

In [None]:
# Share of non-null city and zipcode values per syndicator dataset.
# A and G cannot be matched on location. G is added further down because it has a
# construction-type column; A is left out of this analysis (it only has 93 entries).
df[['city', 'zipcode']].notnull().groupby(df['syndicator']).mean()

In [None]:
# Candidate pairs by zip code: every syndicator row joined to every HUD project that
# shares its zip code (rows missing a zip code on either side are left out).
zip_match = df.dropna(subset=['zipcode']).merge(
    hudf.dropna(subset=['proj_zip']),
    left_on='zipcode',
    right_on='proj_zip',
)

In [None]:
# Lower-case the city names on both sides so the city join is case-insensitive.
# HUD's cleaned name goes into a new 'city' column; the syndicator column is updated in place.
hudf.loc[hudf['proj_cty'].notnull(), 'city'] = hudf.loc[hudf['proj_cty'].notnull(), 'proj_cty'].map(str.lower)
df.loc[df['city'].notnull(), 'city'] = df.loc[df['city'].notnull(), 'city'].map(str.lower)

In [None]:
# Candidate pairs by location name: same (lower-cased) city and same state.
city_match = df.dropna(subset=['city']).merge(
    hudf.dropna(subset=['city']),
    left_on=['city', 'state'],
    right_on=['city', 'proj_st'],
)

In [None]:
# Keep candidate pairs whose placed-in-service years are within two years of each other
# (_x is the syndicator value, _y the HUD value), for both the zip and the city candidates.
# The city filter previously compared the years with `==`, which yields booleans whose
# absolute value is always <= 2, so no city candidate was ever filtered out by year.
zip_match_yr = zip_match[(zip_match.yr_pis_x - zip_match.yr_pis_y).abs() <= 2]
city_match_yr = city_match[(city_match.yr_pis_x - city_match.yr_pis_y).abs() <= 2]

Across the whole of the HUD data, the gap between the number of LIHTC units and the number of total units is about 5 percent, 
which seems as good a margin of error as any to use for the number of units. 

In [None]:
# Keep pairs whose syndicator unit count is within +/-5% of the HUD unit count
# (ratio between 0.95 and 1.05, both ends included).
zip_match_yr = zip_match_yr[(zip_match_yr["units_n"] / zip_match_yr["n_units"]).abs().between(.95, 1.05)]
city_match_yr = city_match_yr[(city_match_yr["units_n"] / city_match_yr["n_units"]).abs().between(.95, 1.05)]

In [None]:
# Keep the first surviving HUD candidate for each syndicator row and record how the
# pair was found.
matches = zip_match_yr.drop_duplicates(subset='id').assign(match_type='zip+yr')

c1 = city_match_yr.drop_duplicates(subset='id').assign(match_type='city+yr')

In [None]:
# Combine both match types: every zip+yr match, plus the city+yr matches for syndicator
# rows that did not already get a zip match.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
matches = pd.concat([matches, c1[~c1.id.isin(matches.id)]])

In [None]:
# Peek at the combined zip+yr and city+yr matches.
matches.head()

In [None]:
# Keep the year construction started and clean it if it exists; if it does not exist, use PIS - 2.

# Fallback year: HUD's placed-in-service year (yr_pis_y) minus two.
matches["con_stdate_PIS"] = matches["yr_pis_y"] - 2
# Clean the reported start dates on matches itself. The string cast used to be applied
# to df, which has no effect on the already-merged matches frame and would turn missing
# values in df into the literal string "nan" on a re-run. Casting only the non-null
# values here keeps the missing ones detectable for the fallback below.
matches.loc[matches.con_stdate.notnull(), 'con_stdate'] = (
    matches.loc[matches.con_stdate.notnull(), 'con_stdate'].astype(str).apply(fixdate)
)
matches.loc[matches.con_stdate.isnull(), 'con_stdate'] = matches.loc[matches.con_stdate.isnull(), 'con_stdate_PIS']
matches = matches.rename(columns={"con_stdate":"Year"})

In [None]:
# After the merge, the _x suffix marks the syndicator column and _y the HUD column;
# keep the syndicator placed-in-service year under the plain name "yr_pis".
matches = matches.rename(columns={"yr_pis_x":"yr_pis"})

In [None]:
#want to filter for credit == 2, which means a 70% subsidy, aka a 9 percent credit project. 

In [None]:
# Keep only the projects whose HUD credit code equals 2; rows with a missing code are dropped too.
matches = matches.loc[matches["credit"].eq(2)]

In [None]:
# Count the matches with no credit type recorded; the author reports 733 such rows,
# all of which are thrown out.
# NOTE(review): when the cells run top to bottom, the credit == 2 filter has already
# removed every row with a null credit, so this displays 0. Run this check before the
# filter to reproduce the count.
matches.credit.isna().sum()

In [None]:
# ~30 percent of these projects are not new construction
matches["type"].value_counts()

In [None]:
#this is code to read in data from syndicator G, left in case we decide to use it later 
#As mentioned above, G does not have enough location data to use, but I will filter for new construction and append to our dataset
#read in data (these are the syndicator data post Carson's initial pre-processing)
#g = pd.read_csv("/Users/quinnunderriner/Desktop/Work/syndictor_lihtc/syndicator_g.csv")
#g = g[g.con_type == "New Construction"]
#g = g.rename(columns={"df":"syndicator"})
#append new construction G onto matches
#matches = matches.append(g)

### Quick analysis and graphing 

In [None]:
def adjust_for_CPI(date, house_cost, target_year=2020):
    """Express a cost from year `date` in `target_year` dollars.

    Parameters
    ----------
    date : int
        Year the cost was incurred.
    house_cost : number
        Nominal cost in `date` dollars.
    target_year : int, default 2020
        Year whose dollars the result is expressed in.

    Returns
    -------
    number
        `house_cost` unchanged when `date` equals `target_year`, otherwise the value
        returned by `cpi.inflate`.

    The target year is passed explicitly through `to=`: without it `cpi.inflate`
    adjusts to the most recent year in its data, so costs from 2020 (returned as-is)
    and costs from every other year ended up in different dollars.
    """
    if date == target_year:
        return house_cost
    return cpi.inflate(house_cost, date, to=target_year)
def fix_year(x):
    """Replace the two known odd year values; return anything else unchanged.

    8886 becomes 1986 and 9997 becomes 1997.

    NOTE(review): these values look like placeholder years of 8888 / 9999 with the
    "placed in service minus 2" fallback applied - if so they mark an unknown year
    rather than 1986 / 1997. Confirm against the HUD data dictionary.
    """
    for odd_year, real_year in ((8886, 1986), (9997, 1997)):
        if x == odd_year:
            return real_year
    return x
def clean_up_for_charts(df):
    """Return a cleaned copy of the matched projects, ready for charting.

    The input frame is not modified. Steps:

    1. "Year" is converted to numbers and the known odd values are repaired with
       `fix_year` (converting first means year strings are repaired too).
    2. Rows missing "totalcost", "Year" or "units_n" are dropped.
    3. "totalcost" is stripped of "$" and "," and cast to int.
    4. "totalcost_adj" is the CPI-adjusted total cost (`adjust_for_CPI`), and
       "Cost Per Unit" is that value divided by the number of units.
    5. "sources_n" / "tpop" are renamed to "Number of Sources" / "Target Population".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Year", "totalcost", "units_n", "sources_n" and "tpop".

    Returns
    -------
    pandas.DataFrame
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()
    df["Year"] = pd.to_numeric(df["Year"]).apply(fix_year)

    # dropna returns a new frame; copy again so the assignments below are unambiguous.
    df = df.dropna(subset=["totalcost", "Year", "units_n"]).copy()
    # Raw-string pattern: '\$' in a normal string is an invalid escape sequence.
    df["totalcost"] = pd.to_numeric(
        df["totalcost"].replace({r"\$": "", ",": ""}, regex=True)
    ).astype(int)

    df["Year"] = df["Year"].astype(int)
    # Find the average cost per unit, adjusted for CPI. A list comprehension is used so
    # an empty frame still yields an (empty) column.
    df["totalcost_adj"] = [
        adjust_for_CPI(year, cost) for year, cost in zip(df["Year"], df["totalcost"])
    ]

    df["Cost Per Unit"] = df["totalcost_adj"] / df["units_n"].astype(int)
    df = df.dropna(subset=["Cost Per Unit"])

    # Clean up names for the chart axes and legends.
    df = df.rename(columns={"sources_n": "Number of Sources", "tpop": "Target Population"})
    df["Number of Sources"] = df["Number of Sources"].astype(float)
    return df

In [None]:
# Clean the matched projects for charting, then keep only the first occurrence of any
# repeated column name.
matches = clean_up_for_charts(matches)
matches = matches.loc[:,~matches.columns.duplicated()] #double check no duplicate cols before putting to csv


In [None]:
# Collapse the syndicators' different labels into shared "Senior" and
# "Supportive Housing" categories; every other label is left as it is.
matches["Target Population"] = matches["Target Population"].replace({
    **dict.fromkeys(
        ["Senior - Age Restricted", "Elderly", "Senior (62+)", "Senior (55+)"],
        "Senior",
    ),
    **dict.fromkeys(["Special Needs", "Formerly Homeless"], "Supportive Housing"),
})

In [None]:
# Number of matched projects per target-population label.
# NOTE(review): the original note here only says "dropped mixed" - presumably
# mixed-population projects are meant to be excluded; confirm where that happens.
matches["Target Population"].value_counts()

In [None]:
# Keep only the first occurrence of any repeated column name before exporting.
matches = matches.loc[:,~matches.columns.duplicated()] #double check no duplicate cols before putting to csv
# Export is disabled; uncomment to write the matched dataset to disk.
#matches.to_csv("big_syndicator_data_set.csv")

In [None]:
# Average every numeric column per year so the charts have one point per year.
# numeric_only=True is required: matches also holds text columns, and averaging those
# raises a TypeError in pandas >= 2.0.
grouped_match = matches.groupby(["Year"]).mean(numeric_only=True).reset_index()
# Sort by year, then make the year a string so the axis does not show a comma (like 2,004).
grouped_match = grouped_match.sort_values("Year")
grouped_match.Year = grouped_match.Year.astype(str)
# Make sure there are no duplicate columns for altair.
grouped_match = grouped_match.loc[:,~grouped_match.columns.duplicated()]


In [None]:
# Line chart: average CPI-adjusted cost per unit for each year.
(
    alt.Chart(grouped_match)
    .mark_line()
    .encode(x='Year', y='Cost Per Unit')
    .configure(background="#ffffff")
    .configure_legend()
    .properties(title={"text": ["Syndicator Cost Per Unit Over Time (CPI Adjusted 2020)"]})
)


In [None]:
# Restrict to 1991 onwards (Year is a string here, so this is a string comparison),
# then plot the average number of funding sources per year.
grouped_match = grouped_match.loc[grouped_match["Year"] >= "1991"]
(
    alt.Chart(grouped_match)
    .mark_line()
    .encode(x='Year', y='Number of Sources')
    .configure(background="#ffffff")
    .configure_legend()
    .properties(title={"text": ["Syndicator Projects: Number of Sources Over Time"]})
)

In [None]:
# Work on a copy: plain assignment only creates a second name for the same frame, so
# every change made for the target-population charts would also alter `matches`.
matches_target_pop = matches.copy()
# Collapse the syndicators' different labels into shared categories.
matches_target_pop["Target Population"] = matches_target_pop["Target Population"].replace(
    {"Senior - Age Restricted": 'Senior',
    'Elderly':"Senior",
    "Special Needs":"Supportive Housing",
    "Senior (62+)":"Senior",
    "Senior (55+)":"Senior",
    "Formerly Homeless":"Supportive Housing"})

In [None]:
# Keep the three target populations being compared.
matches_target_pop = matches_target_pop[matches_target_pop["Target Population"].isin(["Senior","Family","Supportive Housing"])]
# Sort by year, then make the year a string so the axis does not show a comma (like 2,004).
matches_target_pop = matches_target_pop.sort_values("Year")
matches_target_pop.Year = matches_target_pop.Year.astype(str)
# Average the numeric columns per year and target population. numeric_only=True is
# required because the frame also holds text columns, which raise a TypeError when
# averaged in pandas >= 2.0.
matches_target_pop = (
    matches_target_pop
    .groupby(["Year", "Target Population"])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Line chart: average cost per unit for each year, one line per target population.
(
    alt.Chart(matches_target_pop)
    .mark_line()
    .encode(x='Year', y='Cost Per Unit', color="Target Population")
    .configure(background="#ffffff")
    .configure_legend()
    .properties(title={"text": ["Syndicator Cost Per Unit Over Time by Target Population"]})
)

In [None]:
# Line chart: average number of funding sources for each year, one line per target population.
# The title now names the plotted measure; it previously repeated the cost-per-unit title.
alt.Chart(matches_target_pop).mark_line().encode(
    x='Year',
    y='Number of Sources',
    color="Target Population"
).configure(background="#ffffff").configure_legend().properties(
    title={
      "text": ["Syndicator Projects: Number of Sources Over Time by Target Population"]})