In [146]:
import pandas as pd
import numpy as np
import re
import matplotlib as plt
import seaborn as sns
import warnings
import altair as alt
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 4000)

In [147]:
# Load the three syndicator extracts (these are the files produced by Carson's
# initial pre-processing), one frame per syndicator.
b, i, g = (
    pd.read_csv(csv_path)
    for csv_path in ("syndicator_b.csv", "syndicator_i.csv", "syndicator_g.csv")
)

In [148]:
# Preview the first rows of syndicator B to inspect the raw column layout.
b.head()

Unnamed: 0.1,Unnamed: 0,id,totalcost,constcost,hardcost,acqucost,landcost,equity_lif,equity_lis,con_type,sources_n,lien1,lien1type,lien2,lien2type,lien3,lien3type,lien4,lien4type,lien5,lien5type,lien6,lien6type,lien7,lien7type,lien8,lien8type,lien9,lien9type,lien10,lien10type,acqudate,con_stdate,concomdate,stabdate,units_n,units_li,units_ot,zipcode,build_n,sqft,tpop,state,df,yr_pis
0,0,33532,"$32,481,257","$32,481,257","$26,248,298",$-,$-,"$10,363,780",$-,New,6.0,"$8,876,272",Soft,"$1,750,000",Soft,"$2,714,861",Soft,"$2,500,000",Soft,"$5,882,070",Soft,,,,,,,,,,,2/7/13,9/1/11,12/23/14,11/30/15,75.0,74.0,1.0,2111,1.0,35986,Elderly,MA,b,2014
1,1,33663,"$5,333,498","$5,179,631","$3,403,697",$-,"$153,867","$1,565,780",$-,New,3.0,"$1,000,000",Hard,"$2,332,547",Soft,,,,,,,,,,,,,,,,,10/31/14,11/1/14,9/8/15,10/29/15,24.0,24.0,0.0,4426,1.0,14436,Elderly,ME,b,2015
2,2,33709,"$6,849,238","$6,849,238","$4,971,098",$-,$-,"$6,336,867",$-,New,2.0,"$900,000",Hard,,,,,,,,,,,,,,,,,,,5/1/12,9/21/12,7/29/13,7/30/15,44.0,44.0,0.0,73134,23.0,41770,Elderly,OK,b,2013
3,3,33768,"$7,183,151","$6,626,903","$6,270,653",$-,"$556,248","$5,572,243",$-,New,3.0,"$600,000",Hard,"$1,895,224",Soft,,,,,,,,,,,,,,,,,3/21/14,2/20/14,6/11/15,10/20/15,72.0,72.0,0.0,27896,9.0,61308,Family,NC,b,2015
4,4,34008,"$5,957,537","$5,161,743","$5,161,743",$-,"$795,794","$4,030,489",$-,New,3.0,"$1,695,600",Hard,"$1,816,045",Soft,,,,,,,,,,,,,,,,,8/1/13,8/6/13,12/23/14,12/11/15,56.0,56.0,0.0,27030,5.0,58736,Family,NC,b,2014


#### Syndicator B 

In [149]:
# Columns needed to compute average cost per unit for each syndicator.
cols = ["totalcost","con_type","units_n","yr_pis","sources_n"]

def clean_syndicator_cols(df,cols,df_name):
    """
    Keep only the columns needed to find average unit cost and make them numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw syndicator frame; must contain every name in ``cols`` as well as
        "totalcost" and "units_n".
    cols : list of str
        Columns to keep. "$" and "," are stripped from string values in all of them.
    df_name : str
        Label written to the new "flag" column so rows can be traced back to
        their syndicator after the frames are concatenated.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``cols`` plus "flag"; "totalcost" and "units_n" are
        cast to int. The input frame is left untouched.
    """
    # Work on an explicit copy: assigning into the result of df[cols] directly
    # is an assignment on a slice (SettingWithCopyWarning).
    cleaned = df[cols].copy()
    # Raw string so the backslash reaches the regex engine as an escaped "$"
    # instead of being an invalid Python escape sequence.
    cleaned[cols] = cleaned[cols].replace({r'\$': '', ',': ''}, regex=True)
    cleaned[["totalcost","units_n"]] = cleaned[["totalcost","units_n"]].astype(int)
    cleaned["flag"] = df_name
    return cleaned

In [150]:
# Strip the currency formatting from syndicator B and cast cost/unit counts to ints
b_units_cost = clean_syndicator_cols(df=b, cols=cols, df_name="b")
b_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,yr_pis,sources_n,flag
0,32481257,New,75,2014,6.0,b
1,5333498,New,24,2015,3.0,b
2,6849238,New,44,2013,2.0,b
3,7183151,New,72,2015,3.0,b
4,5957537,New,56,2014,3.0,b


#### Syndicator I

In [151]:
# Same cleaning as for syndicator B, tagged with the "i" flag
i_units_cost = clean_syndicator_cols(df=i, cols=cols, df_name="i")
i_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,yr_pis,sources_n,flag
0,16715361,New,98,2015,4,i
1,8541173,Rehab-Around,100,2014,8,i
2,22461237,New,80,2014,3,i
3,73044023,New,136,2009,8,i
4,23949900,New,65,2011,5,i


#### Syndicator G 


In [152]:
# Preview the first rows of syndicator G to inspect the raw column layout.
g.head()

Unnamed: 0.1,Unnamed: 0,id,state,units_n,con_type,status,loi_date,closedate,concomdate,stabdate,totalcost,hardcost,sources_n,df,yr_pis
0,0,1.0,CA,136.0,New Construction,Stabilized,7/31/06,4/19/07,9/11/09,10/1/10,"$69,809,586","$56,003,771",4.0,g,2009.0
1,1,2.0,CA,137.0,Rehab,Stabilized,9/5/14,12/19/14,5/30/18,1/0/00,"$39,190,669","$28,046,114",7.0,g,2018.0
2,2,3.0,CA,107.0,New Construction,Stabilized,3/10/08,6/1/08,7/19/10,4/1/11,"$49,178,710","$35,056,858",3.0,g,2010.0
3,3,4.0,TX,142.0,New Construction,Stabilized,1/22/08,8/1/08,7/1/10,1/31/12,"$21,037,569","$14,034,624",4.0,g,2010.0
4,4,5.0,TX,240.0,New Construction,Stabilized,4/27/15,12/18/15,11/6/17,2/1/19,"$39,964,744","$27,196,160",4.0,g,2017.0


In [153]:
# Remove every syndicator G row that has a missing value in any column,
# then apply the shared cleaner and tag the rows with the "g" flag
g = g.dropna()
g_units_cost = clean_syndicator_cols(df=g, cols=cols, df_name="g")
g_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,yr_pis,sources_n,flag
0,69809586,New Construction,136,2009.0,4.0,g
1,39190669,Rehab,137,2018.0,7.0,g
2,49178710,New Construction,107,2010.0,3.0,g
3,21037569,New Construction,142,2010.0,4.0,g
4,39964744,New Construction,240,2017.0,4.0,g


### Concat together and clean up difference between new and rehab

In [154]:
# Stack the three cleaned syndicator frames (B, I, G) into one table; the "flag"
# column records which syndicator each row came from. ignore_index is not set,
# so each frame keeps its own row labels.
unit_cost = pd.concat([b_units_cost,i_units_cost,g_units_cost])

In [155]:
def clean_up_for_charts(df):
    """
    Derive the chart-ready columns from a cleaned syndicator frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "totalcost", "units_n" and "yr_pis", each castable to int.
        "sources_n" and "tpop" are renamed when present.

    Returns
    -------
    pandas.DataFrame
        A new frame with "Cost Per Unit" (totalcost / units_n) and "Year"
        (yr_pis minus two) added, "yr_pis" dropped, and "sources_n" / "tpop"
        renamed to "Number of Sources" / "Target Population". The input frame
        is not modified.
    """
    # Copy first so the caller's frame (often a filtered slice) is not mutated.
    out = df.copy()
    #find average cost per unit
    out["Cost Per Unit"] = out["totalcost"].astype(int) / out["units_n"].astype(int)
    #make yr_pis an int, and subtract two years for con_yr per discussion
    out["Year"] = out["yr_pis"].astype(int) - 2
    out = out.drop(columns=["yr_pis"])
    #clean up names for chart labels
    return out.rename(columns={"sources_n": "Number of Sources", "tpop": "Target Population"})

In [156]:
# Add "Cost Per Unit" and "Year" and rename columns for charting.
# NOTE(review): this overwrites unit_cost with a frame that no longer has "yr_pis",
# so running this cell a second time without re-running the concat cell will fail.
unit_cost = clean_up_for_charts(unit_cost)

In [157]:
# con_type clearly needs to be standardized: the same category appears under
# several spellings, some with stray leading whitespace
unit_cost.con_type.value_counts()

 New                151
New                 143
 Rehab              107
New Construction    101
Rehab-Around         71
Rehab                29
Rehab - TIP          19
Gut Rehab            12
 Historic Rehab      11
 New/Rehab            6
Acq/Rehab             2
Adaptive              1
Name: con_type, dtype: int64

In [158]:
#drop rows labelled as both new and rehab, as it is unclear what these are and there are few;
#the remaining labels are then standardized to just new and rehab.
#.copy() makes the filtered result an independent frame, so the column added to it
#afterwards is not an assignment on a slice of the original.
unit_cost = unit_cost[~unit_cost.con_type.isin([' New/Rehab ',"New/Rehab","Acq/Rehab","New Construction & Rehab - TIP","New Construction & Rehab"])].copy()

In [159]:
def categorize_new(x):
    """Map a raw con_type label to "New Construction" if it contains "New", else "Rehab"."""
    return "New Construction" if "New" in x else "Rehab"
# Tag every project as either "New Construction" or "Rehab"
unit_cost["con_type_clean"] = unit_cost["con_type"].apply(categorize_new)

### Filtering dataset to new construction only as the main indicator of 9% vs 4% leaves us with 395 observations 

In [160]:
# Keep only the projects whose standardized construction type is new construction
potential_9_new_construction = unit_cost.loc[unit_cost.con_type_clean == "New Construction"]

In [161]:
# Number of new-construction observations left after the filter
len(potential_9_new_construction)

395

In [162]:
# Count observations per Year. Per the author's inspection there is only one
# observation in 2019 and its source count is 10, a clear outlier, so that year
# is dropped in the next cell.
potential_9_new_construction.groupby("Year").count()

Unnamed: 0_level_0,totalcost,con_type,units_n,Number of Sources,flag,Cost Per Unit,con_type_clean
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003,3,3,3,3,3,3,3
2004,8,8,8,8,8,8,8
2005,12,12,12,12,12,12,12
2006,33,33,33,33,33,33,33
2007,35,35,35,35,35,35,35
2008,23,23,23,23,23,23,23
2009,14,14,14,14,14,14,14
2010,15,15,15,15,15,15,15
2011,28,28,28,28,28,28,28
2012,26,26,26,26,26,26,26


In [163]:
# Exclude the 2019 observations
potential_9_new_construction = potential_9_new_construction.loc[
    potential_9_new_construction["Year"] != 2019
]

In [164]:
#this sets color and font theme for altair 
def terner_theme2():
    """
    Build the theme dict registered with Altair under the name "terner_theme2".

    Returns
    -------
    dict
        Top-level "width" and "height" of 400, plus a "config" section that sets
        the title typography, the two-color category/diverging palettes, and the
        legend label font.
    """
    # Typography
    # NOTE(review): "Veranda Regular" may be a misspelling of the "Verdana"
    # typeface; left unchanged here, confirm the intended font.
    font = "Veranda Regular"
    labelFont = "Veranda Regular"
    # Colors
    main_palette = ["#FFB81D",
                    "#4E748B"]
    sequential_palette = ["#FFB81D",
                          "#4E748B"]

    return {
        "width": 400,
        "height": 400,
        "config": {
            "title": {
                "fontSize": 18,
                "font": font,
                "anchor": "start",
                # Vega-Lite's title config names the text color property "color";
                # "fontColor" is not a recognized key.
                "color": "#000000",
            },
            "range": {
                "category": main_palette,
                "diverging": sequential_palette,
            },
            "legend": {
                "labelFont": labelFont,
                # 0 means legend labels are never truncated
                "labelLimit": 0,
            },
        },
    }

# register the theme function with Altair under the name "terner_theme2"
alt.themes.register("terner_theme2", terner_theme2)
# enable it so charts created from here on use this theme
alt.themes.enable("terner_theme2")

ThemeRegistry.enable('terner_theme2')

In [165]:
# Average the numeric columns per Year. numeric_only=True is required because the
# frame also holds text columns (con_type, flag, con_type_clean); without it,
# pandas >= 2.0 raises instead of silently skipping them.
potential_9_new_construction_grouped = (
    potential_9_new_construction
    .groupby(["Year"])
    .mean(numeric_only=True)
    .reset_index()
)

#need to sort by year and make it a string to not have a comma (like 2,004) in the year name
potential_9_new_construction_grouped = potential_9_new_construction_grouped.sort_values("Year")
potential_9_new_construction_grouped["Year"] = potential_9_new_construction_grouped["Year"].astype(str)

In [166]:
# Line chart of the yearly average number of sources for new-construction projects.
# Not CPI adjusted, but we don't think this chart will be used.
(
    alt.Chart(potential_9_new_construction_grouped)
    .mark_line()
    .encode(x="Year", y="Number of Sources")
    .configure(background="#ffffff")
    .configure_legend()
    .properties(
        title={"text": ["New Construction Syndicator Projects: Number of Sources Over Time"]}
    )
)

In [184]:
# Preview the per-year averages that feed the charts
potential_9_new_construction_grouped.head()

Unnamed: 0,Year,totalcost,units_n,Number of Sources,Cost Per Unit
0,2003,7029156.33,72.33,2.33,101389.7
1,2004,10754379.88,70.25,3.88,127725.14
2,2005,14328374.42,92.83,4.5,209750.56
3,2006,13066111.94,72.91,5.18,207027.66
4,2007,20759185.03,85.4,4.97,248642.8


In [185]:
# Line chart of the yearly average cost per unit for new-construction projects
(
    alt.Chart(potential_9_new_construction_grouped)
    .mark_line()
    .encode(x="Year", y="Cost Per Unit")
    .configure(background="#ffffff")
    .configure_legend()
    .properties(
        title={"text": ["New Construction Syndicator Projects: Cost Per Unit Over Time"]}
    )
)

### Syndicator data where equity covers more than 50% of total cost (`equity_net / totalcost > 0.5`), implying a likely 9% project 

In [168]:
# Rename B's "equity_lif" to "equity_net" so both frames share one equity column,
# then stack syndicators B and I into a single frame.
b = b.rename(columns={"equity_lif":"equity_net"}) #make measures of equity overlap 
debt_to_equity = pd.concat([b,i])

In [169]:
# Used below to label an equity share as a likely 9% or likely 4% project
def is_nine_percent(x):
    """Return "Likely 9% Project" when x is strictly greater than 0.5, else "Likely 4% Project"."""
    return "Likely 9% Project" if x > .5 else "Likely 4% Project"

In [170]:
# Columns whose "$1,234" style strings must be cleaned before the equity ratio
# can be computed
cols = ['totalcost', 'equity_net']

# Remove "$" and "," from those columns. The pattern is a raw string so the
# backslash is passed to the regex engine as an escaped "$" rather than being
# an invalid Python escape sequence.
debt_to_equity[cols] = debt_to_equity[cols].replace({r'\$': '', ',': ''}, regex=True)

In [171]:
# Equity as a share of total cost, then the likely 9% / 4% label derived from it
debt_to_equity["equity_ratio"] = (
    debt_to_equity["equity_net"].astype(int)
    / debt_to_equity["totalcost"].astype(int)
)
debt_to_equity["project_type"] = debt_to_equity["equity_ratio"].apply(is_nine_percent)

In [172]:
# Count projects per label; there are 315 likely 9% projects measured this way
debt_to_equity["project_type"].value_counts()

Likely 9% Project    315
Likely 4% Project    187
Name: project_type, dtype: int64

In [173]:
# Keep only the likely 9% projects. .copy() makes the result an independent frame,
# because clean_up_for_charts is applied to it next and adds columns to what would
# otherwise be a slice of the unfiltered frame.
debt_to_equity = debt_to_equity[debt_to_equity["project_type"]=="Likely 9% Project"].copy()

In [174]:
# Call the earlier helper to add "Cost Per Unit" and "Year" and rename columns.
# NOTE(review): the helper drops "yr_pis", so this cell cannot be re-run on its own output.
debt_to_equity = clean_up_for_charts(debt_to_equity)

In [175]:
# Count non-null values per column for each Year;
# it is worth dropping everything before 2004 and after 2018
debt_to_equity.groupby("Year").count()

Unnamed: 0_level_0,Unnamed: 0,id,totalcost,constcost,hardcost,acqucost,landcost,equity_net,equity_lis,con_type,Number of Sources,lien1,lien1type,lien2,lien2type,lien3,lien3type,lien4,lien4type,lien5,lien5type,lien6,lien6type,lien7,lien7type,lien8,lien8type,lien9,lien9type,lien10,lien10type,acqudate,con_stdate,concomdate,stabdate,units_n,units_li,units_ot,zipcode,build_n,sqft,Target Population,state,df,city,sqft_rent,source1type,source1,source2type,source2,source3type,source3,source4type,source4,source5type,source5,source6type,source6,source7type,source7,source8type,source8,source9type,source9,source10type,source10,source11type,source11,loi_date,closedate,equity_ratio,project_type,Cost Per Unit
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
1998,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
2002,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
2003,2,2,2,0,0,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,0,0,2,0,0,2,2,2,2,0,2,2,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2
2004,7,7,7,0,0,0,0,7,0,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,7,7,7,0,0,7,0,0,7,7,7,7,1,7,7,7,7,7,7,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,7,7,7,7
2005,4,4,4,0,0,0,0,4,0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4,0,0,4,0,0,4,4,4,4,4,4,4,4,4,4,4,3,3,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4
2006,20,20,20,0,0,0,0,20,0,20,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,20,20,20,0,0,20,0,0,20,20,20,20,18,20,20,20,20,20,20,12,12,8,8,4,4,4,4,4,4,2,2,0,0,0,0,0,20,20,20,20
2007,14,14,14,0,0,0,0,14,0,14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,14,14,14,0,0,14,0,0,14,14,14,14,14,14,14,14,14,14,14,8,8,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,14,14,14,14
2008,3,3,3,0,0,0,0,3,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3,0,0,3,0,0,3,3,3,3,3,3,3,3,3,3,3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3
2009,2,2,2,0,0,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,0,0,2,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,2,2,2,2
2010,3,3,3,0,0,0,0,3,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3,0,0,3,0,0,3,3,3,3,3,3,3,3,3,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3


In [176]:
# Keep only the years 2004 through 2018 (the stop value 2019 is exclusive)
debt_to_equity = debt_to_equity.loc[debt_to_equity["Year"].isin(np.arange(2004, 2019))]

In [177]:
# Need to pass grouped data to altair, or it will display all the data points.
# numeric_only=True restricts the average to numeric columns: the frame still
# carries many text columns, and pandas >= 2.0 raises on those instead of
# skipping them. The charts below only read "Number of Sources" and "Cost Per Unit".
debt_to_equity_group = debt_to_equity.groupby("Year").mean(numeric_only=True).reset_index()
#need to sort by year and make it a string to not have a comma (like 2,004) in the year name
debt_to_equity_group = debt_to_equity_group.sort_values("Year")
debt_to_equity_group["Year"] = debt_to_equity_group["Year"].astype(str)

In [178]:
# Line chart of the yearly average number of sources for the likely 9% projects
debt_to_equity_group_graph_sources = (
    alt.Chart(debt_to_equity_group)
    .mark_line()
    .encode(x="Year", y="Number of Sources")
    .configure(background="#ffffff")
    .configure_legend()
    .properties(
        title={"text": ["Syndicator Data with > 50% Debt to Equity Threshold: Number of Sources Over Time"]}
    )
)
debt_to_equity_group_graph_sources

In [179]:
# Line chart of the yearly average cost per unit for the likely 9% projects
debt_to_equity_group_graph_per_unit_cost = (
    alt.Chart(debt_to_equity_group)
    .mark_line()
    .encode(x="Year", y="Cost Per Unit")
    .configure(background="#ffffff")
    .configure_legend()
    .properties(
        title={"text": ["Syndicator 50% Debt to Equity Threshold: Cost Per Unit Over Time"]}
    )
)
debt_to_equity_group_graph_per_unit_cost

In [180]:
# Average the numeric columns per target population and year. numeric_only=True
# is needed because the frame holds text columns, which pandas >= 2.0 refuses
# to average.
debt_to_equity_grouped = (
    debt_to_equity
    .groupby(["Target Population","Year"])
    .mean(numeric_only=True)
    .reset_index()
)
# Keep only the Elderly and Family populations (drops the others, e.g. mixed and SRO).
# TODO: record the reason for excluding them; the original note was left unfinished.
debt_to_equity_grouped = debt_to_equity_grouped[debt_to_equity_grouped["Target Population"].isin(["Elderly","Family"])]
#need to sort by year and make it a string to not have a comma (like 2,004) in the year name
debt_to_equity_grouped = debt_to_equity_grouped.sort_values("Year")
debt_to_equity_grouped["Year"] = debt_to_equity_grouped["Year"].astype(str)

In [182]:
# Average the numeric columns per target population and year. numeric_only=True
# is needed because the frame holds text columns, which pandas >= 2.0 refuses
# to average.
debt_to_equity_grouped_pop = (
    debt_to_equity
    .groupby(["Target Population","Year"])
    .mean(numeric_only=True)
    .reset_index()
)
# Keep only the Elderly and Family populations (drops the others, e.g. mixed and SRO).
# TODO: record the reason for excluding them; the original note was left unfinished.
debt_to_equity_grouped_pop = debt_to_equity_grouped_pop[debt_to_equity_grouped_pop["Target Population"].isin(["Elderly","Family"])]
#need to sort by year and make it a string to not have a comma (like 2,004) in the year name
debt_to_equity_grouped_pop = debt_to_equity_grouped_pop.sort_values("Year")
debt_to_equity_grouped_pop["Year"] = debt_to_equity_grouped_pop["Year"].astype(str)

In [183]:
# Cost per unit over time, one line per target population (Elderly vs Family)
(
    alt.Chart(debt_to_equity_grouped_pop)
    .mark_line()
    .encode(x="Year", y="Cost Per Unit", color="Target Population")
)