In [27]:
import pandas as pd
from functools import reduce
import numpy as np
import re
import matplotlib as plt
import seaborn as sns
import warnings
# Silence every warning raised for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings('ignore')

# Display options: show every column, up to 4000 rows, floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 4000)
pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [30]:
# Load one CSV per syndicator (B, I and G) from the working directory.
b = pd.read_csv(filepath_or_buffer="syndicator_b.csv")
i = pd.read_csv(filepath_or_buffer="syndicator_i.csv")
g = pd.read_csv(filepath_or_buffer="syndicator_g.csv")

#### Syndicator B 

In [31]:
# Columns kept from every syndicator file: total cost, construction type, unit count.
cols = [
    "totalcost",
    "con_type",
    "units_n",
]

def clean_syndicator_cols(df,cols,df_name):
    """
    Keep only the columns needed to compute average unit cost and convert the
    numeric ones to ints.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw syndicator table.
    cols : list of str
        Columns to keep. Must include "totalcost" and "units_n".
    df_name : str
        Label written to the new "flag" column so rows can be traced back to
        their source once several syndicators are concatenated.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding `cols` plus "flag",
        with "totalcost" and "units_n" cast to int.

    Raises
    ------
    KeyError
        If any name in `cols` is not a column of `df`.
    """
    numeric_cols = ["totalcost", "units_n"]

    # Explicit copy: the assignments below must never write into a slice of
    # the caller's frame (the source of SettingWithCopyWarning).
    cleaned = df[cols].copy()

    # Strip currency formatting ("$" and thousands separators) from the numeric
    # columns only, so that text columns such as con_type are left untouched.
    # Raw string: "\$" is a regex-escaped literal dollar sign.
    cleaned[numeric_cols] = cleaned[numeric_cols].replace({r'\$': '', ',': ''}, regex=True)

    # Cast via a dtype mapping so the columns are replaced with int columns.
    cleaned = cleaned.astype({name: int for name in numeric_cols})

    cleaned["flag"] = df_name
    return cleaned

In [32]:
# Syndicator B: keep the cost/unit columns, strip currency formatting and cast to int.
b_units_cost = clean_syndicator_cols(df=b, cols=cols, df_name="b")
b_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,flag
0,32481257,New,75,b
1,5333498,New,24,b
2,6849238,New,44,b
3,7183151,New,72,b
4,5957537,New,56,b


#### Syndicator I

In [33]:
# Syndicator I: same cleaning as for the other syndicators, tagged with flag "i".
i_units_cost = clean_syndicator_cols(df=i, cols=cols, df_name="i")
i_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,flag
0,16715361,New,98,i
1,8541173,Rehab-Around,100,i
2,22461237,New,80,i
3,73044023,New,136,i
4,23949900,New,65,i


#### Syndicator G 


In [39]:
# Syndicator G: drop incomplete rows first, then apply the shared cleaning.
# NOTE(review): dropna() removes a row if ANY column is missing, not just the
# three columns used here - confirm that is intended.
g = g.dropna()
g_units_cost = clean_syndicator_cols(df=g, cols=cols, df_name="g")
g_units_cost.head()

Unnamed: 0,totalcost,con_type,units_n,flag
0,69809586,New Construction,136,g
1,39190669,Rehab,137,g
2,49178710,New Construction,107,g
3,21037569,New Construction,142,g
4,39964744,New Construction,240,g


### Concatenate the three syndicators and find the average cost per unit

In [42]:
# Stack the three cleaned syndicator frames (B, then I, then G) into one table.
# Each frame keeps its own row labels, so index values repeat across syndicators;
# the "flag" column identifies the source.
unit_cost = pd.concat(objs=[b_units_cost, i_units_cost, g_units_cost])

In [43]:
# Cost per unit for each row: total cost divided by the number of units.
unit_cost["cost_per_unit"] = unit_cost["totalcost"].div(unit_cost["units_n"])

In [44]:
# Frequency of each raw construction-type label; the labels clearly need standardizing.
unit_cost["con_type"].value_counts()

 New                151
New                 143
 Rehab              107
New Construction    101
Rehab-Around         71
Rehab                29
Rehab - TIP          19
Gut Rehab            12
 Historic Rehab      11
 New/Rehab            6
Acq/Rehab             2
Adaptive              1
Name: con_type, dtype: int64

In [46]:
#drop rows labelled as both new and rehab: it is unclear what these are and there are few.
#Labels are compared after stripping surrounding whitespace, so padded variants
#(e.g. ' New/Rehab ') are caught without having to list every spacing.
ambiguous_con_types = ["New/Rehab","Acq/Rehab","New Construction & Rehab - TIP","New Construction & Rehab"]
is_ambiguous = unit_cost.con_type.str.strip().isin(ambiguous_con_types)
#.copy() so that columns added to the filtered frame are not written into a slice of the unfiltered one
unit_cost = unit_cost[~is_ambiguous].copy()

In [47]:
def categorize_new(x):
    """
    Collapse a raw construction-type label into one of two categories.

    Returns "New Construction" when the label contains the (case-sensitive)
    substring "New"; every other label is returned as "Rehab".
    """
    return "New Construction" if "New" in x else "Rehab"
# Label every row as new construction or rehab based on its raw con_type.
unit_cost["con_type_clean"] = unit_cost.con_type.apply(categorize_new)

In [48]:
# Summary statistics of cost per unit for each standardized construction type.
unit_cost.groupby("con_type_clean")[["cost_per_unit"]].describe()

Unnamed: 0_level_0,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
con_type_clean,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
New Construction,395.0,208792.0,101422.97,79815.64,134653.6,173091.85,265229.35,540610.69
Rehab,250.0,166151.7,92777.95,35525.3,96311.85,142922.67,208986.31,635903.44


In [49]:
# Same summary, broken down further by source syndicator (the "flag" column).
unit_cost.groupby(["con_type_clean", "flag"])[["cost_per_unit"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit,cost_per_unit
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
con_type_clean,flag,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
New Construction,b,151.0,198188.28,94706.98,87302.07,135706.51,169929.68,225149.64,517371.78
New Construction,g,101.0,221897.33,115639.6,86162.98,131852.45,172898.17,295704.5,540610.69
New Construction,i,143.0,210732.72,96975.42,79815.64,137353.34,183675.3,268824.85,537088.4
Rehab,b,118.0,151751.11,97598.95,35525.3,84027.78,111870.01,195060.34,635903.44
Rehab,g,48.0,206354.3,98619.48,49173.76,111511.61,204743.57,260148.79,404705.22
Rehab,i,84.0,163408.17,75174.02,55399.76,116937.56,145642.65,196360.12,559205.95
