In [None]:
# import packages 

import pandas as pd
import numpy as np
import requests
import os
import pickle
import matplotlib.pyplot as plt

In [None]:
# pandas display settings: up to 200 characters per cell, every column, at most 100 rows

pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [None]:
# The Census API requires a key: sign up at https://api.census.gov/data/key_signup.html
# and expose the key to the notebook through the CENSUS_API_KEY environment variable.
# The value is None when the variable is not set.

CENSUS_API_KEY = os.environ.get('CENSUS_API_KEY')

In [None]:
# load the Redfin data frame from the local pickle file 'final_for_automl.pkl'
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source

dfrf = pd.read_pickle('final_for_automl.pkl')

In [None]:
# preview the first rows of the Redfin data frame

dfrf.head()

Unnamed: 0,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,X,Y,Z,yearSold,Neighborhood_Arlington,Neighborhood_Aspinwall,Neighborhood_Bloomfield,Neighborhood_Central North Side,Neighborhood_Downtown Pgh,Neighborhood_E Pittsburgh,Neighborhood_East Allegheny,Neighborhood_East Liberty,Neighborhood_Edgewood,Neighborhood_Fox Chapel,Neighborhood_Friendship Park,Neighborhood_Garfield,Neighborhood_Greenfield,Neighborhood_Hazelwood,Neighborhood_Highland Park,Neighborhood_Hill District,Neighborhood_Homestead,Neighborhood_Homewood-Brushton,Neighborhood_Lawrenceville,Neighborhood_Lincoln Place,Neighborhood_Lincoln-Larimer,Neighborhood_Morningside,Neighborhood_Mt Oliver,Neighborhood_Munhall,Neighborhood_Murdoch Farms,Neighborhood_North of Forbes,Neighborhood_O'Hara,Neighborhood_Oakland,Neighborhood_Penn Hills,Neighborhood_Point Breeze,Neighborhood_Polish Hill,Neighborhood_Regent Square,Neighborhood_Reserve,Neighborhood_Ross Twp,Neighborhood_Schenley Farms,Neighborhood_Schenley Heights,Neighborhood_Shadyside,Neighborhood_Sharpsburg,Neighborhood_South Side,Neighborhood_South-Other Area,Neighborhood_Spring Garden,Neighborhood_Spring Hill,Neighborhood_Squirrel Hill,Neighborhood_Stanton Heights,Neighborhood_Swissvale,Neighborhood_Troy Hill,Neighborhood_Wilkinsburg,sin_monthSold,PRICE,Crime Score,HS_Score,ES_MS_Score,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),Location,dt
2,4.0,4.0,1785.0,1306.0,1900.0,0.124192,-0.905528,0.405704,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.8660254,275000.0,35.38036,95.233333,62.0,https://www.redfin.com/PA/Pittsburgh/45-Greeley-St-15203/home/74490865,"(40.4229551, -79.9743145)",2021-10-01
3,5.0,2.0,3034.0,2178.0,1900.0,0.142107,-0.921593,0.361209,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.224647e-16,302500.0,25.490296,58.833333,79.666667,https://www.redfin.com/PA/Pittsburgh/259-45th-St-15201/home/74651854,"(40.4711406, -79.9576202)",2021-06-01
4,3.0,1.5,1972.0,1306.0,1890.0,0.146368,-0.917002,0.371059,21,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.8660254,301000.0,26.678413,58.833333,54.5,https://www.redfin.com/PA/Pittsburgh/431-Taylor-St-15224/home/74563342,"(40.4605557, -79.9523315)",2021-10-01
9,2.0,1.5,1226.0,1742.0,1890.0,0.134636,-0.92155,0.364169,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.0,379900.0,17.730598,58.833333,70.0,https://www.redfin.com/PA/Pittsburgh/159-1-2-38th-St-15201/home/74651930,"(40.4679636, -79.9655419)",2021-09-01
11,3.0,3.0,1519.0,2178.0,1910.0,0.142769,-0.924006,0.354725,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-2.449294e-16,390000.0,15.909914,90.8,79.666667,https://www.redfin.com/PA/Pittsburgh/4919-Hatfield-St-15201/home/73509681,"(40.4780842, -79.9573142)",2020-12-01


In [None]:
# get the census tract by latitude and longitude based on redfin data

import censusgeocode as cg 

def getCensusGeoData(latitude, longitude):
    """Look up the census tract that contains a geographic point.

    Parameters
    ----------
    latitude : float
        Latitude of the point in decimal degrees.
    longitude : float
        Longitude of the point in decimal degrees.

    Returns
    -------
    tuple
        ``(tract, county, state)`` read from the 'TRACT', 'COUNTY' and 'STATE'
        fields of the first 'Census Tracts' entry returned by the geocoder.

    Raises
    ------
    ValueError
        If the geocoder returns no census tract for the point.
    """
    # censusgeocode uses the x/y convention: x is the longitude, y the latitude.
    # Passing them the other way round matches no tract at all.
    geographies = cg.coordinates(x=longitude, y=latitude)
    tracts = geographies.get('Census Tracts') if geographies else None
    if not tracts:
        raise ValueError(
            'No census tract found for latitude=' + repr(latitude)
            + ', longitude=' + repr(longitude)
        )

    firstTract = tracts[0]
    return firstTract.get('TRACT'), firstTract.get('COUNTY'), firstTract.get('STATE')

In [None]:
# CANNED MODEL RUNS: uncomment the read below to reuse previously saved lookups
# dfrf = pd.read_pickle("./censusTract.pkl")

# LIVE MODEL RUNS: look up tract, county and state for every row from its Location
# value (first element is passed as latitude, second as longitude)
dfrf[['censusTract', 'censusCounty', 'censusState']] = dfrf.apply(
    lambda listing: getCensusGeoData(listing['Location'][0], listing['Location'][1]),
    axis=1,
    result_type="expand",
)

# state + county + tract codes form the key used to join in the census data
dfrf['joinKey'] = dfrf['censusState'] + dfrf['censusCounty'] + dfrf['censusTract']

# LIVE MODEL RUNS: persist the lookups so canned runs can load them
dfrf.to_pickle("./censusTract.pkl")

# show the first rows of the result
dfrf.head()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# https://www.census.gov/data/developers/data-sets/acs-1year.html
# https://www.census.gov/content/dam/Census/data/developers/api-user-guide/api-guide.pdf
# reference query: https://api.census.gov/data/2019/acs/acs5?get=NAME,B23025_003E,B23025_005E,B15003_001E,B15003_002E,B15003_003E,B15003_004E,B15003_005E,B15003_006E,B15003_007E,B15003_008E,B15003_009E,B15003_010E,B15003_011E,B15003_012E,B15003_013E,B15003_014E,B15003_015E,B15003_016E&for=block+group:*&in=state:17+county:031
# reference query: webResponse = requests.get('https://api.census.gov/data/'+targetYear+'/acs/acs5?get=NAME,B08126_001E,B08126_002E,B08126_003E,B08126_004E,B08126_005E,B08126_006E,B08126_007E,B08126_008E,B08126_009E,B08126_010E,B08126_011E,B08126_012E,B08126_013E,B08126_014E,B08126_015E,B06009_001E,B06009_002E,B06009_003E,B06009_004E,B06009_005E,B06009_006E,B07010_001E,B07010_002E,B07010_004E,B07010_005E,B07010_006E,B07010_007E,B07010_008E,B07010_009E,B07010_010E,B07010_011E&for=tract:*&in=state:'+targetState+'+county:'+targetCounty+"&key="+CENSUS_API_KEY).json()
# representative map of census tract: https://www2.census.gov/geo/maps/DC2020/PL20/st42_pa/censustract_maps/c42003_allegheny/DC20CT_C42003.pdf

def getACSData(targetYear, targetCounty, targetState):
    """Download ACS 5-year tract-level data for one county and derive share features.

    Parameters
    ----------
    targetYear : str or int
        ACS 5-year vintage placed in the request URL, e.g. '2020'.
    targetCounty : str
        County code as used by the Census API, e.g. '003'.
    targetState : str
        State code as used by the Census API, e.g. '42'.

    Returns
    -------
    pandas.DataFrame
        One row per census tract with the columns ``age_Median``,
        ``housing_OwnerOccupiedMedianValue``, ``renting_MedianRentValue``,
        ``inequality_GiniIndex``, ``joinKey`` (state + county + tract codes),
        ``commute_pctLessThan34Mins``, ``bachelors_pctSTEM``,
        ``education_pctAdvancedDegree``, ``income_pctBelow50K`` and
        ``income_pctAbove150K``.

    Raises
    ------
    ValueError
        If ``CENSUS_API_KEY`` is not set.
    requests.HTTPError
        If the Census API answers with an error status.
    """
    if not CENSUS_API_KEY:
        raise ValueError(
            'CENSUS_API_KEY is not set; store the key in the CENSUS_API_KEY '
            'environment variable before requesting census data'
        )

    # ACS variable code -> readable column name. This single mapping drives the
    # query string, the numeric conversion and the renaming, so the three can
    # never drift apart.
    # NOTE(review): confirm B15012_009E is the intended STEM field-of-degree variable.
    acsColumns = {
        'B01002_001E': 'age_Median',
        'B25109_001E': 'housing_OwnerOccupiedMedianValue',
        'B25111_001E': 'renting_MedianRentValue',
        'B08134_001E': 'commute_Total',
        'B08134_002E': 'commute_LessThan10mins',
        'B08134_003E': 'commute_10to14mins',
        'B08134_004E': 'commute_15to19mins',
        'B08134_005E': 'commute_20to24mins',
        'B08134_006E': 'commute_25to29mins',
        'B08134_007E': 'commute_30to34mins',
        'B15012_001E': 'bachelors_Total',
        'B15012_009E': 'bachelors_STEM',
        'B15003_001E': 'education_Total',
        'B15003_023E': 'education_MasterDegree',
        'B15003_024E': 'education_ProfessionalDegree',
        'B15003_025E': 'education_DoctorateDegree',
        'B19001_001E': 'income_Total',
        'B19001_002E': 'income_LessThan10K',
        'B19001_003E': 'income_10Kto15K',
        'B19001_004E': 'income_15Kto20K',
        'B19001_005E': 'income_20Kto25K',
        'B19001_006E': 'income_25Kto30K',
        'B19001_007E': 'income_30Kto35K',
        'B19001_008E': 'income_35Kto40K',
        'B19001_009E': 'income_40Kto45K',
        'B19001_010E': 'income_45Kto50K',
        'B19001_014E': 'income_100Kto125K',
        'B19001_015E': 'income_125Kto150K',
        'B19001_016E': 'income_150Kto200K',
        'B19001_017E': 'income_200KOrMore',
        'B19083_001E': 'inequality_GiniIndex',
    }
    varList = list(acsColumns)

    # get web response
    queryUrl = (
        'https://api.census.gov/data/' + str(targetYear) + '/acs/acs5'
        + '?get=NAME,' + ','.join(varList)
        + '&for=tract:*&in=state:' + str(targetState) + '+county:' + str(targetCounty)
        + '&key=' + CENSUS_API_KEY
    )
    response = requests.get(queryUrl, timeout=60)  # never hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of a confusing JSON failure
    webResponse = response.json()

    # format web response: the first record holds the column names, the rest the data
    df = pd.DataFrame(webResponse[1:], columns=webResponse[0])
    df['joinKey'] = df['state'] + df['county'] + df['tract']  # key to merge with geoData and redfin
    df = df.drop(columns=['state', 'county', 'tract', 'NAME'])

    # cast to numbers and replace negative values (the API reports missing estimates
    # as large negative sentinels) with the median of the valid values only, so the
    # sentinels do not bias the replacement
    for code in varList:
        values = pd.to_numeric(df[code])
        validMedian = values[values >= 0].median()
        df[code] = values.mask(values < 0, validMedian)

    # rename columns
    df = df.rename(columns=acsColumns)

    def _share(frame, parts, total):
        # sum of the part columns divided by the total; NaN in any part propagates
        return frame[parts].sum(axis=1, skipna=False) / frame[total]

    # calculate percentages instead of raw numbers
    df['commute_pctLessThan34Mins'] = _share(
        df,
        ['commute_LessThan10mins', 'commute_10to14mins', 'commute_15to19mins',
         'commute_20to24mins', 'commute_25to29mins', 'commute_30to34mins'],
        'commute_Total',
    )
    df['bachelors_pctSTEM'] = _share(df, ['bachelors_STEM'], 'bachelors_Total')
    df['education_pctAdvancedDegree'] = _share(
        df,
        ['education_MasterDegree', 'education_ProfessionalDegree', 'education_DoctorateDegree'],
        'education_Total',
    )
    df['income_pctBelow50K'] = _share(
        df,
        ['income_LessThan10K', 'income_10Kto15K', 'income_15Kto20K', 'income_20Kto25K',
         'income_25Kto30K', 'income_30Kto35K', 'income_35Kto40K', 'income_40Kto45K',
         'income_45Kto50K'],
        'income_Total',
    )
    df['income_pctAbove150K'] = _share(
        df, ['income_150Kto200K', 'income_200KOrMore'], 'income_Total'
    )

    # drop every raw count column (including the 100K-150K brackets, which feed no share)
    countPrefixes = ('commute_', 'bachelors_', 'education_', 'income_')
    rawCountColumns = [name for name in acsColumns.values() if name.startswith(countPrefixes)]
    df = df.drop(columns=rawCountColumns)

    return df

In [None]:
# collect the distinct state+county codes present in the data ... as the data scales so will this list
# rows without a code are dropped (a missing key cannot be sliced into state/county parts)
# and the list is sorted so the counties are processed in the same order on every run
distinctStateCounties = sorted((dfrf['censusState'] + dfrf['censusCounty']).dropna().unique())

distinctStateCounties

In [None]:
# one ACS frame per distinct state+county code
censusDFList = []

# each code is the 2-character state code followed by the county code
for stateCounty in distinctStateCounties:
    stateCode, countyCode = stateCounty[:2], stateCounty[2:]
    print('Evaluating: ', '2020', countyCode, stateCode)
    censusDFList.append(getACSData('2020', countyCode, stateCode))

# stack the per-county frames into a single census frame
dfcensus = pd.concat(censusDFList)

In [None]:
# left-join the census features onto the redfin rows via joinKey (every redfin row is kept)

combined = pd.merge(dfrf, dfcensus, how='left', on='joinKey')


In [None]:
# summary statistics of the merged frame
combined.describe()

In [None]:
# preview the first rows of the merged frame

combined.head()

In [None]:
# list the column labels of the merged frame
combined.columns

In [None]:
# export the merged frame to censusDF.csv (to_csv also writes the index as the first column by default)
combined.to_csv('censusDF.csv')

In [None]:
# persist the merged frame to ./censusMergedWithRedfin.pkl
combined.to_pickle("./censusMergedWithRedfin.pkl")  

In [None]:
# show five randomly chosen rows (no random_state is passed, so the selection differs between runs)
combined.sample(n=5)