In [1]:
# Import the dependencies.
import json
import os

import censusdata
import pandas as pd
import pandas.io.json as pd_json
# Import the requests library.
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Read the Census API key from the environment so the secret is never stored in
# the notebook source or its saved outputs. Set CENSUS_API_KEY before starting
# the kernel.
# NOTE(review): the key that used to be hardcoded here has been committed in
# plain text and should be treated as compromised and rotated.
api_key = os.environ.get("CENSUS_API_KEY")
if not api_key:
    raise RuntimeError(
        "CENSUS_API_KEY is not set; export your Census API key before running this notebook."
    )

In [2]:
# Download the ACS 5-year Data Profile variable catalogue.
dp_var_url = "https://api.census.gov/data/2018/acs/acs5/profile/variables.json"
dp_var_response = requests.get(dp_var_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
dp_var_response.raise_for_status()
dp_acs_vars = dp_var_response.json()["variables"]

# One row per variable code; the columns are whatever metadata fields the API
# returns for each variable (the notebook later reads the 'label' field).
dp_vars_df = pd.DataFrame.from_dict(dp_acs_vars, orient="index")
dp_vars_df.index.name = "code"

In [3]:
# Download the ACS 5-year Detailed Tables variable catalogue.
b_var_url = "https://api.census.gov/data/2018/acs/acs5/variables.json"
b_var_response = requests.get(b_var_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
b_var_response.raise_for_status()
b_acs_vars = b_var_response.json()["variables"]

# One row per variable code; the columns are whatever metadata fields the API
# returns for each variable (the notebook later reads the 'label' field).
b_vars_df = pd.DataFrame.from_dict(b_acs_vars, orient="index")
b_vars_df.index.name = "code"

Variable list:

Maybe variables:
- 

In [4]:
# Build the request URL for the ACS Data Profile call (all counties in all states).
dp_var_list = ["DP02_0122E", "DP02_0001E", "DP02_0015E", "DP03_0001E", "DP03_0005PE",
               "DP04_0001E", "DP04_0050E"]

dp_url_start = "https://api.census.gov/data/2018/acs/acs5/profile?get=NAME"
dp_geo_clause = "&for=county:*&in=state:*&key="
dp_url_end = dp_geo_clause + api_key

# Each requested variable is appended to the `get=` list as ",<code>".
dp_query = dp_url_start + "".join("," + var for var in dp_var_list)
dp_url = dp_query + dp_url_end

# Show the URL for debugging, but never echo the API key into the saved output.
print(dp_query + dp_geo_clause + "<redacted>")

https://api.census.gov/data/2018/acs/acs5/profile?get=NAME,DP02_0122E,DP02_0001E,DP02_0015E,DP03_0001E,DP03_0005PE,DP04_0001E,DP04_0050E&for=county:*&in=state:*&key=24f7bba880a1af36816cec59796a7e4f07da5789


In [5]:
# Build the request URL for the ACS Detailed Tables call (all counties in all states).
b_var_list = ["B01001_001E", "B01002_001E","B06001_013E","B06009_002E","B06009_003E","B06009_004E","B06009_005E","B06009_006E",
              "B06012_002E","B08133_001E","B19013_001E","B19301_001E","B19326_001E","B25071_001E",
              "B25077_001E"]

b_url_start = "https://api.census.gov/data/2018/acs/acs5?get=NAME"
b_geo_clause = "&for=county:*&in=state:*&key="
b_url_end = b_geo_clause + api_key

# Each requested variable is appended to the `get=` list as ",<code>".
b_query = b_url_start + "".join("," + var for var in b_var_list)
b_url = b_query + b_url_end

# Show the URL for debugging, but never echo the API key into the saved output.
print(b_query + b_geo_clause + "<redacted>")

https://api.census.gov/data/2018/acs/acs5?get=NAME,B01001_001E,B01002_001E,B06001_013E,B06009_002E,B06009_003E,B06009_004E,B06009_005E,B06009_006E,B06012_002E,B08133_001E,B19013_001E,B19301_001E,B19326_001E,B25071_001E,B25077_001E&for=county:*&in=state:*&key=24f7bba880a1af36816cec59796a7e4f07da5789


In [6]:
# Look up and print the human-readable label of every requested variable code.
dp_codes = [dp_vars_df.loc[code, 'label'] for code in dp_var_list]
b_codes = [b_vars_df.loc[code, 'label'] for code in b_var_list]

print(dp_codes, b_codes)

['Estimate!!ANCESTRY!!Total population', 'Estimate!!HOUSEHOLDS BY TYPE!!Total households', 'Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Average household size', 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over', 'Percent Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Unemployed', 'Estimate!!HOUSING OCCUPANCY!!Total housing units', 'Estimate!!YEAR HOUSEHOLDER MOVED INTO UNIT!!Occupied housing units'] ['Estimate!!Total', 'Estimate!!Median age --!!Total', 'Estimate!!Total!!Born in state of residence', 'Estimate!!Total!!Less than high school graduate', 'Estimate!!Total!!High school graduate (includes equivalency)', "Estimate!!Total!!Some college or associate's degree", "Estimate!!Total!!Bachelor's degree", 'Estimate!!Total!!Graduate or professional degree', 'Estimate!!Total!!Below 100 percent of the poverty level', 'Estimate!!Aggregate travel time to work (in minutes)', 'Estimate!!Median household income in the past 12 months 

In [7]:
# Code -> label lookup table covering both the Data Profile and Detail variables.
codes_dict = dict(code=dp_var_list + b_var_list, label=dp_codes + b_codes)
codes_df = pd.DataFrame(codes_dict)
codes_df

Unnamed: 0,code,label
0,DP02_0122E,Estimate!!ANCESTRY!!Total population
1,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households
2,DP02_0015E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...
3,DP03_0001E,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...
4,DP03_0005PE,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...
5,DP04_0001E,Estimate!!HOUSING OCCUPANCY!!Total housing units
6,DP04_0050E,Estimate!!YEAR HOUSEHOLDER MOVED INTO UNIT!!Oc...
7,B01001_001E,Estimate!!Total
8,B01002_001E,Estimate!!Median age --!!Total
9,B06001_013E,Estimate!!Total!!Born in state of residence


In [8]:
# Request the Data Profile table
dp_response = requests.get(dp_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
dp_response.raise_for_status()
dp_call = dp_response.json()

# The first row of the payload is the header; every following row is one county.
dp_df = pd.DataFrame(dp_call[1:], columns=dp_call[0])

# Convert every column except the county name to float explicitly. Passing
# dtype=float to the constructor relied on pandas silently skipping the string
# NAME column, which pandas 2.0 turned into an error. 'state' and 'county' are
# converted as well because the FIPS arithmetic later in the notebook needs numbers.
dp_value_cols = [col for col in dp_df.columns if col != "NAME"]
dp_df[dp_value_cols] = dp_df[dp_value_cols].apply(pd.to_numeric).astype(float)

In [9]:
# Request the Detail table
b_response = requests.get(b_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
b_response.raise_for_status()
b_call = b_response.json()

# The first row of the payload is the header; every following row is one county.
b_df = pd.DataFrame(b_call[1:], columns=b_call[0])

# Convert every column except the county name to float explicitly. Passing
# dtype=float to the constructor relied on pandas silently skipping the string
# NAME column, which pandas 2.0 turned into an error. 'state' and 'county' are
# converted as well because the FIPS arithmetic later in the notebook needs numbers.
b_value_cols = [col for col in b_df.columns if col != "NAME"]
b_df[b_value_cols] = b_df[b_value_cols].apply(pd.to_numeric).astype(float)

In [10]:
# Join the Data Profile and Detail tables on their shared geography columns
# (default inner join, so only counties present in both tables are kept).
geo_keys = ['NAME', 'state', 'county']
acs_df = pd.merge(dp_df, b_df, on=geo_keys)

In [11]:
# Store the geography codes as object dtype, then combine them into a single
# county FIPS value: state code * 1000 + county code.
geo_dtypes = dict.fromkeys(['state', 'county'], 'object')
acs_df = acs_df.astype(geo_dtypes)

state_part = acs_df['state'] * 1000
acs_df['fips'] = state_part + acs_df['county']
acs_df

Unnamed: 0,NAME,DP02_0122E,DP02_0001E,DP02_0015E,DP03_0001E,DP03_0005PE,DP04_0001E,DP04_0050E,state,county,...,B06009_005E,B06009_006E,B06012_002E,B08133_001E,B19013_001E,B19301_001E,B19326_001E,B25071_001E,B25077_001E,fips
0,"Washington County, Mississippi",47086.0,18299.0,2.54,36109.0,8.4,21591.0,18299.0,28,151,...,3476.0,2220.0,15496.0,277765.0,30834.0,19884.0,19069.0,35.6,74700.0,28151
1,"Perry County, Mississippi",12028.0,4563.0,2.61,9508.0,3.5,5620.0,4563.0,28,111,...,694.0,204.0,2231.0,,39007.0,21611.0,19724.0,29.0,83700.0,28111
2,"Choctaw County, Mississippi",8321.0,3164.0,2.58,6718.0,4.5,4200.0,3164.0,28,19,...,625.0,399.0,1888.0,,37203.0,20589.0,18967.0,31.9,81600.0,28019
3,"Itawamba County, Mississippi",23480.0,8706.0,2.57,18949.0,2.2,10289.0,8706.0,28,57,...,1337.0,772.0,3428.0,228155.0,40510.0,20629.0,21698.0,27.4,89500.0,28057
4,"Carroll County, Mississippi",10129.0,3658.0,2.72,8404.0,4.5,5154.0,3658.0,28,15,...,692.0,359.0,1477.0,,43060.0,22567.0,22091.0,36.0,89300.0,28015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,"Clayton County, Iowa",17672.0,7652.0,2.27,14243.0,1.4,9099.0,7652.0,19,43,...,1670.0,496.0,1564.0,186660.0,52828.0,28486.0,28407.0,23.5,121000.0,19043
3216,"Buena Vista County, Iowa",20260.0,7550.0,2.55,15640.0,3.8,8296.0,7550.0,19,21,...,1751.0,755.0,2507.0,136945.0,54556.0,26607.0,28553.0,23.9,112700.0,19021
3217,"Guthrie County, Iowa",10674.0,4397.0,2.39,8530.0,2.2,5786.0,4397.0,19,77,...,1093.0,393.0,1175.0,136575.0,57075.0,28953.0,30675.0,24.2,115900.0,19077
3218,"Humboldt County, Iowa",9566.0,4221.0,2.24,7572.0,2.4,4729.0,4221.0,19,91,...,1089.0,304.0,1103.0,83660.0,52219.0,29882.0,29958.0,25.1,102300.0,19091


In [12]:
# Derive the modelling features from the raw ACS columns.

# Ratio of total households to total housing units.
acs_df["% Housing Units Occupied"] = acs_df["DP02_0001E"].div(acs_df["DP04_0001E"])
# The API reports unemployment as a percentage; rescale it to a 0-1 fraction.
acs_df["% Unemployment Rate"] = acs_df["DP03_0005PE"].div(100)

# Counts that are expressed as a share of the total population (B01001_001E).
total_population = acs_df["B01001_001E"]
population_shares = {
    "% Born in State": "B06001_013E",
    "% <HS": "B06009_002E",
    "% HS Grad": "B06009_003E",
    "% Some College": "B06009_004E",
    "% Bach Degree": "B06009_005E",
    "% Grad Degree": "B06009_006E",
    "% Below Pov Level": "B06012_002E",
}
for share_name, count_code in population_shares.items():
    acs_df[share_name] = acs_df[count_code] / total_population

# Columns that are copied through unchanged under a readable name.
renamed_copies = {
    "Commute Time": "B08133_001E",
    "Median Income": "B19326_001E",
    "Median Home Value": "B25077_001E",
}
for feature_name, source_code in renamed_copies.items():
    acs_df[feature_name] = acs_df[source_code]

In [13]:
# Select the state plus the engineered features, then report missing values per column.
model_columns = [
    "state",
    "% Housing Units Occupied",
    "% Unemployment Rate",
    "% Born in State",
    "% <HS",
    "% HS Grad",
    "% Some College",
    "% Bach Degree",
    "% Grad Degree",
    "% Below Pov Level",
    "Commute Time",
    "Median Income",
    "Median Home Value",
]
model_df = acs_df[model_columns]

null_counts = model_df.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column state has 0 null values
Column % Housing Units Occupied has 78 null values
Column % Unemployment Rate has 0 null values
Column % Born in State has 78 null values
Column % <HS has 78 null values
Column % HS Grad has 78 null values
Column % Some College has 78 null values
Column % Bach Degree has 78 null values
Column % Grad Degree has 78 null values
Column % Below Pov Level has 79 null values
Column Commute Time has 551 null values
Column Median Income has 1 null values
Column Median Home Value has 0 null values


In [14]:
# Preview the selected modelling columns.
model_df

Unnamed: 0,state,% Housing Units Occupied,% Unemployment Rate,% Born in State,% <HS,% HS Grad,% Some College,% Bach Degree,% Grad Degree,% Below Pov Level,Commute Time,Median Income,Median Home Value
0,28,0.847529,0.084,0.868199,0.138534,0.196534,0.187635,0.073822,0.047148,0.329100,277765.0,19069.0,74700.0
1,28,0.811922,0.035,0.826072,0.127702,0.273528,0.205687,0.057699,0.016960,0.185484,,19724.0,83700.0
2,28,0.753333,0.045,0.846773,0.136282,0.264511,0.173056,0.075111,0.047951,0.226896,,18967.0,81600.0
3,28,0.846146,0.022,0.752598,0.143739,0.202002,0.232411,0.056942,0.032879,0.145997,228155.0,21698.0,89500.0
4,28,0.709740,0.045,0.867213,0.130714,0.239115,0.241978,0.068319,0.035443,0.145819,,22091.0,89300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,19,0.840972,0.014,0.735344,0.062981,0.311340,0.217746,0.094500,0.028067,0.088502,186660.0,28407.0,121000.0
3216,19,0.910077,0.038,0.583860,0.140178,0.185686,0.180553,0.086426,0.037266,0.123741,136945.0,28553.0,112700.0
3217,19,0.759938,0.022,0.825745,0.050497,0.272063,0.243020,0.102398,0.036818,0.110081,136575.0,30675.0,115900.0
3218,19,0.892578,0.024,0.783713,0.054359,0.250993,0.247543,0.113841,0.031779,0.115304,83660.0,29958.0,102300.0


In [15]:
# Keep only complete rows, then one-hot encode the non-numeric columns
# (the object-typed 'state' column becomes one indicator column per state).
complete_rows = model_df.dropna()
model_df = pd.get_dummies(complete_rows)
model_df

Unnamed: 0,% Housing Units Occupied,% Unemployment Rate,% Born in State,% <HS,% HS Grad,% Some College,% Bach Degree,% Grad Degree,% Below Pov Level,Commute Time,...,state_46.0,state_47.0,state_48.0,state_49.0,state_50.0,state_51.0,state_53.0,state_54.0,state_55.0,state_56.0
0,0.847529,0.084,0.868199,0.138534,0.196534,0.187635,0.073822,0.047148,0.329100,277765.0,...,0,0,0,0,0,0,0,0,0,0
3,0.846146,0.022,0.752598,0.143739,0.202002,0.232411,0.056942,0.032879,0.145997,228155.0,...,0,0,0,0,0,0,0,0,0,0
9,0.849791,0.136,0.890667,0.196283,0.201980,0.194384,0.051275,0.024824,0.345768,59630.0,...,0,0,0,0,0,0,0,0,0,0
10,0.846609,0.050,0.858708,0.147398,0.157738,0.181977,0.091372,0.060843,0.329007,188085.0,...,0,0,0,0,0,0,0,0,0,0
11,0.903702,0.062,0.691064,0.060827,0.154517,0.211244,0.146154,0.085817,0.164138,696795.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0.840972,0.014,0.735344,0.062981,0.311340,0.217746,0.094500,0.028067,0.088502,186660.0,...,0,0,0,0,0,0,0,0,0,0
3216,0.910077,0.038,0.583860,0.140178,0.185686,0.180553,0.086426,0.037266,0.123741,136945.0,...,0,0,0,0,0,0,0,0,0,0
3217,0.759938,0.022,0.825745,0.050497,0.272063,0.243020,0.102398,0.036818,0.110081,136575.0,...,0,0,0,0,0,0,0,0,0,0
3218,0.892578,0.024,0.783713,0.054359,0.250993,0.247543,0.113841,0.031779,0.115304,83660.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Split our preprocessed data into our features and target arrays.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
y = model_df["Median Home Value"]
X = model_df.drop(columns=["Median Home Value"])

# Split the preprocessed data into a training and testing dataset
# (fixed random_state so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [17]:
# Scaling is currently disabled: everything below is commented out and the cell
# only displays the feature matrix X.
# NOTE(review): if this is re-enabled, create two separate StandardScaler
# instances. As written, `scaler.fit(y_train)` refits the same object that
# `X_scaler` refers to, so X_scaler and y_scaler would be one scaler fitted on
# y_train. StandardScaler presumably also needs y reshaped to 2-D -- TODO confirm.

# Create a StandardScaler instance
#scaler = StandardScaler()

# Fit the StandardScaler
#X_scaler = scaler.fit(X_train)
#y_scaler = scaler.fit(y_train)

# Scale the data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)
X

Unnamed: 0,% Housing Units Occupied,% Unemployment Rate,% Born in State,% <HS,% HS Grad,% Some College,% Bach Degree,% Grad Degree,% Below Pov Level,Commute Time,...,state_46.0,state_47.0,state_48.0,state_49.0,state_50.0,state_51.0,state_53.0,state_54.0,state_55.0,state_56.0
0,0.847529,0.084,0.868199,0.138534,0.196534,0.187635,0.073822,0.047148,0.329100,277765.0,...,0,0,0,0,0,0,0,0,0,0
3,0.846146,0.022,0.752598,0.143739,0.202002,0.232411,0.056942,0.032879,0.145997,228155.0,...,0,0,0,0,0,0,0,0,0,0
9,0.849791,0.136,0.890667,0.196283,0.201980,0.194384,0.051275,0.024824,0.345768,59630.0,...,0,0,0,0,0,0,0,0,0,0
10,0.846609,0.050,0.858708,0.147398,0.157738,0.181977,0.091372,0.060843,0.329007,188085.0,...,0,0,0,0,0,0,0,0,0,0
11,0.903702,0.062,0.691064,0.060827,0.154517,0.211244,0.146154,0.085817,0.164138,696795.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0.840972,0.014,0.735344,0.062981,0.311340,0.217746,0.094500,0.028067,0.088502,186660.0,...,0,0,0,0,0,0,0,0,0,0
3216,0.910077,0.038,0.583860,0.140178,0.185686,0.180553,0.086426,0.037266,0.123741,136945.0,...,0,0,0,0,0,0,0,0,0,0
3217,0.759938,0.022,0.825745,0.050497,0.272063,0.243020,0.102398,0.036818,0.110081,136575.0,...,0,0,0,0,0,0,0,0,0,0
3218,0.892578,0.024,0.783713,0.054359,0.250993,0.247543,0.113841,0.031779,0.115304,83660.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Fit a 50-tree random forest regressor on the (unscaled) training split.
from sklearn.ensemble import RandomForestRegressor

forest_params = {"n_estimators": 50, "random_state": 0}
regressor = RandomForestRegressor(**forest_params).fit(X_train, y_train)
regressor

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [19]:
# Score the held-out predictions with the coefficient of determination (R^2).
from sklearn.metrics import r2_score

print("R^2 Score")
y_pred = regressor.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

R^2 Score


0.82468435346738

In [20]:
# Explained variance of the held-out predictions (uses y_pred from the R^2 cell).
from sklearn.metrics import explained_variance_score

print("Explained variance")
explained_variance_score(y_true=y_test, y_pred=y_pred)

Explained variance


0.8246847360492006

In [29]:
# Pair each feature with its importance and rank the pairs from most to least important.
importance_pairs = zip(regressor.feature_importances_, X.columns)
importances = sorted(importance_pairs, reverse=True)

print("Top 10 Values ranked by importance")
importances[:10]

Top 10 Values ranked by importance


[(0.46397967118915356, '% Bach Degree'),
 (0.10892710563538276, '% Grad Degree'),
 (0.10815623905276753, 'Median Income'),
 (0.07698852656004797, 'state_6.0'),
 (0.062301264857016085, '% Born in State'),
 (0.03439514930618599, 'Commute Time'),
 (0.02550372360262147, '% Housing Units Occupied'),
 (0.020110775588303632, '% <HS'),
 (0.019427128674394594, '% Below Pov Level'),
 (0.017131277861308344, 'state_15.0')]

In [22]:
# Predict the median home value for the row labelled 3219.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts. Selecting with a list keeps a one-row DataFrame, so the model
# receives the same column names it was trained with.
regressor.predict(model_df.drop(columns=["Median Home Value"]).loc[[3219]])


array([147478.])

In [28]:
# What-if scenario: raise the occupancy share of row 3219 by 10% and re-predict.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts. `.copy()` makes it explicit that the edit below never touches model_df.
hypo = model_df.drop(columns=["Median Home Value"]).loc[3219].copy()
hypo["% Housing Units Occupied"] = hypo["% Housing Units Occupied"] * 1.1
print(hypo["% Housing Units Occupied"])
# Wrap the row in a DataFrame so the model receives named feature columns.
regressor.predict(pd.DataFrame([hypo]))


0.9902952715259138


array([150312.])

In [24]:
# Build the per-county table behind the choropleth: sorted by FIPS, with a 1-10
# decile rank of median home value, indexed by FIPS and exported as JSON.
choro_columns = [
    'fips',
    "NAME",
    "% Housing Units Occupied",
    "% Unemployment Rate",
    "% Born in State",
    "% <HS",
    "% HS Grad",
    "% Some College",
    "% Bach Degree",
    "% Grad Degree",
    "% Below Pov Level",
    "Commute Time",
    "Median Income",
    "Median Home Value",
]
choro_df = (
    acs_df[choro_columns]
    .sort_values(by=['fips'])
    # qcut labels the bins 0-9; shift them to 1-10.
    .assign(decile=lambda frame: pd.qcut(frame['Median Home Value'], 10, labels=False) + 1)
    .set_index('fips')
)
choro_df.to_json("Resources/county.json", orient='index')
choro_df

Unnamed: 0_level_0,NAME,% Housing Units Occupied,% Unemployment Rate,% Born in State,% <HS,% HS Grad,% Some College,% Bach Degree,% Grad Degree,% Below Pov Level,Commute Time,Median Income,Median Home Value,decile
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001.0,"Autauga County, Alabama",0.905640,0.025,0.694511,0.076159,0.219547,0.191159,0.106938,0.079493,0.152572,612795.0,29667.0,147900.0,7
1003.0,"Baldwin County, Alabama",0.702327,0.026,0.537526,0.068763,0.194991,0.221160,0.146228,0.075173,0.104047,2354950.0,28632.0,189800.0,9
1005.0,"Barbour County, Alabama",0.769540,0.044,0.682065,0.190094,0.251571,0.177100,0.054961,0.031146,0.255876,191690.0,18138.0,92900.0,3
1007.0,"Bibb County, Alabama",0.746643,0.036,0.812092,0.117637,0.331646,0.170728,0.053136,0.027345,0.127092,226250.0,22298.0,96500.0,3
1009.0,"Blount County, Alabama",0.850467,0.020,0.795472,0.136369,0.234001,0.230150,0.055807,0.031104,0.142597,724905.0,26509.0,124700.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72145.0,"Vega Baja Municipio, Puerto Rico",,0.076,,,,,,,,414650.0,12952.0,107600.0,4
72147.0,"Vieques Municipio, Puerto Rico",,0.062,,,,,,,,,13886.0,104200.0,4
72149.0,"Villalba Municipio, Puerto Rico",,0.105,,,,,,,,,11598.0,90900.0,3
72151.0,"Yabucoa Municipio, Puerto Rico",,0.100,,,,,,,,,11651.0,88100.0,2


In [25]:
# Load the county-boundary GeoJSON from the local Resources folder.
with open('Resources/us-counties.json', 'r') as geojson_file:
    counties = json.loads(geojson_file.read())

In [26]:
# Attach each county's median home value and decile to its GeoJSON feature.
for feature in counties['features']:
    properties = feature['properties']
    try:
        # choro_df is indexed by FIPS stored as a float, so convert the id first.
        fips = float(feature['id'])
        home_value = int(choro_df.loc[fips, "Median Home Value"])
        decile = int(choro_df.loc[fips, "decile"])
    except (KeyError, ValueError, TypeError):
        # Only the expected failures are handled here (instead of a bare
        # `except:`): an id missing from choro_df, an id that is not numeric, or
        # a missing (NaN) / non-scalar value that cannot be converted to int.
        # As before, the feature gets 'NA' for home_value and no decile is set.
        properties['home_value'] = 'NA'
    else:
        properties['home_value'] = home_value
        properties['decile'] = decile
counties

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'id': '1001',
   'properties': {'name': 'Autauga', 'home_value': 147900, 'decile': 7},
   'geometry': {'type': 'Polygon',
    'coordinates': [[[-86.411786, 32.706342],
      [-86.411786, 32.410587],
      [-86.499417, 32.344863],
      [-86.817079, 32.339387],
      [-86.915664, 32.662526],
      [-86.411786, 32.706342]]]}},
  {'type': 'Feature',
   'id': '1003',
   'properties': {'name': 'Baldwin', 'home_value': 189800, 'decile': 9},
   'geometry': {'type': 'Polygon',
    'coordinates': [[[-87.76459, 31.298768],
      [-87.616713, 31.243998],
      [-87.600282, 30.997536],
      [-87.518128, 30.280057],
      [-88.005575, 30.685351],
      [-87.972714, 31.161844],
      [-87.945329, 31.194706],
      [-87.76459, 31.298768]]]}},
  {'type': 'Feature',
   'id': '1005',
   'properties': {'name': 'Barbour', 'home_value': 92900, 'decile': 3},
   'geometry': {'type': 'Polygon',
    'coordinates': [[[-85.354736, 32.147694],
  

In [27]:
# Save the annotated GeoJSON for the choropleth map.
with open('Resources/choro.json', 'w') as choro_file:
    choro_file.write(json.dumps(counties))