<b>Create Sidebar Navigation - best viewed in full screen</b>

In [1]:
%%javascript
// Create an empty <div id="toc"> pinned to the left edge of the page, 120px from the top,
// and attach it to the document body.
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
// Download and execute the remote script; presumably it fills the #toc container with navigation links.
// NOTE(review): this runs third-party code fetched at view time — confirm the source is trusted and still available.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

<IPython.core.display.Javascript object>

# Read In Data

In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the cleaned housing records so each house can later be matched with a noise value
housing_csv_path = "../resources/in_progress/housing_data_clean.csv"
housing_df = pd.read_csv(housing_csv_path)

# Preview the first record
housing_df.head(1)

Unnamed: 0,Total_Value,Address,City,Zip_Code,Planning_Jurisdiction,Zoning,Acreage,Sqft,Age,Bath,Remodel_Addition,Style
0,249321,2457 BERTIE DR,RALEIGH,27610,9,R-4,0.21,1828,59,2.0,0,8


# Create Starter DataFrame

In [3]:
# Build the geocoding input in a single step:
#   - Address and City are taken from the housing data
#   - State is a constant "NC" column (set to North Carolina for easier geocoding)
#   - Zip_Code is appended as the last column
noise_starter_df = housing_df[["Address", "City"]].assign(
    State="NC",
    Zip_Code=housing_df["Zip_Code"],
)

noise_starter_df.head(1)

Unnamed: 0,Address,City,State,Zip_Code
0,2457 BERTIE DR,RALEIGH,NC,27610


# Geocode Process

## Round I
Uses https://geocoding.geo.census.gov/geocoder/locations/addressbatch?form to geocode addresses

In [4]:
# Export the addresses in files of 9,999 rows each for batch geocoding.
# (The geocoder states a 10,000-row limit but will not accept it; 9,999 rows at a time works.)
import numpy as np
no_of_rows = 9999

# Consecutive rows share a batch id: rows 0..9998 -> 0, the next 9,999 rows -> 1, and so on
batch_ids = np.arange(len(noise_starter_df)) // no_of_rows
for batch_number, batch in noise_starter_df.groupby(batch_ids):
    # Files are numbered from 1; the index is written too, as in the default to_csv behaviour
    batch_path = "../resources/in_progress/addresses_to_geocode/set_{}.csv".format(batch_number + 1)
    batch.to_csv(batch_path)

In [6]:
# Read in and combine geocoded addresses into singular dataframe
import glob
import os

joined_files = os.path.join("../resources/in_progress/addresses_geocoded", "*.csv")
# glob returns matches in arbitrary order; sort them so the combined row order
# (and everything derived from it) is the same on every run
joined_list = sorted(glob.glob(joined_files))
if not joined_list:
    # Fail with a clear message instead of pandas' generic "No objects to concatenate"
    raise FileNotFoundError("No geocoded CSV files found matching {}".format(joined_files))

# Each file keeps its own row index, so index labels repeat across files in the combined frame
first_df = pd.concat(map(pd.read_csv, joined_list))
first_df.head(1)

Unnamed: 0.1,Unnamed: 0,"Address, City, State, Zip_Code",Match,Type,Address Matched,Coordinates,Tiger_Line_ID,Side
0,17288,"1505 BASEWOOD DR, RALEIGH, NC, 27609",Match,Exact,"1505 BASEWOOD DR, RALEIGH, NC, 27609","-78.61314566299995,35.83405377100007",72507908.0,L


### Matched Address Count

In [7]:
# Count the rows for each value of the geocoder's "Match" column to see how many addresses matched
first_df["Match"].value_counts()

Match       228517
No_Match     12995
Tie            394
Name: Match, dtype: int64

## Round II
Uses https://www.geoapify.com/tools/geocoding-online to geocode the addresses that were not matched in Round I

In [8]:
# Collect the addresses the first pass could not resolve (no match, or a tie) for a second attempt;
# only the combined address column is kept
needs_retry = first_df["Match"].isin(["No_Match", "Tie"])
second_df_start = first_df.loc[needs_retry, ["Address, City, State, Zip_Code"]]
second_df_start.head(1)

Unnamed: 0,"Address, City, State, Zip_Code"
2,"1143 OLD US 264 HWY, ZEBULON, NC, 27597"


In [9]:
# Break the single combined column into one column per address field.
# Both the header and every value are split on "," only, so any space that follows a comma
# stays attached to the next piece (field names and values after the first keep that leading space).
combined_header = second_df_start.columns.tolist()[0]
field_names = combined_header.split(",")

# One list of fields per row, taken from the only column of the frame
field_rows = [row[0].split(",") for row in second_df_start.values.tolist()]

# Rebuild the frame from the split values; this also gives it a fresh 0..n-1 index
second_df_start = pd.DataFrame(field_rows, columns=field_names)
second_df_start.head(1)

Unnamed: 0,Address,City,State,Zip_Code
0,1143 OLD US 264 HWY,ZEBULON,NC,27597


In [10]:
# Export the unresolved addresses in files of 500 rows each for the second geocoding attempt
no_of_rows = 500

# Consecutive rows share a batch id (integer division of the row position by the batch size)
retry_batch_ids = np.arange(len(second_df_start)) // no_of_rows
for batch_number, batch in second_df_start.groupby(retry_batch_ids):
    # Files are numbered from 1
    retry_path = "../resources/in_progress/addresses_to_retry/set_{}.csv".format(batch_number + 1)
    batch.to_csv(retry_path)

In [11]:
# Read in and combine geocoded addresses into singular dataframe
joined_files_2 = os.path.join("../resources/in_progress/addresses_retried", "*.csv")
# glob returns matches in arbitrary order; sort them so the combined row order is reproducible
joined_list_2 = sorted(glob.glob(joined_files_2))
if not joined_list_2:
    # Fail with a clear message instead of pandas' generic "No objects to concatenate"
    raise FileNotFoundError("No retried CSV files found matching {}".format(joined_files_2))

# Each file keeps its own row index, so index labels repeat across files in the combined frame
second_df = pd.concat(map(pd.read_csv, joined_list_2))
second_df.head(1)

Unnamed: 0,original_,original_Address,original_ City,original_ State,original_ Zip_Code,lat,lon,formatted,housenumber,name,...,state_code,country,country_code,confidence,confidence_city_level,confidence_street_level,attribution,attribution_license,attribution_url,suburb
0,7500,8801 GREEN ARBOR CT,WAKE FOREST,NC,27587,36.017726,-78.568723,"Green Arbor Court, Wake County, NC 27587, Unit...",,Green Arbor Court,...,NC,United States,us,0.5,1.0,1.0,© OpenStreetMap contributors,Open Database License,https://www.openstreetmap.org/copyright,


## Clean Geocoding Results

In [12]:
# Clean first dataframe
# Split the combined address string into its four fields. The fields are separated by ", ",
# so every piece is stripped; splitting on "," alone would leave a leading space in
# City, State and Zip_Code.
address_parts = first_df["Address, City, State, Zip_Code"].str.split(",", expand=True)
first_df[["Address", "City", "State", "Zip_Code"]] = address_parts.apply(lambda part: part.str.strip())

# Coordinates are stored as a single "longitude,latitude" string
first_df[["Longitude", "Latitude"]] = first_df.Coordinates.str.split(",", expand=True)

# Discard the geocoder bookkeeping columns and the now-redundant combined/state columns
first_df = first_df.drop(columns=["Address, City, State, Zip_Code", "Coordinates", "Unnamed: 0", "Match", "Type",
                                  "Address Matched", "Tiger_Line_ID", "Side", "State"])

# Rows with any missing value are removed; this includes rows without coordinates
first_df = first_df.dropna()
first_df = first_df[["Address", "City", "Zip_Code", "Latitude", "Longitude"]]
first_df.head(1)

Unnamed: 0,Address,City,Zip_Code,Latitude,Longitude
0,1505 BASEWOOD DR,RALEIGH,27609,35.83405377100007,-78.61314566299995


In [13]:
# Clean second dataframe
# Keep only the echoed input address fields and the returned coordinates, renamed to match first_df.
# The rename is chained (no inplace=True) so the selected columns are never modified through a slice.
second_df = (
    second_df[["original_Address", "original_ City", "original_ Zip_Code", "lat", "lon"]]
    .rename(columns={"original_Address": "Address", "original_ City": "City", "original_ Zip_Code": "Zip_Code",
                     "lat": "Latitude", "lon": "Longitude"})
    # The space in the "original_ City" header comes from splitting the address on "," alone;
    # city values can carry the same leading space, so trim it (non-string values are left untouched)
    .assign(City=lambda frame: frame["City"].map(lambda city: city.strip() if isinstance(city, str) else city))
)
second_df.head(1)

Unnamed: 0,Address,City,Zip_Code,Latitude,Longitude
0,8801 GREEN ARBOR CT,WAKE FOREST,27587,36.017726,-78.568723


# Merge and Clean Datasets

In [14]:
# Stack both geocoding rounds, order the rows by address, renumber the index from 0,
# and make sure both coordinate columns are numeric (float64)
merged_latlong = (
    pd.concat([first_df, second_df], ignore_index=True)
    .sort_values(by=["Address"])
    .reset_index(drop=True)
    .astype({"Latitude": "float64", "Longitude": "float64"})
)
merged_latlong.head()

Unnamed: 0,Address,City,Zip_Code,Latitude,Longitude
0,0 ADAMS MOUNTAIN RD,RALEIGH,27614,35.952688,-78.646766
1,0 BAILEYWICK RD,RALEIGH,27613,35.912011,-78.69808
2,0 BALLENTINE DAIRY RD,FUQUAY VARINA,27526,35.612817,-78.783274
3,0 BASS LAKE RD,HOLLY SPRINGS,27540,35.647025,-78.818525
4,0 BILLY HOPKINS RD,ZEBULON,27597,35.825658,-78.315189


## Check for and remove incorrect values

In [15]:
import geopandas as gpd
import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)

# Build one point per row from its (longitude, latitude) pair and keep it in a helper column
merged_latlong["Geometry"] = gpd.points_from_xy(
    x=merged_latlong["Longitude"],
    y=merged_latlong["Latitude"],
    crs="EPSG:4326",
)

# Turn the table into a geodataframe that uses those points as its geometry,
# then drop the helper "Geometry" column
geo_df = gpd.GeoDataFrame(
    merged_latlong,
    geometry=merged_latlong["Geometry"],
    crs="EPSG:4326",
).drop(columns=["Geometry"])

# Load the Wake County boundary
# Boundary values from https://data.wakegov.com/datasets/Wake::wake-county-line/explore
wake = gpd.read_file("../resources/original/Wake_County_Line.geojson")

In [16]:
# Flag every point by whether it lies within the boundary geometry stored in row 0 of the Wake County file
wake_boundary = wake.at[0, "geometry"]
geo_df["In_Wake"] = geo_df.within(wake_boundary)

# Keep only the rows whose point falls inside that boundary
geo_df = geo_df.loc[geo_df["In_Wake"]]
geo_df.head()

Unnamed: 0,Address,City,Zip_Code,Latitude,Longitude,geometry,In_Wake
0,0 ADAMS MOUNTAIN RD,RALEIGH,27614,35.952688,-78.646766,POINT (-78.64677 35.95269),True
1,0 BAILEYWICK RD,RALEIGH,27613,35.912011,-78.69808,POINT (-78.69808 35.91201),True
2,0 BALLENTINE DAIRY RD,FUQUAY VARINA,27526,35.612817,-78.783274,POINT (-78.78327 35.61282),True
3,0 BASS LAKE RD,HOLLY SPRINGS,27540,35.647025,-78.818525,POINT (-78.81852 35.64702),True
4,0 BILLY HOPKINS RD,ZEBULON,27597,35.825658,-78.315189,POINT (-78.31519 35.82566),True


# Update and export housing data

In [17]:
# Update housing data to only include geolocated addresses, add latitude and longitude, and export to CSV.
#
# Houses are matched to coordinates on address, city and zip code together. A street address on its
# own is not a unique key (the same address can exist in more than one city), so joining on "Address"
# alone can attach another property's coordinates to a house and multiply rows.

def _location_key(frame):
    """Return comparable join keys built from a frame's Address, City and Zip_Code columns.

    Text fields are compared with surrounding whitespace removed and zip codes are compared
    as numbers, so padded text or text-typed zip codes still match the housing data.
    """
    return pd.DataFrame(
        {
            "_address_key": frame["Address"].astype(str).str.strip(),
            "_city_key": frame["City"].astype(str).str.strip(),
            "_zip_key": pd.to_numeric(frame["Zip_Code"].astype(str).str.strip(), errors="coerce"),
        },
        index=frame.index,
    )

_key_columns = ["_address_key", "_city_key", "_zip_key"]

# One coordinate pair per location, so every housing record appears at most once in the result
coordinate_lookup = (
    pd.concat([_location_key(geo_df), geo_df[["Latitude", "Longitude"]]], axis=1)
    .drop_duplicates(subset=_key_columns)
)

updated_housing_df = (
    pd.concat([housing_df, _location_key(housing_df)], axis=1)
    # validate guards against the lookup ever containing repeated keys
    .merge(coordinate_lookup, on=_key_columns, how="inner", validate="many_to_one")
    # The helper keys are only needed for matching; the housing columns are left as they were
    .drop(columns=_key_columns)
)
updated_housing_df.to_csv("../resources/clean/housing_data_with_coordinates.csv", index=False)
updated_housing_df.head()

Unnamed: 0,Total_Value,Address,City,Zip_Code,Planning_Jurisdiction,Zoning,Acreage,Sqft,Age,Bath,Remodel_Addition,Style,Latitude,Longitude
0,249321,2457 BERTIE DR,RALEIGH,27610,9,R-4,0.21,1828,59,2.0,0,8,35.785561,-78.600881
1,159933,2848 PROVIDENCE RD,RALEIGH,27610,9,R-4,0.46,1240,53,1.0,0,0,35.743016,-78.573618
2,222624,409 S LAKESIDE DR,RALEIGH,27606,9,R-4,0.43,1037,24,2.0,0,0,35.774212,-78.7289
3,150723,540 MARSHBURN RD,WENDELL,27591,13,R3,0.46,2261,123,2.0,0,0,35.787242,-78.374324
4,140801,605 WOODLAND RD,RALEIGH,27603,5,R2,0.51,996,67,1.0,0,0,35.701073,-78.6459


# Export Noise Starter CSV for QGIS

In [18]:
# The point geometry and the county flag are no longer needed; keep address fields and coordinates only
columns_to_remove = ["geometry", "In_Wake"]
geo_df = geo_df.drop(columns_to_remove, axis="columns")
geo_df.head(1)

Unnamed: 0,Address,City,Zip_Code,Latitude,Longitude
0,0 ADAMS MOUNTAIN RD,RALEIGH,27614,35.952688,-78.646766


In [20]:
# Send latitude and longitude values to CSV for QGIS work (add decibel levels via map rasters).
# index=False leaves the dataframe's row index out of the file.
geo_df.to_csv("../resources/in_progress/geocoded_addresses(noise_starter).csv", index=False)