In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")

In [2]:
# Load the raw county table; the path is resolved relative to the notebook's
# working directory.
# NOTE(review): this reads from "data/..." while the export in the last cell
# writes under "../data/processed/..." — confirm both roots are intended.
df = pd.read_csv("data/county_centers.csv")
def tidy_coordinates(string):
    """Return a coordinate string stripped of its typographic decorations.

    Plus signs and degree symbols are removed, and an en dash (U+2013) is
    replaced by an ASCII hyphen so it can act as a minus sign. Every other
    character is left untouched.
    """
    decoration_table = str.maketrans({"+": None, "°": None, "\u2013": "-"})
    return string.translate(decoration_table)

# Latitude/Longitude are stored as decorated text; strip the decorations with
# tidy_coordinates and convert both columns to numbers.
lats = df.Latitude
df["Latitude"] = pd.to_numeric(lats.apply(tidy_coordinates))

lons = df.Longitude
df["Longitude"] = pd.to_numeric(lons.apply(tidy_coordinates))

# Replace the raw headers with snake_case names. The mapping is purely
# positional, so the order below must match the column order of the CSV.
# NOTE(review): "total_area_km" lacks the "2" suffix its siblings carry; it is
# left unchanged because the name ends up in the exported file.
new_col_names = ["id", "state", "fips", "county", "county_seat", "population",
                 "land_area_km2", "land_area_mi2", "water_area_km2",
                 "water_area_mi2", "total_area_km", "total_area_mi2",
                 "latitude", "longitude"]

# zip() stops at the shorter sequence without complaint, which would leave
# columns misnamed; refuse to continue when the counts disagree.
if len(df.columns) != len(new_col_names):
    raise ValueError(
        f"expected {len(new_col_names)} columns, found {len(df.columns)}: "
        f"{list(df.columns)}"
    )

rename_dict = dict(zip(df.columns, new_col_names))

df = df.rename(columns=rename_dict)
df = df.set_index("id")

# clean up numeric cols and convert to numeric dtypes
def tidy_numbers(string):
    """Remove thousands separators and hyphens from a numeric string.

    Every "," and "-" is dropped, so "182,265" becomes "182265" and a lone
    "-" becomes the empty string. Note that a leading minus sign is removed
    as well, so this is only suitable for non-negative quantities.

    Parameters
    ----------
    string : str or any
        The cell value to clean. Values that are not ``str`` (for example NaN
        for a missing cell, or a number in a column that was already parsed
        as numeric) are returned unchanged instead of raising
        ``AttributeError``.

    Returns
    -------
    str or any
        The cleaned string, or the original value when it was not a string.
    """
    if not isinstance(string, str):
        # Nothing to strip from a non-text value; hand it back as-is so the
        # later numeric conversion can deal with it.
        return string
    return string.replace(",", "").replace("-", "")

# Columns holding counts/areas as formatted text that must become numbers.
numeric_cols = [
    "population",
    "land_area_km2",
    "land_area_mi2",
    "water_area_km2",
    "water_area_mi2",
    "total_area_km",
    "total_area_mi2",
]

# Clean each cell with tidy_numbers, then let pandas choose the numeric dtype.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col].apply(tidy_numbers))

# Preview the cleaned table.
df.head()

Unnamed: 0_level_0,state,fips,county,county_seat,population,land_area_km2,land_area_mi2,water_area_km2,water_area_mi2,total_area_km,total_area_mi2,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,AL,1001,Autauga,Prattville,54571,1539.582,594.436,25.776,9.952,1565.358,604.388,32.536382,-86.64449
2,AL,1003,Baldwin,Bay Minette,182265,4117.522,1589.784,1133.19,437.527,5250.712,2027.311,30.659218,-87.746067
3,AL,1005,Barbour,Clayton,27457,2291.819,884.876,50.865,19.639,2342.684,904.515,31.87067,-85.405456
4,AL,1007,Bibb,Centreville,22915,1612.481,622.582,9.289,3.587,1621.77,626.169,33.015893,-87.127148
5,AL,1009,Blount,Oneonta,57322,1669.962,644.776,15.157,5.852,1685.119,650.628,33.977448,-86.567246


In [3]:
# Flag each county as inside or outside CONUS (the 48 contiguous states plus
# the District of Columbia) with a boolean "conus" column.
# Alaska, Hawaii and Puerto Rico are the codes treated as non-CONUS here.
# source for codes: https://www.iso.org/obp/ui/#iso:code:3166:US
non_conus_states = ["AK", "HI", "PR"]
df["conus"] = ~df.state.isin(non_conus_states)

In [4]:
# Turn each (longitude, latitude) pair into a point geometry and attach it to
# the table, then export the result as CSV.
# NOTE(review): no CRS is assigned to the geometries — confirm whether one
# should be set before this frame is used for spatial operations.
county_points = gpd.points_from_xy(df.longitude, df.latitude)
gdf = gpd.GeoDataFrame(df, geometry=county_points)
gdf.to_csv("../data/processed/county_geodata/county_centers_cleaned.csv")