In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic


In [2]:
# Import datasets 

# HSI (Hazardous Site Inventory) contains a list of contaminated sites in Georgia that need to be cleaned up
# Use to identify contaminated sites in Georgia (includes landfills, superfund)
df_hsi = pd.read_excel("../../data/raw/scoring_indicators/desirable_undesirable_activities/hsi/July-2024-Hazardous-Site-Inventory.xlsx")

# TRI (Toxic Release Inventory) contains how much toxic chemicals are released into the environment
# Use to identify sites in Georgia that release toxic chemicals
df_tri = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/tri/waste_hazardous_chemicals.csv")

# RCRA (Resource Conservation and Recovery Act) contains information on hazardous waste management
# Use to identify landfills, waste treatment, storage, and disposal facilities in Georgia
df_rcra = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/rcra/RCRA_FACILITIES_GA.csv")

# Food Access Research Atlas contains information on food access in Georgia
# Use to identify food deserts in Georgia
df_food_deserts = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/usda/food_access_research_atlas.csv")

# CDR (Chemical Data Reporting) identifies where chemicals are handled, but not whether or not they are hazardous
# Using this dataset to help identify chemical and manufacturing activities in Georgia (note: only includes manufacturers that handle chemicals)
df_cdr = pd.read_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/cdr_industrial_manufacturing_facilities.csv")

# FRS (Facility Registry System) contains information on facilities and their industries 
# Use to further identify chemical and manufacturing activities, as well as livestock activities in Georgia
df_frs = pd.read_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/frs_facilities_naics_sic.csv")

df_undes_google_places = pd.read_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/ga_undesirable_rough.csv")

# gdf_wetland = gpd.read_file("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/ga_wetlands_cleaned.gpkg")

  df_frs = pd.read_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/frs_facilities_naics_sic.csv")


In [3]:
# Create a cleaned and standardized version of df_hsi
df_hsi_clean = df_hsi.copy()
df_hsi_clean['data_source'] = 'hsi'
df_hsi_clean['undesirable_activity'] = 'hazardous_inventory'
df_hsi_clean = df_hsi_clean.rename(columns={
    'Site Name': 'site_name',
    'Address': 'site_address',
    'City': 'site_city',
    'County': 'site_county',
    'Lattitude': 'site_latitude',
    'Longitude': 'site_longitude'
})
df_hsi_final = df_hsi_clean[[
    'data_source', 'site_name', 'site_address', 'site_city', 
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]

In [4]:
# Create a cleaned and standardized version of df_tri
df_tri_clean = df_tri.copy()
df_tri_clean['data_source'] = 'tri'
df_tri_clean['undesirable_activity'] = 'chemical_activity'
df_tri_clean = df_tri_clean.rename(columns={
    'TRI Facility Name': 'site_name',
    'Latitude': 'site_latitude',
    'Longitude': 'site_longitude'
})
df_tri_clean['site_address'] = None  
df_tri_clean['site_city'] = None     
df_tri_clean['site_county'] = None   
df_tri_final = df_tri_clean[[
    'data_source', 'site_name', 'site_address', 'site_city', 
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]

In [5]:
# Filter and clean df_cdr
df_cdr_filtered = df_cdr[df_cdr['SITE NAICS CODE 1'].str[:2].isin(['31', '32', '33'])]
df_cdr_filtered = df_cdr_filtered.copy()
df_cdr_filtered['data_source'] = 'cdr'
df_cdr_filtered['undesirable_activity'] = 'heavy_or_chemical_manufacturing'
df_cdr_filtered = df_cdr_filtered.rename(columns={
    'SITE NAME': 'site_name',
    'SITE ADDRESS LINE1': 'site_address',
    'SITE CITY': 'site_city',
    'SITE COUNTY / PARISH': 'site_county',
    'SITE LATITUDE': 'site_latitude',
    'SITE LONGITUDE': 'site_longitude'
})
df_cdr_final = df_cdr_filtered[[
    'data_source', 'site_name', 'site_address', 'site_city', 
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]

In [6]:
df_final_v1 = pd.concat([df_hsi_final, df_tri_final, df_cdr_final], ignore_index=True)

In [7]:
# Mapping of letters to undesirable activities
enforcement_map = {
    'L': 'land_disposal',
    'I': 'incinerator',
    'B': 'industrial_furnace',
    'S': 'storage',
    'T': 'treatment',
    'H': 'solid_waste_management'
}

# Function to extract activities from FULL_ENFORCEMENT string
def map_enforcement(enforcement_str):
    return ', '.join([activity for letter, activity in enforcement_map.items() if letter in enforcement_str]) or None

# Apply the function to create the undesirable_activity column
df_rcra['undesirable_activity'] = df_rcra['FULL_ENFORCEMENT'].apply(map_enforcement)

In [8]:
df_rcra_filtered = df_rcra[df_rcra['undesirable_activity'].notna()].copy()

In [9]:
df_rcra_filtered['data_source'] = 'rcra'
df_rcra_filtered = df_rcra_filtered.rename(columns={
    'FACILITY_NAME': 'site_name',
    'STREET_ADDRESS': 'site_address',
    'CITY_NAME': 'site_city',
    'LATITUDE83': 'site_latitude',
    'LONGITUDE83': 'site_longitude'
})
df_rcra_filtered['site_county'] = None

# Reorder columns to match df_final
df_rcra_final = df_rcra_filtered[[
    'data_source', 'site_name', 'site_address', 'site_city',
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]

In [10]:
df_final_v2 = pd.concat([df_final_v1, df_rcra_final], ignore_index=True)

In [11]:
# Filter and clean df_frs
df_frs['naics_or_sic_code'] = df_frs['naics_or_sic_code'].astype(str)
df_frs['undesirable_activity'] = None

# NAICS conditions
naics_mask = df_frs['naics_or_sic'] == 'NAICS'

df_frs.loc[
    naics_mask & df_frs['naics_or_sic_code'].str[:2].isin(['31', '32', '33']),
    'undesirable_activity'
] = 'heavy_chemical_manufacturing'

df_frs.loc[
    naics_mask & (df_frs['naics_or_sic_code'].str[:2] == '21'),
    'undesirable_activity'
] = 'environmental_hazards'

df_frs.loc[
    naics_mask & (df_frs['naics_or_sic_code'].str[:3] == '112'),
    'undesirable_activity'
] = 'commercial_livestock'

# SIC conditions
sic_mask = df_frs['naics_or_sic'] == 'SIC'

df_frs.loc[
    sic_mask & (df_frs['naics_or_sic_code'].str[:2] == '02'),
    'undesirable_activity'
] = 'commercial_livestock'

df_frs.loc[
    sic_mask & df_frs['naics_or_sic_code'].str[:2].isin(['10', '12', '13', '14']),
    'undesirable_activity'
] = 'environmental_hazards'

df_frs.loc[
    sic_mask & df_frs['naics_or_sic_code'].str[:2].isin([f"{i:02}" for i in range(20, 40)]),
    'undesirable_activity'
] = 'heavy_or_chemical_manufacturing'

df_frs_filtered = df_frs[df_frs['undesirable_activity'].notna()]

df_frs_filtered = df_frs_filtered.rename(columns={
    'FAC_NAME': 'site_name',
    'FAC_STREET': 'site_address',
    'FAC_CITY': 'site_city',
    'FAC_COUNTY': 'site_county',
    'LATITUDE_MEASURE': 'site_latitude',
    'LONGITUDE_MEASURE': 'site_longitude'
})

df_frs_filtered['data_source'] = 'frs'

df_frs_final = df_frs_filtered[[
    'data_source', 'site_name', 'site_address', 'site_city',
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]


In [12]:
df_final_v3 = pd.concat([df_final_v2, df_frs_final], ignore_index=True)

In [13]:
# Filter and clean df_undes_google_places
df_undes_google_places_clean  = df_undes_google_places.copy()
df_undes_google_places_clean = df_undes_google_places_clean[df_undes_google_places_clean['amenity_key'] != 'commercial_livestock']
df_undes_google_places_clean = df_undes_google_places_clean[df_undes_google_places_clean['business_status'].notna()]

df_undes_google_places_clean['data_source'] = 'google_places'
df_undes_google_places_clean['undesirable_activity'] = df_undes_google_places_clean['amenity_key']

df_undes_google_places_clean = df_undes_google_places_clean.rename(columns={
    'name': 'site_name',
    'vicinity': 'site_address',
    'lat': 'site_latitude',
    'lon': 'site_longitude'
})

df_undes_google_places_clean['site_city'] = None
df_undes_google_places_clean['site_county'] = None

df_undes_google_places_final = df_undes_google_places_clean[[
    'data_source', 'site_name', 'site_address', 'site_city', 
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]

In [14]:
df_final = pd.concat([df_final_v3, df_undes_google_places_final], ignore_index=True)

In [17]:
df_final

Unnamed: 0,data_source,site_name,site_address,site_city,site_county,site_latitude,site_longitude,undesirable_activity
0,hsi,Dow Chemical - Dalton Site,"1467 Prosser Dr., SE",Dalton,Whitfield,34.632778,-84.928056,hazardous_inventory
1,hsi,Shaver's Farm,641 Shaver Road,Chickamauga,Walker,34.798889,-85.307500,hazardous_inventory
2,hsi,G. C. Lee Site - Lee Engr & Const,Lutz Farm Road,Dupont,Clinch,30.988333,-82.896667,hazardous_inventory
3,hsi,Hercules 009 Landfill - NPL Site,Benedict Road and Route 25,Brunswick,Glynn,31.209444,-81.488056,hazardous_inventory
4,hsi,CSX Transportation - Middleton Derailment,Intersection of County Roads 296 & 304,Middleton,Elbert,34.097778,-82.768611,hazardous_inventory
...,...,...,...,...,...,...,...,...
44399,google_places,FD Cigar Company,"108 East South Main Street Suite B, Waxhaw",,,34.924614,-80.743212,excessive_light
44400,google_places,cape fear aero,"7900 Spanish Oaks Drive, Waxhaw",,,34.935563,-80.754885,excessive_noise
44401,google_places,Hawk's Knoll Airport - 2NC1,"2018 Crane Road, Waxhaw",,,34.998760,-80.802543,excessive_noise
44402,google_places,Aero Plantation Airport-NC21,"865 Baron Road, Waxhaw",,,34.991611,-80.748788,excessive_noise


In [None]:
df_final.to_csv("../../data/processed/scoring_indicators/desirable_undesirable_activities/undesirable_hsi_tri_cdr_rcra_frs_google_places.csv", index=False) 

In [18]:
gdf = gpd.GeoDataFrame(df_final,
        geometry=gpd.points_from_xy(df_final.site_longitude, df_final.site_latitude),
        crs="EPSG:4326")    


gdf.to_file("../../data/maps/desirable_undesirable_activities/undesirable_hsi_tri_cdr_rcra_frs_google_places.geojson", driver="GeoJSON")

In [20]:
# Food deserts
df_food_deserts = df_food_deserts.rename(columns={"CensusTract": "GEOID"})

# Convert tract column to an 11‑digit string
df_food_deserts["GEOID"] = (df_food_deserts["GEOID"].astype(str).str.replace(r"\.0$", "", regex=True).str.zfill(11)         )

gdf_tract = gpd.read_file("../../data/raw/shapefiles/tl_2024_13_tract/tl_2024_13_tract.shp")[["GEOID", "geometry"]]

gdf_tract["GEOID"] = gdf_tract["GEOID"].astype(str).str.zfill(11)

merged = gdf_tract.merge(df_food_deserts, on="GEOID", how="inner")

merged.to_file("../../data/maps/desirable_undesirable_activities/food_access_research_atlas.geojson", driver="GeoJSON")