In [10]:
import pandas as pd
import geopandas as gpd
from geopy.geocoders import Nominatim
import time
import requests
import csv
import os

In [11]:
# CDR (Chemical Data Reporting) identifies where chemicals are handled, but not whether or not they are hazardous
# Using this dataset to help identify chemical and manufacturing activities in Georgia (note: only includes manufacturers that handle chemicals)
df_cdr_industrial = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/cdr/2020_cdr_industrial_processing_and_use_information.csv", encoding="ISO-8859-1")

In [24]:
# FRS (Facility Registry System) contains information on facilities and their industries 
# Use to further identify chemical and manufacturing activities in Georgia
df_frs_fac = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_FACILITIES_GA.csv")
df_frs_naics = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_NAICS_CODES.csv")
df_frs_sic = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_SIC_CODES.csv")
df_frs_program = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_PROGRAM_LINKS_GA.csv")


  df_frs_fac = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_FACILITIES_GA.csv")
  df_frs_naics = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_NAICS_CODES.csv")
  df_frs_sic = pd.read_csv("../../data/raw/scoring_indicators/desirable_undesirable_activities/frs/FRS_SIC_CODES.csv")


### Handling FRS data

In [25]:
# Making sure the REGISTRY_ID is a string for merging
df_frs_fac['REGISTRY_ID_STR'] = df_frs_fac['REGISTRY_ID'].astype(str)
df_frs_naics['REGISTRY_ID_STR'] = df_frs_naics['REGISTRY_ID'].astype(str)
df_frs_sic['REGISTRY_ID_STR'] = df_frs_sic['REGISTRY_ID'].astype(str)

In [28]:
# Merging datasets 
df_frs_facilities = pd.merge(df_frs_fac, df_frs_naics, left_on='REGISTRY_ID_STR', right_on='REGISTRY_ID_STR', how='left')
df_frs_facilities_final = pd.merge(df_frs_facilities, df_frs_sic, left_on='REGISTRY_ID_STR', right_on='REGISTRY_ID_STR', how='left')

In [29]:
# Dropping unmatched columns
df_frs_facilities_final = df_frs_facilities_final.dropna(subset=['REGISTRY_ID_y', 'REGISTRY_ID'], how='all')

In [30]:
# Checking for duplicates
print(df_frs_facilities_final.duplicated().sum())

0


In [31]:
# Dropping unnecessary columns
df_frs_facilities_final = df_frs_facilities_final.drop(['PGM_SYS_ID_x', 'PGM_SYS_ACNRM_x', 'REGISTRY_ID_y', 'PGM_SYS_ID_y', 'PGM_SYS_ACNRM_y', 'REGISTRY_ID'], axis=1)

In [32]:
# Replacing NAICS codes that are NaN or start with '99' with the SIC code
def choose_code(row):
    if pd.isna(row['NAICS_CODE']) or str(row['NAICS_CODE']).startswith('99'):
        sic_clean = str(int(float(row['SIC_CODE']))) if pd.notna(row['SIC_CODE']) else None
        return sic_clean, 'SIC'
    else:
        return str(row['NAICS_CODE']), 'NAICS'

# Apply the logic and unpack the results into two new columns
df_frs_facilities_final[['naics_or_sic_code', 'naics_or_sic']] = df_frs_facilities_final.apply(choose_code, axis=1, result_type='expand')

In [33]:
df_frs_facilities_final

Unnamed: 0,FAC_NAME,FAC_STREET,FAC_CITY,FAC_STATE,FAC_ZIP,REGISTRY_ID_x,FAC_COUNTY,FAC_EPA_REGION,LATITUDE_MEASURE,LONGITUDE_MEASURE,REGISTRY_ID_STR,NAICS_CODE,SIC_CODE,naics_or_sic_code,naics_or_sic
0,"DSM NUTRITIONAL PRODUCTS, INC.",495 JACKSON CON,PENDERGRASS,GA,30567,110070516892,JACKSON COUNTY,4.0,34.154310,-83.668110,110070516892,,2048.0,2048,SIC
1,WESTFORK DRUM DUMP,242 WESTFORK COURT,LITHIA SPRINGS,GA,30122,110030826104,DOUGLAS,4.0,33.795410,-84.632410,110030826104,236210,,236210,NAICS
2,SAWNEE EMC/GAINES FOR,6305 SNELLING MILL RD,FLOWERY BRANCH,GA,30542,110038606461,HALL,4.0,34.171270,-83.961800,110038606461,999999,4911.0,4911,SIC
3,ELLIJAY QUARRY,131 ROCK QUARRY ROAD,ELLIJAY,GA,30536,110054894478,GILMER,4.0,34.741000,-84.416722,110054894478,212313,1423.0,212313,NAICS
4,ELLIJAY QUARRY,131 ROCK QUARRY ROAD,ELLIJAY,GA,30536,110054894478,GILMER,4.0,34.741000,-84.416722,110054894478,212313,1423.0,212313,NAICS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176846,NORTHSHORE,LOT 7,NEWNAN,GA,30265.0,110071371562,COWETA COUNTY,4.0,33.475149,-84.771726,110071371562,236115,,236115,NAICS
176847,WM ? ATLANTA RECYCLING FACILITY,5670 SHIRLEE INDUSTRIAL WAY N,ALPHARETTA,GA,30004.0,110071407898,FORSYTH COUNTY,4.0,34.147800,-84.257600,110071407898,,4212.0,4212,SIC
176848,WM ? ATLANTA RECYCLING FACILITY,5670 SHIRLEE INDUSTRIAL WAY N,ALPHARETTA,GA,30004.0,110071407898,FORSYTH COUNTY,4.0,34.147800,-84.257600,110071407898,,5093.0,5093,SIC
176849,LE22028 QUAIL CHASE 7,TUPELO DRIVE,LEESBURG,GA,31763.0,110071370889,LEE COUNTY,4.0,31.661427,-84.267431,110071370889,236115,,236115,NAICS


In [34]:
# Saving data to csv
df_frs_facilities_final.to_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/frs_facilities_naics_sic.csv", index=False)

### Handling CDR data

In [37]:
# Filtering to only needed columns 
df_cdr = df_cdr_industrial[['SITE NAME', 'SITE ADDRESS LINE1', 'SITE CITY', 'SITE COUNTY / PARISH', 'SITE POSTAL CODE', 'SITE STATE', 'SITE LATITUDE', 'SITE LONGITUDE', 'SITE NAICS CODE 1']].drop_duplicates()

In [38]:
# Filling in missing latitude and longitude points for df_cdr
geolocator = Nominatim(user_agent="geo_lookup")

# Function to get latitude and longitude
def get_lat_long(row):
    if (row['SITE LATITUDE'] == 0 and row['SITE LONGITUDE'] == 0) or pd.isna(row['SITE LATITUDE']) or pd.isna(row['SITE LONGITUDE']):
        address = f"{row['SITE ADDRESS LINE1']}, {row['SITE CITY']}, {row['SITE STATE']} {row['SITE POSTAL CODE']}, {row['SITE COUNTY / PARISH']}"
        try:
            location = geolocator.geocode(address, timeout=10)
            if location:
                return pd.Series([location.latitude, location.longitude])
        except Exception as e:
            print(f"Error geocoding {address}: {e}")
    return pd.Series([row['SITE LATITUDE'], row['SITE LONGITUDE']])

In [39]:
df_cdr[['SITE LATITUDE', 'SITE LONGITUDE']] = df_cdr.apply(get_lat_long, axis=1)


In [None]:
# Saving data to csv
df_cdr.to_csv("../../data/preprocessed/scoring_indicators/desirable_undesirable_activities/cdr_industrial_manufacturing_facilities.csv", index=False)

### Google Maps API 

In [None]:
GOOGLE_API_KEY = ''

In [21]:
UNDESIRABLE_ACTIVITIES = {
    "auto_repair_station": {"google_type": "car_repair"},
    # "commercial_livestock": {"google_type": "farm"},
    "excessive_light": {"google_type": ["casino", "stadium", 'night_club']},
    "excessive_noise": {'google_type': ['airport']}, 
}

In [14]:
# Approximate Georgia box bounds 
LAT_MIN, LAT_MAX = 30.33, 35.00
LON_MIN, LON_MAX = -85.60, -80.75

STEP_LAT = 0.2
STEP_LON = 0.2

SEARCH_RADIUS = 10000

In [None]:
# Function makes a single API call to the Google Places Nearby Search endpoint 

def google_places_nearby_search(lat, lon, place_type, radius_m=SEARCH_RADIUS, page_token=None):
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius_m,
        "type": place_type,
        "key": GOOGLE_API_KEY
    }
    if page_token:
        params["pagetoken"] = page_token

    # parsinbg json response and returning it
    resp = requests.get(url, params=params).json()
    return resp

In [None]:
# Function handles multiple API calls to collect all available results for a grid point, managing pagination if the Google API returns a next_page_token 

def fetch_places_with_pagination(lat, lon, place_type, radius_m=SEARCH_RADIUS):
    """
    Calls google_places_nearby_search repeatedly to handle next_page_token,
    collecting up to 60 results. Returns a list of place dicts.
    """
    all_results = []
    page_token = None
    first_pass = True

    # Looping to repeatedly call google_places_nearby_search for each page (if multiple pages)
    while True:
        if page_token and not first_pass:
            time.sleep(2) 
        else:
            first_pass = False

        resp = google_places_nearby_search(lat, lon, place_type, radius_m, page_token)
        
        # Debugging: see the status field
        status = resp.get("status", "")
        results = resp.get("results", [])
        print(f"status={status}, lat={lat}, lon={lon}, type={place_type}, got {len(results)} results")

        all_results.extend(results)

        page_token = resp.get("next_page_token")
        if not page_token:
            break

    return all_results

In [None]:
# Function that uses steps (0.2° = ~22.2km stepping) + ~10km radius (20km diameter)
# Write results to CSV incrementally after processing each amenity at each grid point.

def rough_bulk_crawl_georgia():
    # Creating output directory
    output_dir = "../../data/preprocessed/scoring_indicators/desirable_undesirable_activities"
    os.makedirs(output_dir, exist_ok=True)
    # File to save results
    csv_file = os.path.join(output_dir, "ga_undesirable_rough.csv")
    # File to track grid boxes and google_types that have been processed
    checkpoint_file = os.path.join(output_dir, "checkpoint_undesirable_activities.csv")

    
    # Setting up the CSV file with headers
    fieldnames = [
        "amenity_key", "google_type", "place_id", "name",
        "lat", "lon", "types", "vicinity", "business_status"
    ]

    # Checking if the results file already exists and write header only if it doesn't
    if not os.path.exists(csv_file):
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

    # Setting up the checkpoint file and load already processed cells
    processed_checkpoints = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r", newline="", encoding="utf-8") as cp:
            reader = csv.reader(cp)
            header = next(reader, None)
            for row in reader:
                if len(row) >= 3:
                    processed_checkpoints.add((row[0], row[1], row[2]))
    else:
        # If checkpoint file does not exist create one with a header
        with open(checkpoint_file, "w", newline="", encoding="utf-8") as cp:
            writer = csv.writer(cp)
            writer.writerow(["lat", "lon", "google_type"])


    
    # Tracking place_ids already processed to avoid duplicates
    processed_place_ids = set()

    
    lat = LAT_MIN
    while lat <= LAT_MAX + 1e-9:
        lon = LON_MIN
        while lon <= LON_MAX + 1e-9:
            
            lat_str = f"{lat:.3f}"
            lon_str = f"{lon:.3f}"
            print(f"Grid point lat={lat:.3f}, lon={lon:.3f}")
            
            # For each amenity in undesirable dictionary
            for amenity_key, config in UNDESIRABLE_ACTIVITIES.items():
                gtypes = config.get("google_type", [])
                if isinstance(gtypes, str):
                    gtypes = [gtypes]
                
                # For each google_type do a search
                for gtype in gtypes:
                    checkpoint_key = (lat_str, lon_str, gtype)
                    if checkpoint_key in processed_checkpoints:
                        print(f"Skipping {checkpoint_key} as it was already processed.")
                        continue
                    try:
                        raw_places = fetch_places_with_pagination(lat, lon, gtype, SEARCH_RADIUS)
                    
                        
                        # Preparing rows for writing
                        rows_to_write = []
                        for p in raw_places:
                            place_id = p.get("place_id", "")
                            
                            if place_id in processed_place_ids:
                                continue
                                
                            # Addding to processed set
                            processed_place_ids.add(place_id)
                            
                            # Creating row dict
                            rows_to_write.append({
                                "amenity_key": amenity_key,
                                "google_type": gtype,
                                "place_id": place_id,
                                "name": p.get("name", ""),
                                "lat": p.get("geometry", {}).get("location", {}).get("lat"),
                                "lon": p.get("geometry", {}).get("location", {}).get("lng"),
                                "types": "|".join(p.get("types", [])),
                                "vicinity": p.get("vicinity", ""),
                                "business_status": p.get("business_status", "")
                            })
                        
                        # Appending to CSV if we have rows to write
                        if rows_to_write:
                            with open(csv_file, "a", newline="", encoding="utf-8") as f:
                                writer = csv.DictWriter(f, fieldnames=fieldnames)
                                for row in rows_to_write:
                                    writer.writerow(row)
                            
                            print(f"Appended {len(rows_to_write)} places for {amenity_key}/{gtype} at lat={lat_str}, lon={lon_str}")
                        
                         # After processing logging the grid cell and google_type to checkpont file
                        with open(checkpoint_file, "a", newline="", encoding="utf-8") as cp:
                            cp_writer = csv.writer(cp)
                            cp_writer.writerow([lat_str, lon_str, gtype])
                        # Adding to the in memory set
                        processed_checkpoints.add(checkpoint_key)
                                           

                    except Exception as e:
                        print(f"Error fetching lat={lat_str}, lon={lon_str}, type={gtype}: {e}")
            
            lon += STEP_LON
        lat += STEP_LAT
    
    print(f"Finished crawling. Processed {len(processed_place_ids)} unique places total.")
    print(f"Data saved to {csv_file}")

In [22]:
if __name__ == "__main__":
    rough_bulk_crawl_georgia()
    print("Done with rough coverage of Georgia.")

Grid point lat=30.330, lon=-85.600
Skipping ('30.330', '-85.600', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.600', 'casino') as it was already processed.
Skipping ('30.330', '-85.600', 'stadium') as it was already processed.
Skipping ('30.330', '-85.600', 'night_club') as it was already processed.
Skipping ('30.330', '-85.600', 'airport') as it was already processed.
Grid point lat=30.330, lon=-85.400
Skipping ('30.330', '-85.400', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.400', 'casino') as it was already processed.
Skipping ('30.330', '-85.400', 'stadium') as it was already processed.
Skipping ('30.330', '-85.400', 'night_club') as it was already processed.
Skipping ('30.330', '-85.400', 'airport') as it was already processed.
Grid point lat=30.330, lon=-85.200
Skipping ('30.330', '-85.200', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.200', 'casino') as it was already processed.
Skipping ('30.330', '-85.200', 