In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Obtain the Zip Codes for each requested state
def get_zipcode_df(st_abbrev, timeout=30):
    """
    Scrape the ZIP code listing of one state from unitedstateszipcodes.org.

    Parameters:
    - st_abbrev: Abbreviation of the state (e.g., 'ca' for California).
      Letter case and surrounding whitespace are ignored.
    - timeout: Seconds to wait for the server before `requests` gives up
      and raises (default 30). Without a timeout a stalled connection
      would block the notebook forever.

    Returns:
    - DataFrame with the columns "Zip Code", "Type", "Cities", "County"
      and "Area Codes" (one row per listing entry; all values are strings),
      or None when the server answers with a status code other than 200.

    Raises:
    - Whatever `requests.get` raises for network-level failures
      (connection errors, timeouts); these are not caught here.
    """

    st_abbrev = st_abbrev.lower().strip()

    # Webpage URL listing the ZIP codes of the requested state
    source = f"https://www.unitedstateszipcodes.org/{st_abbrev}/"

    # Important Note: Set up headers to mimic a browser request
    # Does not work otherwise - Page blocks the request
    # Learned of this from:
    # https://zenscrape.com/5-approaches-to-write-unblock-able-data-scraping-in-python/
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": "https://www.google.com/",
        "Upgrade-Insecure-Requests": "1"
    }

    # Make a request to the webpage
    response = requests.get(source, headers=headers, timeout=timeout)

    # Anything other than 200 is reported and signalled to the caller as None
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all the relevant div elements with the class 'list-group-item'
    items = soup.find_all("div", class_="list-group-item")

    # Output column -> CSS class of the cell holding its plain-text value
    text_columns = {
        "Zip Code": "prefix-col1",
        "Type": "prefix-col2",
        "Cities": "prefix-col3",
        "County": "prefix-col4",
    }

    # List to store the extracted data
    data = []

    # Iterate through each item and extract the required information
    for item in items:
        row = item.find("div", class_="row")
        if row is None:
            # Not a ZIP code entry (no inner row) - nothing to extract
            continue

        cells = {
            column: row.find("div", class_=css_class)
            for column, css_class in text_columns.items()
        }
        area_cell = row.find("div", class_="prefix-col5")
        if area_cell is None or any(cell is None for cell in cells.values()):
            # Skip entries that do not carry all five expected cells
            continue

        record = {column: cell.text.strip() for column, cell in cells.items()}
        record["Area Codes"] = ", ".join(
            link.text.strip() for link in area_cell.find_all("a")
        )
        data.append(record)

    # Fix the column order explicitly so that an empty result still has
    # the expected columns
    zipcode_df = pd.DataFrame(data, columns=[*text_columns, "Area Codes"])

    print("Shape:", zipcode_df.shape)
    display(zipcode_df.head())

    return zipcode_df

In [3]:
# Save the DataFrame to a CSV file
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    Nothing is written (and an existing file is left untouched) when `df`
    is None or when the parent directory of `file_path` does not exist;
    in both cases a message is printed and the function returns.

    Parameters:
    - df: DataFrame to save (written without its index)
    - file_path: Path to save the CSV file (`pathlib.Path` or plain string)
    """

    # A failed scrape yields None; refuse early so an existing file survives
    if df is None:
        print("Error: No DataFrame to save.")
        return

    # Accept plain strings as well as Path objects
    file_path = Path(file_path)

    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # No explicit delete: `to_csv` opens the target in write mode and
    # replaces its content. Removing the file first would lose the old
    # data if the write could not even start.
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [4]:
# States for which ZIP codes are to be extracted
states = ["ca", "tx", "fl"]

# Directory that receives one `<state>_zipcodes.csv` file per state
output_dir = Path("../../../../data/processed_data")

# Iterate through each state and extract the ZIP codes
for state in states:
    df = get_zipcode_df(state)

    # get_zipcode_df returns None when the page could not be retrieved; an
    # empty frame means nothing was parsed. Either way, keep whatever CSV
    # already exists instead of overwriting it.
    if df is None or df.empty:
        print(f"Skipping `{state}`: no ZIP code data retrieved.")
        continue

    file_name = f"{state}_zipcodes.csv"
    file_path = output_dir / file_name
    save_csv_file(df, file_path)

Shape: (2655, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,90001,Standard,"Los Angeles, Firestone Park, Firestone Pk",Los Angeles County,"Area Code 213, Area Code 323"
1,90002,Standard,"Los Angeles, Watts",Los Angeles County,"Area Code 213, Area Code 310, Area Code 323, A..."
2,90003,Standard,Los Angeles,Los Angeles County,"Area Code 213, Area Code 323"
3,90004,Standard,"Los Angeles, Oakwood",Los Angeles County,"Area Code 213, Area Code 323"
4,90005,Standard,"Los Angeles, Sanford",Los Angeles County,"Area Code 213, Area Code 323"


File `ca_zipcodes.csv` already exists. Overwriting file.
File saved as `ca_zipcodes.csv`
Shape: (2661, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,73301,Unique,Austin,Travis County,Area Code 512
1,73344,Unique,Austin,Travis County,Area Code 512
2,73960,PO Box,Texhoma,Sherman County,Area Code 806
3,75001,Standard,Addison,Dallas County,"Area Code 214, Area Code 469, Area Code 945, A..."
4,75002,Standard,"Allen, Lucas, Parker",Collin County,"Area Code 214, Area Code 469, Area Code 945, A..."


File `tx_zipcodes.csv` already exists. Overwriting file.
File saved as `tx_zipcodes.csv`
Shape: (1495, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,32003,Standard,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
1,32004,PO Box,"Ponte Vedra Beach, Ponte Vedra",St. Johns County,Area Code 904
2,32006,PO Box,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
3,32007,PO Box,Bostwick,Putnam County,Area Code 386
4,32008,Standard,Branford,Suwannee County,"Area Code 352, Area Code 386"


File `fl_zipcodes.csv` already exists. Overwriting file.
File saved as `fl_zipcodes.csv`
