In [1]:
'''
This script is a web scraper used to assist in cleaning and standardizing city and zip code data. The URL is a 
direct postal reference for zip codes in Allegheny County, Pennsylvania, and the towns/cities associated
with each zip code. 

I was able to merge this script's resulting dataframe with the data I had collected to standardize city names 
(ex: East Pittsburgh and E Pittsburgh should both be under East Pittsburgh, some values show Bradford Woods 
while others show Bradfordwoods, etc.) 
'''

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Scraped website: postal reference page for Allegheny County zip codes
url = 'https://www.zipdatamaps.com/en/us/zip-maps/pa/county/borders/allegheny-county-zip-code-map'

# HTTP GET request, stored in 'page'. requests applies no timeout by default,
# so one is set explicitly to keep the script from hanging on a stalled server.
page = requests.get(url, timeout=30)

# Stop on a 4xx/5xx response rather than parsing an error page as if it were data
page.raise_for_status()

# Beautifulsoup object to parse the HTML content
soup = BeautifulSoup(page.text, 'html.parser')

# HTML table with the data I'm looking for
table = soup.find('table', class_='table')

# soup.find returns None when nothing matches; report that clearly instead of
# failing later with an AttributeError on table.find_all
if table is None:
    raise ValueError(
        f"No <table class='table'> element found at {url}; the page layout may have changed"
    )

# List to store extracted data
data = []

# Pulling rows ('tr')
column_data = table.find_all('tr')

# Walk every table row and keep only the ones shaped like a data row
for row in column_data:
    # All 'td' cells found in the current row
    cells = row.find_all('td')

    # The rows on this page contain empty cells, so a usable data row has
    # exactly five 'td' cells; rows with any other cell count are skipped.
    if len(cells) != 5:
        continue

    # The zip code sits in the 1st cell and the city name in the 5th;
    # surrounding whitespace is stripped from both.
    zip_code = cells[0].text.strip()
    city_name = cells[4].text.strip()

    # Record the pair as one [zip, city] entry in the 'data' list
    data.append([zip_code, city_name])

# Column headers for the two values collected per row
column_names = ["Zip Code", "City Name"]

# Build the dataframe from the collected [zip, city] pairs
df = pd.DataFrame(data=data, columns=column_names)

# Destination of the CSV export (written without the index column)
output_path = 'C:/Users/ryanm/Desktop/Projects/3/Datasets/zip_city_check.csv'
df.to_csv(output_path, index=False)

# Simple completion marker
print("Scraped")

Scraped
