# Collect addresses

In this notebook we're going to collect every Buc-ee's address from [their company directory](https://buc-ees.com/locations/).

In [1]:
# We import requests and BeautifulSoup to scrape addresses from Buc-ee's website
import requests
from bs4 import BeautifulSoup

# For data manipulation and analysis we import pandas
import pandas as pd

# To display data in a table we import itables
from itables import show

We're going to scrape the Buc-ee's directory to get all the locations. Their site is slow and unreliable, so instead of requesting it live we'll parse a copy of the HTML page saved locally at `../data/locations.html`.

In [34]:
# Accumulates one record (name, address, directions link) per location
buc_ees_list = []

# Parse the locally saved copy of the directory page at ../data/locations.html.
# The file is opened in binary mode so BeautifulSoup detects the document's own
# encoding instead of relying on the platform's default text encoding.
with open('../data/locations.html', 'rb') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Find all the divs with the class 'bucees-location'
locations = soup.find_all('div', class_='bucees-location')

# Build one record per location
for location in locations:

    # Find the address. It is in a div with the class 'bucees-location-address'.
    # Note that find() searches all descendants, not only direct children.
    address_div = location.find('div', class_='bucees-location-address')

    # Replace <br /> with a space so adjacent address lines don't run together
    for br in address_div.find_all('br'):
        br.replace_with(' ')

    # Remove text that's in <small> tags
    for small in address_div.find_all('small'):
        small.decompose()

    # Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
    # single space and trim both ends
    address = ' '.join(address_div.get_text().split())

    # Get the href located inside the div tags with a class of bucees-location-directions
    directions = location.find('div', class_='bucees-location-directions').find('a')['href']

    buc_ees_list.append(
        {
            'name': location.find('h4').text,
            'address': address,
            'directions': directions,
        }
    )

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(buc_ees_list)

# Save the directory so later steps can reuse it without re-parsing the HTML
df.to_csv('../output/buc-ees_directory.csv', index=False)

# Display the DataFrame
df

name,address,directions
Loading... (need help?),,
