# Get Piggly Wiggly locations

#### Load Python tools and Jupyter config

In [1]:
import re
import us 
import json
import black
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [3]:
place = "piggly-wiggly"
place_formal = "Piggly Wiggly"
color = "#d72832"
latitude = "39.106667"
longitude = "-94.676392"

---

## Prepare

#### Directory pages for states, [like Alabama](https://www.pigglywiggly.com/store-locations/alabama), have inline JSON to feed a map. So, which states have Piggly Wiggly stores?

In [4]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

In [5]:
response = requests.get(
    "https://www.pigglywiggly.com/store-locations/", headers=headers
)
soup = BeautifulSoup(response.text, "html.parser")

#### Find all `a` tags containing the specific string

In [6]:
specific_links = soup.find_all(
    "a",
    href=lambda href: href and "https://www.pigglywiggly.com/store-locations/" in href,
)

#### Use a set to store unique URLs

In [7]:
unique_urls = set()

#### Add the unique URLs to the set

In [8]:
for link in specific_links:
    unique_urls.add(link.get("href"))
    # unique_urls.remove("https://www.pigglywiggly.com/store-locations/")

In [9]:
unique_urls = list(unique_urls)[1:]
unique_urls

['https://www.pigglywiggly.com/store-locations/illinois/',
 'https://www.pigglywiggly.com/store-locations/arkansas/',
 'https://www.pigglywiggly.com/store-locations/virginia/',
 'https://www.pigglywiggly.com/store-locations/west-virginia/',
 'https://www.pigglywiggly.com/store-locations/kentucky/',
 'https://www.pigglywiggly.com/store-locations/',
 'https://www.pigglywiggly.com/store-locations/louisiana/',
 'https://www.pigglywiggly.com/store-locations/south-carolina/',
 'https://www.pigglywiggly.com/store-locations/georgia/',
 'https://www.pigglywiggly.com/store-locations/ohio/',
 'https://www.pigglywiggly.com/store-locations/florida/',
 'https://www.pigglywiggly.com/store-locations/texas/',
 'https://www.pigglywiggly.com/store-locations/tennessee/',
 'https://www.pigglywiggly.com/store-locations/oklahoma/',
 'https://www.pigglywiggly.com/store-locations/wisconsin/',
 'https://www.pigglywiggly.com/store-locations/new-york/',
 'https://www.pigglywiggly.com/store-locations/mississippi/'

---

## Scrape

#### Loop through state URLs and extract inline script

In [10]:
for url in unique_urls:
    state_response = requests.get(url)
    state_soup = BeautifulSoup(state_response.text, "html.parser")
    inline_script = state_soup.find("script")
    script_tags = soup.find_all("script")

In [11]:
unique_urls

['https://www.pigglywiggly.com/store-locations/illinois/',
 'https://www.pigglywiggly.com/store-locations/arkansas/',
 'https://www.pigglywiggly.com/store-locations/virginia/',
 'https://www.pigglywiggly.com/store-locations/west-virginia/',
 'https://www.pigglywiggly.com/store-locations/kentucky/',
 'https://www.pigglywiggly.com/store-locations/',
 'https://www.pigglywiggly.com/store-locations/louisiana/',
 'https://www.pigglywiggly.com/store-locations/south-carolina/',
 'https://www.pigglywiggly.com/store-locations/georgia/',
 'https://www.pigglywiggly.com/store-locations/ohio/',
 'https://www.pigglywiggly.com/store-locations/florida/',
 'https://www.pigglywiggly.com/store-locations/texas/',
 'https://www.pigglywiggly.com/store-locations/tennessee/',
 'https://www.pigglywiggly.com/store-locations/oklahoma/',
 'https://www.pigglywiggly.com/store-locations/wisconsin/',
 'https://www.pigglywiggly.com/store-locations/new-york/',
 'https://www.pigglywiggly.com/store-locations/mississippi/'

In [12]:
# Iterate through each <script> tag to find the one containing specific content

In [13]:
# Iterate through each <script> tag to find the one containing specific content
for script_tag in script_tags:
    if script_tag.string:  # Check if string attribute exists
        if "var locations" in script_tag.string:
            # Extract the content of the script tag
            script_content = script_tag.string

            # Here you can further process or extract the desired data from the script_content
            # For example, you can parse the JSON data contained within the script

            # Print or do something with the script content
            print(script_content)

In [14]:
# Extracting JSON data from inline script
json_str = re.search(r'var locations = (\[.*?\]);', inline_script, re.DOTALL).group(1)

# Loading JSON data into a Python dictionary
locations_data = json.loads(json_str)

# Converting dictionary to DataFrame
df = pd.DataFrame(locations_data)

# Displaying the DataFrame
print(df)

AttributeError: 'NoneType' object has no attribute 'group'

#### Create a mapping of state abbreviations to full state names using the us library

In [None]:
state_mapping = {state.abbr: state.name for state in us.states.STATES}

#### New column of full state names based on abbreviations

In [None]:
df["state_name"] = df["state"].map(state_mapping)

---

## Geography

#### Make it a geodataframe

In [None]:
df_geo = df.copy()

In [None]:
gdf = gpd.GeoDataFrame(
    df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude)
)

---

## Maps

#### US states background

In [None]:
background = (
    alt.Chart(alt.topo_feature(data.us_10m.url, feature="states"))
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .properties(width=800, height=500, title=f"{place_formal} locations")
    .project("albersUsa")
)

#### Location points map

In [None]:
points = (
    alt.Chart(gdf)
    .mark_circle(size=5, color=color)
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
    )
)

point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [None]:
symbols = (
    alt.Chart(gdf)
    .transform_aggregate(
        latitude="mean(latitude)",
        longitude="mean(longitude)",
        count="count()",
        groupby=["state"],
    )
    .mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [None]:
df.to_json(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.json",
    indent=4,
    orient="records",
)

#### CSV

In [None]:
df.to_csv(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.csv", index=False
)

#### GeoJSON

In [None]:
gdf.to_file(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson",
    driver="GeoJSON",
)