# Get Supreme Grocers locations

#### Load Python tools and Jupyter config

In [1]:
import json
import re
from urllib.parse import parse_qs, unquote_plus, urljoin, urlsplit

import altair as alt
import geopandas as gpd
import jupyter_black
import pandas as pd
import requests
import us
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
from vega_datasets import data as vega_data

In [2]:
# Notebook-wide setup: format cells with Black and relax the pandas display
# limits so wide or long frames are not truncated.
jupyter_black.load()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Turn off Altair's max-rows check. Kept as the last expression so the
# registry repr is still echoed as the cell output.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Brand identifiers reused for export file names, chart titles and the
# "brand" column.
place = "supreme-grocers"
place_formal = "Supreme Grocers"
color = "#ed1c24"  # fill color for the map symbols

# Run date as a "YYYY-MM-DD" string; stored in the "updated" column.
today = pd.Timestamp.today().date().isoformat()

---

## Scrape

#### Headers for request

In [4]:
# Browser-like User-Agent passed to every requests.get() call in this notebook.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

#### Fetch and parse location map page

In [5]:
# Download the store-locator index page. The timeout keeps the cell from
# hanging forever on a stalled connection (requests has no default timeout),
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently parsing an error page into zero locations.
response = requests.get(
    "https://superiorgrocers.com/locations/", headers=headers, timeout=30
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

#### Make a list of location page urls

In [6]:
# Location pages are linked from anchors with class "choose". href=True skips
# anchors that carry no href attribute, which would otherwise make the
# substring test below raise TypeError on None.
base_url = "https://superiorgrocers.com"
urls = soup.find_all("a", class_="choose", href=True)

# urljoin resolves site-relative hrefs against base_url and leaves absolute
# hrefs untouched, so neither form ends up with a doubled or malformed host.
location_urls = [
    urljoin(base_url, loc_url["href"])
    for loc_url in urls
    if "location" in loc_url["href"]
]

#### Loop through location pages, collecting addresses and coordinates from content

In [7]:
# Patterns are compiled once instead of on every loop iteration.
MAPS_LINK_RE = re.compile(r"google\.com/maps|maps\.google\.com")
LATLON_SCRIPT_RE = re.compile(r"var latlon = \[")
STORE_NAME_RE = re.compile(r"var store = '(.+?)';")
# Tolerates the occasional space after the comma between the two numbers.
LATLON_RE = re.compile(r"var latlon = \[([-\d.]+),\s*([-\d.]+)\];")
MAPS_PLACE_MARKER = "/maps/place/"
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without one


def extract_address_text(href):
    """Return the decoded address carried by a Google Maps link, or None.

    Two link shapes are recognised: ``.../maps/place/<address>`` (address in
    the path) and ``...?daddr=<address>`` (address in the query string). The
    result has URL escapes resolved, i.e. "+" becomes a space and "%2C" a comma.
    """
    if not href:
        return None
    link_parts = urlsplit(href)
    if MAPS_PLACE_MARKER in link_parts.path:
        # Keep only the segment right after /maps/place/ and drop anything
        # that follows it in the path.
        encoded = link_parts.path.split(MAPS_PLACE_MARKER, 1)[1].split("/", 1)[0]
        return unquote_plus(encoded) or None
    daddr_values = parse_qs(link_parts.query).get("daddr")
    return daddr_values[0] if daddr_values else None


def split_address(address_text):
    """Split "street,city,ST zip" text into street/city/state/zip fields.

    Returns a dict with those four keys. Every value is None when the text is
    missing or has fewer than three comma-separated parts; "zip" alone is None
    when the third part holds only one token.
    """
    fields = {"street": None, "city": None, "state": None, "zip": None}
    if not address_text:
        return fields
    pieces = [piece.strip() for piece in address_text.split(",")]
    if len(pieces) < 3:
        return fields
    state_and_zip = pieces[2].split()
    fields["street"] = pieces[0]
    fields["city"] = pieces[1]
    if state_and_zip:
        fields["state"] = state_and_zip[0]
    if len(state_and_zip) > 1:
        fields["zip"] = state_and_zip[1]
    return fields


locations = []
incomplete_urls = []  # pages where the address or the coordinates were not found

for url in tqdm(location_urls):
    # Separate names from the index-page `response`/`soup`, so re-running the
    # earlier cells after this one still sees the index page.
    page_response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.content, "html.parser")

    # The first maps link that yields an address wins; one address per location.
    address_text = None
    for link in page_soup.find_all("a", href=MAPS_LINK_RE):
        address_text = extract_address_text(link["href"])
        if address_text:
            break

    # Store name and coordinates live in an inline <script> on the page.
    store_name = None
    latitude = longitude = None
    script_tag = page_soup.find("script", string=LATLON_SCRIPT_RE)
    if script_tag:
        store_match = STORE_NAME_RE.search(script_tag.string)
        if store_match:
            store_name = store_match.group(1)

        latlon_match = LATLON_RE.search(script_tag.string)
        if latlon_match:
            try:
                latitude, longitude = (float(value) for value in latlon_match.groups())
            except ValueError:
                # The character class also matches strings such as "-" or "1.2.3".
                latitude = longitude = None

    address_fields = split_address(address_text)
    if address_fields["street"] is None or latitude is None:
        # Keep the row (with missing values) so the row count still matches the
        # site, but remember the page so it can be reviewed.
        incomplete_urls.append(url)

    locations.append(
        {
            "location_name": store_name,
            **address_fields,
            "latitude": latitude,
            "longitude": longitude,
            "url": url,
        }
    )

if incomplete_urls:
    print(f"{len(incomplete_urls)} location page(s) missing an address or coordinates:")
    print("\n".join(incomplete_urls))

  0%|          | 0/73 [00:00<?, ?it/s]

In [8]:
# One row per scraped location; columns come from the keys of each record.
df = pd.DataFrame(data=locations)

In [9]:
# Turn the URL-style "+" separators into spaces. regex=False makes pandas treat
# "+" as a literal character on every version instead of relying on the
# version-dependent default of `regex` ("+" alone is a regex quantifier, not a
# valid pattern).
df["street"] = df["street"].str.replace("+", " ", regex=False)
df["city"] = df["city"].str.replace("+", " ", regex=False)

#### Create a mapping of state abbreviations to full state names using the us library

In [10]:
# Lookup from state abbreviation to full state name, built from the `us`
# package's STATES list.
state_mapping = dict((state.abbr, state.name) for state in us.states.STATES)

#### New column of full state names based on abbreviations

In [11]:
# Abbreviations missing from state_mapping come out as NaN.
df = df.assign(state_name=df["state"].map(state_mapping))

#### Make sure our brand name gets in the dataframe

In [12]:
# Same brand label on every row.
df = df.assign(brand=place_formal)

#### Add fetch date

In [13]:
# Stamp every row with the date string computed at the top of the notebook.
df = df.assign(updated=today)

#### How many locations?

In [14]:
# Number of rows, i.e. scraped locations.
df.shape[0]

73

#### The result

In [15]:
# First five rows.
df.head(n=5)

Unnamed: 0,location_name,street,city,state,zip,latitude,longitude,url,state_name,brand,updated
0,Las Vegas #701,1955 N. Nellis Blvd.,Las Vegas,NV,89115,36.194967,-115.063645,https://superiorgrocers.com/location/lv-nellis,Nevada,Supreme Grocers,2024-03-31
1,Bakersfield #134,2100 White Ln.,Bakersfield,CA,93304,35.317725,-119.026632,https://superiorgrocers.com/location/bakersfield,California,Supreme Grocers,2024-03-31
2,Bakersfield #147,1115 Union Ave,Bakersfield,CA,93304,35.366892,-119.003561,https://superiorgrocers.com/location/bakersfield-union-ave,California,Supreme Grocers,2024-03-31
3,Baldwin Park #128,14433 Ramona Blvd.,Baldwin Park,CA,91706,34.088487,-117.958922,https://superiorgrocers.com/location/baldwin-park,California,Supreme Grocers,2024-03-31
4,Bellflower #140,16100 Lakewood Blvd,Bellflower,CA,90706,33.888283,-118.142638,https://superiorgrocers.com/location/bellflower,California,Supreme Grocers,2024-03-31


In [16]:
# Last five rows.
df.tail(n=5)

Unnamed: 0,location_name,street,city,state,zip,latitude,longitude,url,state_name,brand,updated
68,Dinuba #150,2150 E El Monte Way,Dinuba,CA,93618,36.544004,-119.371436,https://superiorgrocers.com/location/dinuba,California,Supreme Grocers,2024-03-31
69,Tulare #151,115 S West St,Tulare,CA,93274,36.205881,-119.369231,https://superiorgrocers.com/location/tulare,California,Supreme Grocers,2024-03-31
70,Oxnard #138,2401 Saviers Road,Oxnard,CA,93033,34.176216,-119.177687,https://superiorgrocers.com/location/oxnard,California,Supreme Grocers,2024-03-31
71,Oxnard #323,1111 E. Channel Islands Blvd.,Oxnard,CA,93033,34.17346,-119.16528,https://superiorgrocers.com/location/channel-islands,California,Supreme Grocers,2024-03-31
72,Santa Paula #202,280 East Harvard Blvd.,Santa Paula,CA,93060,34.347052,-119.068336,https://superiorgrocers.com/location/santa-paula,California,Supreme Grocers,2024-03-31


---

## Geography

#### Make it a geodataframe

In [17]:
# Work on an independent copy so the tabular frame used for the JSON/CSV
# exports does not pick up the geometry column.
df_geo = df.copy(deep=True)

In [18]:
# Build point geometries from the scraped coordinates (x = longitude,
# y = latitude) and declare them as EPSG:4326. The CRS is spelled as the
# authority string "EPSG:4326", the form documented by geopandas/pyproj,
# rather than the bare numeric string "4326".
gdf = gpd.GeoDataFrame(
    df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude)
).set_crs("EPSG:4326")

---

## Maps

#### US states background

In [19]:
# Base layer: US state shapes from the vega_datasets us_10m TopoJSON, filled
# light gray and drawn with the albersUsa projection.
us_states = alt.topo_feature(vega_data.us_10m.url, feature="states")
background = alt.Chart(us_states).mark_geoshape(fill="#e9e9e9", stroke="white")
background = background.properties(
    width=800, height=500, title=f"{place_formal} locations"
)
background = background.project("albersUsa")

#### Location points map

In [20]:
# One small dot per location, positioned by its longitude/latitude columns.
point_encoding = {
    "longitude": "longitude:Q",
    "latitude": "latitude:Q",
}
points = alt.Chart(gdf).mark_circle(size=5, color=color).encode(**point_encoding)

# Overlay the dots on the state shapes. The last expression renders the chart
# with the view border removed.
point_map = alt.layer(background, points)
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [21]:
# Collapse the locations to one row per state: the symbol sits at the mean of
# that state's coordinates and is sized by the number of locations.
state_totals = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

symbol_title = f"Number of {place_formal} in US, by average lon/lat of locations"
symbols = (
    state_totals.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(title=symbol_title)
)

# Overlay on the state shapes. The last expression renders the chart with the
# view border removed.
symbol_map = alt.layer(background, symbols)
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [22]:
# Write the table as an indented JSON array with one object per location.
json_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV

In [23]:
# Write the table as CSV without the integer row index.
csv_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [24]:
# Write the point layer (attributes plus geometry) with the GeoJSON driver.
geojson_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
gdf.to_file(geojson_path, driver="GeoJSON")