# Get Shipley Do-nuts locations

#### Load Python tools and Jupyter config

In [1]:
import us 
import json
import black
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
# Enable jupyter_black for this notebook session.
jupyter_black.load()

# Raise pandas' display limits so wide frames and long cell values are not
# truncated when a dataframe is shown.
display_options = pd.options.display
display_options.max_columns = 100
display_options.max_rows = 1000
display_options.max_colwidth = None  # None = no limit on column width

In [3]:
# Brand settings reused by the maps and the export file names below.
place_formal = "Shipley Do-nuts"  # display name: chart titles and `brand` column
place = "shipleys"  # slug used to build the export file names
color = "#bf201e"  # hex color of the map marks

---

## Scrape

#### Headers for request

In [4]:
# User-agent string sent with the request (a desktop Chrome identifier).
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
headers = {"user-agent": user_agent}

In [5]:
# Locations page requested by the scrape below.
url = "https://shipleydonuts.com/locations/"

#### Read the page contents

In [6]:
# Fetch the locations page. The timeout keeps a stalled connection from hanging
# the notebook indefinitely, and raise_for_status() stops the run on an HTTP
# error instead of parsing an error page as if it held the store data.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the returned HTML so its inline <script> tags can be searched.
soup = BeautifulSoup(response.text, "html.parser")

#### Fortunately, all the location info is in an inline script. Let's parse it out ...

In [7]:
# The store data is assigned to a JavaScript variable inside an inline
# <script>; `find` returns the first script whose text contains the assignment.
marker = "var locations_meta ="
script_tag = soup.find("script", string=lambda t: t and marker in t)

if script_tag is None:
    # Stop here with a clear message: printing and carrying on would only
    # resurface later as a NameError on `script_content`.
    raise ValueError("Script not found.")

# Everything after the assignment marker; raw_decode() does not skip leading
# whitespace, so strip it before decoding.
assigned_text = script_tag.string.split(marker, 1)[1].lstrip()

# Decode exactly one JSON value and ignore whatever JavaScript follows it
# (the closing `;`, any other `var` statements). This replaces stripping
# hard-coded neighbouring statements and array endings by hand, which breaks
# as soon as the page's script changes shape.
script_content, value_end = json.JSONDecoder().raw_decode(assigned_text)

#### ... into a dataframe

In [8]:
# Load the parsed script content into a dataframe for cleaning.
src = pd.DataFrame(script_content)

#### Deal with nested columns - location info

In [9]:
# Keys of the nested `map_pin` records to surface as top-level columns.
map_pin_columns = [
    "address",
    "lat",
    "lng",
    "zoom",
    "place_id",
    "name",
    "street_number",
    "street_name",
    "street_name_short",
    "city",
    "state",
    "state_short",
    "post_code",
    "country",
    "country_short",
]

# Flatten the nested records and attach them BY NAME. Assigning the normalized
# frame straight to a fixed list of labels pairs columns by position, which is
# only right while json_normalize returns exactly these keys in exactly this
# order; otherwise it raises or silently mislabels columns. reindex() selects
# each column by label, yields NaN for a key no record has, and drops keys that
# are not listed.
src[map_pin_columns] = pd.json_normalize(src["map_pin"]).reindex(
    columns=map_pin_columns
)

#### Deal with nested columns - store info

In [10]:
# Flattened keys of the nested `branch_information` records to surface as
# top-level columns (dotted names are the paths json_normalize generates).
branch_columns = [
    "drive_thru",
    "store_owner",
    "store_image",
    "store_description",
    "phone_number",
    "zip_code_repeater",
    "icon_bbb",
    "icon_us_chamber_of_commerce",
    "logos_max_width",
    "icon_google_reviews.title",
    "icon_google_reviews.url",
    "icon_google_reviews.target",
    "icon_facebook.title",
    "icon_facebook.url",
    "icon_facebook.target",
    "icon_google_reviews",
    "icon_facebook",
    "store_image.ID",
    "store_image.id",
    "store_image.title",
    "store_image.filename",
    "store_image.filesize",
    "store_image.url",
    "store_image.link",
    "store_image.alt",
    "store_image.author",
    "store_image.description",
    "store_image.caption",
    "store_image.name",
    "store_image.status",
    "store_image.uploaded_to",
    "store_image.date",
    "store_image.modified",
    "store_image.menu_order",
    "store_image.mime_type",
    "store_image.type",
    "store_image.subtype",
    "store_image.icon",
    "store_image.width",
    "store_image.height",
    "store_image.sizes.thumbnail",
    "store_image.sizes.thumbnail-width",
    "store_image.sizes.thumbnail-height",
    "store_image.sizes.medium",
    "store_image.sizes.medium-width",
    "store_image.sizes.medium-height",
    "store_image.sizes.medium_large",
    "store_image.sizes.medium_large-width",
    "store_image.sizes.medium_large-height",
    "store_image.sizes.large",
    "store_image.sizes.large-width",
    "store_image.sizes.large-height",
    "store_image.sizes.1536x1536",
    "store_image.sizes.1536x1536-width",
    "store_image.sizes.1536x1536-height",
    "store_image.sizes.2048x2048",
    "store_image.sizes.2048x2048-width",
    "store_image.sizes.2048x2048-height",
]

# Flatten the nested records and attach them BY NAME. Assigning the normalized
# frame straight to a fixed list of labels pairs columns by position, so one
# added, removed or reordered key on the site would raise a length error or
# silently put values under the wrong label. reindex() selects each column by
# label, yields NaN for a key no record has, and drops keys that are not listed.
src[branch_columns] = pd.json_normalize(src["branch_information"]).reindex(
    columns=branch_columns
)

#### Clean up addresses and store IDs

In [11]:
# Street address = street number, a single space, then the street name.
house_number = src["street_number"]
street_name = src["street_name"]
src["street"] = house_number + " " + street_name

In [12]:
# Split each store page URL on "/" and take piece 4, the last path segment
# (e.g. "fc0130" in ".../locations/fc0130/").
url_parts = src["single_page"].str.split("/", expand=True)
page_slug = url_parts[4]

# Characters 2 through 5 of that segment are kept as the store id.
src["store_id"] = page_slug.str[2:6]

#### Just the columns we need and as we want them

In [13]:
# Source columns to keep, in output order, each mapped to its final name
# (columns that keep their name map to themselves).
output_columns = {
    "store_id": "store_id",
    "street": "street",
    "city": "city",
    "state_short": "state",
    "post_code": "zip",
    "lat": "latitude",
    "lng": "longitude",
    "phone_number": "phone",
    "single_page": "url",
}

df = src[list(output_columns)].rename(columns=output_columns)

#### The result

In [14]:
# Preview the first rows of the cleaned dataframe.
df.head()

Unnamed: 0,store_id,street,city,state,zip,latitude,longitude,phone,url
0,130,130 North Highway 123 Bypass,Seguin,TX,78155,29.568573,-97.941474,(830) 406-2195,https://shipleydonuts.com/locations/fc0130/
1,1130,11300 U.S. 290,Manor,TX,78653,30.34969,-97.526133,(512) 953-5499,https://shipleydonuts.com/locations/fc1130/
2,5154,1001 McKinney Street,Houston,TX,77002,29.756905,-95.364309,(713) 651-3033,https://shipleydonuts.com/locations/fc5154/
3,6018,3485 College Street,Beaumont,TX,77701,30.067952,-94.126887,(409) 832-2595,https://shipleydonuts.com/locations/fc6018/
4,6057,620 State Highway 288,Clute,TX,77531,29.0157,-95.393875,(979) 265-2822,https://shipleydonuts.com/locations/fc6057/


#### How many? 

In [15]:
# Number of rows, i.e. scraped locations.
len(df)

374

#### Create a mapping of state abbreviations to full state names using the us library

In [16]:
# Lookup from state abbreviation to full state name, built from the `us`
# library's STATES list.
state_mapping = {}
for us_state in us.states.STATES:
    state_mapping[us_state.abbr] = us_state.name

#### New column of full state names based on abbreviations

In [17]:
# Look up each abbreviation in state_mapping; abbreviations that are not keys
# of the mapping come out as NaN.
df["state_name"] = df["state"].map(state_mapping)

#### Make sure our brand name gets in the dataframe

In [18]:
# Same brand value on every row (the scalar is broadcast down the column).
df["brand"] = place_formal

---

## Geography

#### Make it a geodataframe

In [19]:
# Copy `df`: the GeoDataFrame below is built from this copy, while `df` itself
# is what the JSON and CSV exports write.
df_geo = df.copy()

In [20]:
# One point geometry per store, from longitude (x) and latitude (y).
store_points = gpd.points_from_xy(df_geo["longitude"], df_geo["latitude"])

# NOTE(review): no CRS is passed here — confirm that leaving it unset is intended.
gdf = gpd.GeoDataFrame(df_geo, geometry=store_points)

---

## Maps

#### US states background

In [21]:
# Base layer: the "states" feature of the vega_datasets us_10m file, drawn as
# light grey shapes with white borders in the albersUsa projection.
us_states = alt.topo_feature(data.us_10m.url, feature="states")
state_shapes = alt.Chart(us_states).mark_geoshape(fill="#e9e9e9", stroke="white")
sized_shapes = state_shapes.properties(
    width=800, height=500, title=f"{place_formal} locations"
)
background = sized_shapes.project("albersUsa")

#### Location points map

In [22]:
# One small circle per location, positioned by its coordinates.
point_encoding = {"longitude": "longitude:Q", "latitude": "latitude:Q"}
location_marks = alt.Chart(gdf).mark_circle(size=5, color=color)
points = location_marks.encode(**point_encoding)

# Layer the points over the states; the final expression displays the chart
# with the view border removed.
point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [23]:
# Collapse the locations to one record per state: the mean latitude/longitude
# of its stores and the number of stores.
per_state = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# One circle per state, sized by its store count and placed at the mean
# position computed above.
state_circles = per_state.mark_circle().encode(
    longitude="longitude:Q",
    latitude="latitude:Q",
    size=alt.Size("count:Q", title="Count by state"),
    color=alt.value(color),
    tooltip=["state:N", "count:Q"],
)
symbols = state_circles.properties(
    title=f"Number of {place_formal} in US, by average lon/lat of locations"
)

# Layer the circles over the states; the final expression displays the chart
# with the view border removed.
symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [24]:
# File-name stem: the place slug, lower-cased, with spaces as underscores.
json_stem = place.lower().replace(" ", "_")

# Write the locations as a list of records, indented four spaces.
df.to_json(f"data/processed/{json_stem}_locations.json", orient="records", indent=4)

#### CSV

In [25]:
# File-name stem: the place slug, lower-cased, with spaces as underscores.
csv_stem = place.lower().replace(" ", "_")

# Write the locations as CSV without the dataframe index column.
df.to_csv(f"data/processed/{csv_stem}_locations.csv", index=False)

#### GeoJSON

In [26]:
# File-name stem: the place slug, lower-cased, with spaces as underscores.
geojson_stem = place.lower().replace(" ", "_")

# Write the geodataframe (locations plus point geometry) with the GeoJSON driver.
gdf.to_file(f"data/processed/{geojson_stem}_locations.geojson", driver="GeoJSON")