# Get Walmart locations

#### Load Python tools and Jupyter config

In [1]:
import us
import json
import requests
import logging
import pandas as pd
import jupyter_black
import altair as alt
from time import sleep
import geopandas as gpd
from random import randint
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 10000
pd.options.display.max_colwidth = None

In [3]:
place = "walmart"
place_formal = "Walmart"
color = "#004f9a"
latitude = "39.106667"
longitude = "-94.676392"

## Scrape

#### Headers for subsequent requests

In [22]:
import requests

cookies = {
    "vtc": "dsf-feqNjK3e99eds_itoc",
    "pxcts": "4703c602-d356-11ee-9f4d-a0714ce64e69",
    "_pxvid": "4703b8e7-d356-11ee-9f4d-0f7c20d54edc",
    "TBV": "7",
    "ACID": "bf61a010-a7bd-48c3-ba44-57d414d376a1",
    "hasACID": "true",
    "_m": "9",
    "locGuestData": "eyJpbnRlbnQiOiJTSElQUElORyIsImlzRXhwbGljaXQiOmZhbHNlLCJzdG9yZUludGVudCI6IlBJQ0tVUCIsIm1lcmdlRmxhZyI6ZmFsc2UsImlzRGVmYXVsdGVkIjpmYWxzZSwicGlja3VwIjp7Im5vZGVJZCI6IjU2MDQiLCJ0aW1lc3RhbXAiOjE3MDg4MDc4MDE1NjcsInNlbGVjdGlvblR5cGUiOiJMU19TRUxFQ1RFRCIsInNlbGVjdGlvblNvdXJjZSI6IklQX1NOSUZGRURfQllfTFMifSwicG9zdGFsQ29kZSI6eyJ0aW1lc3RhbXAiOjE3MDg4MDc4MDE1NjcsImJhc2UiOiI5MDA2NiJ9LCJtcCI6W10sInZhbGlkYXRlS2V5IjoicHJvZDp2MjpiZjYxYTAxMC1hN2JkLTQ4YzMtYmE0NC01N2Q0MTRkMzc2YTEifQ%3D%3D",
    "abqme": "false",
    "_pxhd": "fd815f95f080e7d386372fdcf2159aa5410ee682ef944df6a31b087241f16bc3:4703b8e7-d356-11ee-9f4d-0f7c20d54edc",
    "AID": "wmlspartner%3D0%3Areflectorid%3D0000000000000000000000%3Alastupd%3D1708824467031",
    "_astc": "7499aa0c7866e9220858d7da3273c971",
    "auth": "MTAyOTYyMDE44bSOOTMNtTO0NdzoBcpEiz71L45BainhBvo3xdCjlUZ8gp66XNXxj1D0rBQjPnYhaQ2sx9WZJPSHyV%2Fvs0o343HxyURrpjJp%2BqLaLSE9%2BxWQCjJynR6puDQrOE%2Fu9%2FnZ767wuZloTfhm7Wk2KcjygjFwIZIekXC4wlSRgDWHtly3RD4tMMaLR6dneb45Zfr9fJxtKJ8cEG0q6gXZ0%2FkIngpIbvLLzf%2Fu%2BqjCpU4aavMUMk70P8glgOEpLOprhDfMJ0tmvH1FCaN9tZDh4SCrHa7G7xrQQyHZn%2BeeTotcM0yqCkaFAyxvCSy8VuhYNnogSakep0QzYEXi%2FF%2BHtMeNBGECdoDPhWRVD4D9oyw7Dt5OKFi3swiMmlsNxpQUUQ9ZrHhGHap%2B3Azx2tCZG8UHHUjyrOXbKKhH072NS%2FW0j%2FU%3D",
    "locDataV3": "eyJpc0RlZmF1bHRlZCI6ZmFsc2UsImlzRXhwbGljaXQiOmZhbHNlLCJpbnRlbnQiOiJTSElQUElORyIsInBpY2t1cCI6W3siYnVJZCI6IjAiLCJub2RlSWQiOiI1NjA0IiwiZGlzcGxheU5hbWUiOiJIYXd0aG9ybmUgTmVpZ2hib3Job29kIE1hcmtldCIsIm5vZGVUeXBlIjoiU1RPUkUiLCJhZGRyZXNzIjp7InBvc3RhbENvZGUiOiI5MDI1MCIsImFkZHJlc3NMaW5lMSI6IjE0NDQxIEluZ2xld29vZCBBdmUiLCJjaXR5IjoiSGF3dGhvcm5lIiwic3RhdGUiOiJDQSIsImNvdW50cnkiOiJVUyIsInBvc3RhbENvZGU5IjoiOTAyNTAtNjcxNSJ9LCJnZW9Qb2ludCI6eyJsYXRpdHVkZSI6MzMuOTAwMjA3LCJsb25naXR1ZGUiOi0xMTguMzYyNDM0fSwiaXNHbGFzc0VuYWJsZWQiOnRydWUsInNjaGVkdWxlZEVuYWJsZWQiOnRydWUsInVuU2NoZWR1bGVkRW5hYmxlZCI6ZmFsc2UsImh1Yk5vZGVJZCI6IjU2MDQiLCJzdG9yZUhycyI6IjA2OjAwLTIzOjAwIiwic3VwcG9ydGVkQWNjZXNzVHlwZXMiOlsiUElDS1VQX0NVUkJTSURFIiwiUElDS1VQX0lOU1RPUkUiXSwic2VsZWN0aW9uVHlwZSI6IkxTX1NFTEVDVEVEIn1dLCJzaGlwcGluZ0FkZHJlc3MiOnsibGF0aXR1ZGUiOjM0LjAwMSwibG9uZ2l0dWRlIjotMTE4LjQzMDMsInBvc3RhbENvZGUiOiI5MDA2NiIsImNpdHkiOiJMb3MgQW5nZWxlcyIsInN0YXRlIjoiQ0EiLCJjb3VudHJ5Q29kZSI6IlVTQSIsImdpZnRBZGRyZXNzIjpmYWxzZSwidGltZVpvbmUiOiJBbWVyaWNhL0xvc19BbmdlbGVzIn0sImFzc29ydG1lbnQiOnsibm9kZUlkIjoiNTYwNCIsImRpc3BsYXlOYW1lIjoiSGF3dGhvcm5lIE5laWdoYm9yaG9vZCBNYXJrZXQiLCJpbnRlbnQiOiJQSUNLVVAifSwiaW5zdG9yZSI6ZmFsc2UsInJlZnJlc2hBdCI6MTcwOTU2NjU4OTE1MiwidmFsaWRhdGVLZXkiOiJwcm9kOnYyOmJmNjFhMDEwLWE3YmQtNDhjMy1iYTQ0LTU3ZDQxNGQzNzZhMSJ9",
    "assortmentStoreId": "5604",
    "hasLocData": "1",
    "userAppVersion": "us-web-1.121.0-bacc43dc6e33f9a3d16d10d1c017ecb2728c6565-0301",
    "bstc": "dXFZEevgJ8Go8JI9pw0IQM",
    "mobileweb": "0",
    "xpth": "x-o-mart%2BB2C~x-o-mverified%2Bfalse",
    "xpa": "5bkff|7ieYb|BENcO|EecJe|JhmI0|KfNzJ|LYm-g|RX-Ie|S2Tuw|TpjqX|Y-4cz|YI6Fz|cZcLX|cf8cl|eZVOZ|f1Vss|fdm-7|lGW14|oAvw4|xL5Od",
    "exp-ck": "5bkff2BENcO1EecJe2S2Tuw1cZcLX1f1Vsscfdm-71oAvw41xL5Od1",
    "akavpau_p2": "1709563589~id=432bd1e1352892f65bab4b0f1d35e7ac",
    "ak_bmsc": "592088B926BB7516DF48EA1D8090C02D~000000000000000000000000000000~YAAQ79fOF8I7TQWOAQAApw3lCReuPa7HLO2Lg2ByVyFK0e2d5aQOKbbEq1w6sE9BYdHtJdXjvGqnwTCC3wWOWZ4G82Hlss1qIgqJ6SkAmz16WBtV7GF02ZdhTzAQ3g/yDTHnHMWgRJCyGTFxR3c/7lBBZahgFnSUhXHPlG/+9nXlyPUXmS0SENMzwh0kZ3tN804gWXRLAShAAs/HnHHHHl6P0fJXnaW9b4YkRosR452Bd6ChlwEQrrfffgEXxCGbSduMQ14jwsGXOKqSYDxGFE8ROOzGEWtbaPhO5lqrpyraWJIITBIWi09u/9he1taaGzzH01hr3UiaKtY55lk80PuAyEKf2BHKP1Q5ox293h0VDlERuX7VUWXG0Nnf05A2adJsSIwBpP+K2fQ=",
    "adblocked": "true",
    "xptc": "assortmentStoreId%2B5604",
    "xpm": "1%2B1709562990%2Bdsf-feqNjK3e99eds_itoc~%2B0",
    "akavpau_p1": "1709563644~id=70b22eff83f9698d237f9d65d9ee3b28",
    "xptwg": "4153479651:1191438151E2EB0:2C228F5:7A62626E:AF3222DD:6C4B450A:",
    "xptwj": "qq:d2d9eeb9d4d3701b96f4:rfJGbUQvrOee7sCNb5xENVHT44mYzJHBtvnCS3HnWItOl4DVCUYQZGcmxz+6v57+b5az4VabIoFz6vOqe+V2XGGT4YiKWVxQerZJ7b0Aa/fWwwNo470BE0K4iG46Ib4EL+7Ykza861OTWVhDNQP3HoRrpgt4VgE5",
    "com.wm.reflector": '"reflectorid:0000000000000000000000@lastupd:1709563291000@firstcreate:1708824462448"',
    "TS01a90220": "010a503cbd5bd734cb6d0ebd356ad10ea5d1dfbe6261f1f7837e88ede5a07f0e57836a1f4205b84c702d91a660491ea2ead54095f6",
    "bm_mi": "D3FA32D0ADEDCAC644E740D302650395~YAAQ3dfOF88SZQCOAQAA0wvsCRfAb/jqhsAaecw5f659xPwE+s7Vlo9hJPjzhLPpz3JjqNbXGvX+b0C+23J/MOalrBqPERVl9nGVo6SrMEizRPKavubM71mvbOL7wd7doztrVhJrCTWhEmjWXFa7QeGHF6v7oNKUuJsyOyby2voHCrP+s2oKOfYwjYg/Ma0vAOmSt479+sW2RjbEhYZDohZ/fQLlyi8PDEsl169CSbuMggQsn1RNI5b995pBR4EzAcjhpP6X4gQICwyBs5wpqy2MJj40Pssq0O6Zng9N+QghDsviWWiqfFQrUjH48Biegwtg0BBpQA6y0bYO9+DSflCzdi3j07TJUrA=~1",
    "TS012768cf": "01f7d437ca15b341b722fbd3c220957c381287fd47ebe52219b18beff1b3de6a890ceb41da2fd7e8f33e83aa3fa1c657688e561932",
    "TS2a5e0c5c027": "08c62f723dab20006967d1a561772b1e3bca2ee5bfe72daeae38ec1ffb3dfec92c0f1fb0450cc70b08dd4762b41130003b63f0bf5117801827520c80867a572c4d4ec1b2e323e00ca2631f74ceb76406c347b35cef5fab5f93e38eaeb242000a",
    "bm_sv": "656B4A95451CB122CD32C2B9BC8426C9~YAAQ3dfOF1sYZQCOAQAA4hTsCRfIyL0EyFZ5+3gzJc39GqEey2rlFTeEx6BE1hHMU7dZ7AaGBwVI1RGDkC/3ZlSZBYP29+HHUU9Y9KMnGzK0svQRh6CzIpz/GyKppj1Khyo002hmyqFQtEOpndLUF1JHqyQuLjrGPoHJTzuhQGlKgSokFZbVdKJ7iodYPgrWo3T2ieKRLy8duk6lV0+EqJYYZwf7he54OYbj7NjyAtastTmJeUOEXTKZcA6y3m2+Zl0=~1",
    "_pxde": "c128d2ff1d389b1896276459e3dbff18e2f3120fc2c5f817cc337a3725895f86:eyJ0aW1lc3RhbXAiOjE3MDk1NjM0NTE2MDJ9",
}

headers = {
    "authority": "www.walmart.com",
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,es;q=0.8",
    "downlink": "10",
    "dpr": "2",
    "referer": "https://www.walmart.com/store-directory/ak",
    "sec-ch-ua": '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "synthetic-request-for-logging": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
}

#### Start by getting inline json from the [store directory](https://www.walmart.com/store-directory)

In [28]:
response = requests.get(
    "https://www.walmart.com/store-directory", cookies=cookies, headers=headers
)
soup = BeautifulSoup(response.text, "html.parser")

In [None]:

stores = json.loads(soup.find_all("script", type="application/json")[0].string)

In [29]:
soup



#### Navigate to the object containing a rough directory of each store

In [6]:
stores = json.loads(soup.find_all("script", type="application/json")[0].string)[
    "props"
]["pageProps"]["bootstrapData"]["cv"]["storepages"]["_all_"]["sdStoresPerCityPerState"]

#### Remove the string representation around the JSON

In [7]:
stores = stores.strip('"')

#### Parse the JSON string into a Python dictionary

In [8]:
stores_dict = json.loads(stores)

#### Parse details about each store, with error handling because some rows will be incomplete

In [9]:
# Initialize an empty list to store the data
all_data = []

# Iterate over each state's data
for state, cities in stores_dict.items():
    for city_data in cities:
        city_name = city_data["city"]

        # Check if 'stores' key exists in city_data and if it's iterable
        if "stores" in city_data and isinstance(city_data["stores"], list):
            for store in city_data["stores"]:
                store_info = {
                    "state": state.upper(),
                    "city": city_name,
                    "store_type": store.get("displayName", "N/A"),
                    "store_name": store.get("storeName", "N/A"),
                    "address": store.get("address", "N/A"),
                    "phone": store.get("phone", "N/A"),
                    "postal_code": store.get("postalCode", "N/A"),
                    "store_id": store.get("storeId", "N/A"),
                }
                all_data.append(store_info)
        else:
            # If 'stores' key doesn't exist or is not iterable, append placeholder data
            store_info = {
                "state": state.upper(),
                "city": city_name,
                "store_type": "N/A",
                "store_name": "N/A",
                "address": "N/A",
                "phone": "N/A",
                "postal_code": "N/A",
                "store_id": "N/A",
            }
            all_data.append(store_info)
            sleep(randint(1, 3))

#### Create a store dataframe from the list of dictionaries

In [10]:
df = pd.DataFrame(all_data)

In [11]:
df.head()

Unnamed: 0,state,city,store_type,store_name,address,phone,postal_code,store_id
0,AK,Anchorage,Supercenter,Walmart Supercenter,3101 A St,907-563-5900,99503.0,2070
1,AK,Anchorage,Supercenter,Walmart Supercenter,8900 Old Seward Hwy,907-344-5300,99515.0,2071
2,AK,Anchorage,Supercenter,Walmart Supercenter,7405 Debarr Road,907-339-9039,99504.0,4359
3,AK,Eagle River,,,,,,2188
4,AK,Fairbanks,,,,,,2722


#### The store data is incomple (some addresses missing and no geo coordinates) so we need to hit each store page

In [None]:
df["url"] = "https://www.walmart.com/store/" + df["store_id"].astype(str)
store_urls = df["url"].to_list()

In [17]:
for store_url in tqdm(store_urls[0:2]):
    response = requests.get(store_url, headers=headers)
    response.raise_for_status()  # Raise exception for 4XX and 5XX status codes
    soup = BeautifulSoup(response.text, "html.parser")

  0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
soup

<html lang="en">
<head>
<title>Robot or human?</title>
<meta content="width=device-width" name="viewport"/>
<style>
    #sign-in-widget a,
    #sign-in-widget a:active,
    #sign-in-widget a:hover {
        color: #000
    }

    #sign-in-widget h1 {
        font-weight: 500;
        font-size: 20px;
        font-size: 1.25rem;
        letter-spacing: -.6px;
        margin: 1px auto
    }

    @media (min-width:30em) {
        #sign-in-widget h1 {
            margin-top: 24px;
            font-size: 24px;
            font-size: 1.5rem
        }
    }

    #sign-in-widget {
        font-family: BogleWeb, Helvetica Neue, Helvetica, Arial, sans-serif
    }

    #sign-in-widget * {
        box-sizing: border-box
    }

    #sign-in-widget .text-right {
        text-align: right
    }

    @font-face {
        font-family: NewYorkIcons;
        src: url(6255ed72d86ece856725a2d80878bce6.eot);
        font-weight: 400;
        font-style: normal
    }

    @font-face {
        font-family: Ne

#### Function for looping through urls, reading inline json and storing details

In [13]:
logging.basicConfig(level=logging.INFO)


def scrape_store_details(store_url):
    try:
        response = requests.get(store_url, headers=headers)
        response.raise_for_status()  # Raise exception for 4XX and 5XX status codes
        soup = BeautifulSoup(response.text, "html.parser")
        script_content = soup.find("script", id="__NEXT_DATA__").string
        if script_content:
            store_detail = json.loads(script_content)["props"]["pageProps"][
                "initialData"
            ]["initialDataNodeDetail"]["data"]["nodeDetail"]
            id = store_detail["id"]
            displayName = store_detail["displayName"]
            type = store_detail["type"]
            phoneNumber = store_detail["phoneNumber"]
            name = store_detail["name"]
            open24Hours = store_detail["open24Hours"]
            street = store_detail["address"]["addressLineOne"]
            state = store_detail["address"]["state"]
            city = store_detail["address"]["city"]
            zip_code = store_detail["address"]["postalCode"]
            geo_point = store_detail["geoPoint"]

            store_details_dict = {
                "id": id,
                "displayName": displayName,
                "type": type,
                "phoneNumber": phoneNumber,
                "name": name,
                "open24Hours": open24Hours,
                "street": street,
                "city": city,
                "state": state,
                "zip": zip_code,
                "geoPoint": geo_point,
            }
            return store_details_dict
        else:
            logging.error(f"No script content found for URL: {store_url}")
            return None
    except Exception as e:
        logging.error(f"Error scraping data from {store_url}: {e}")
        return None


store_details = []

for store_url in tqdm(store_urls):
    store_detail = scrape_store_details(store_url)
    if store_detail:
        store_details.append(store_detail)
        sleep(randint(4,8))

  0%|          | 0/4627 [00:00<?, ?it/s]

ERROR:root:Error scraping data from https://www.walmart.com/store/2070: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2071: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/4359: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2188: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2722: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/4474: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2710: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2711: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2074: 

KeyboardInterrupt: 

#### Put the store details into a dataframe

In [None]:
src = pd.DataFrame(store_details)

In [None]:
src[["latitude", "longitude"]] = pd.json_normalize(src["geoPoint"])

In [None]:
df = src.copy()

In [None]:
len(df)

#### Create a mapping of state abbreviations to full state names using the us library

In [None]:
state_mapping = {state.abbr: state.name for state in us.states.STATES}

#### New column of full state names based on abbreviations

In [None]:
df["state_name"] = df["state"].map(state_mapping)

---

## Geography

#### Make it a geodataframe

In [None]:
df_geo = df.copy()

In [None]:
gdf = gpd.GeoDataFrame(
    df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude)
)

---

## Maps

#### US states background

In [None]:
background = (
    alt.Chart(alt.topo_feature(data.us_10m.url, feature="states"))
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .properties(width=800, height=500, title=f"{place_formal} locations")
    .project("albersUsa")
)

#### Location points map

In [None]:
points = (
    alt.Chart(gdf)
    .mark_circle(size=5, color=color)
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
    )
)

point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [None]:
symbols = (
    alt.Chart(gdf)
    .transform_aggregate(
        latitude="mean(latitude)",
        longitude="mean(longitude)",
        count="count()",
        groupby=["state"],
    )
    .mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [None]:
df.to_json(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.json",
    indent=4,
    orient="records",
)

#### CSV

In [None]:
df.to_csv(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.csv", index=False
)

#### GeoJSON

In [None]:
gdf.to_file(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson",
    driver="GeoJSON",
)