# Get Walmart locations

#### Load Python tools and Jupyter config

In [1]:
import us
import json
import black
import requests
import logging
import pandas as pd
import jupyter_black
import altair as alt
from time import sleep
import geopandas as gpd
from random import randint
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 10000
pd.options.display.max_colwidth = None

In [3]:
place = "walmart"
place_formal = "Walmart"
color = "#004f9a"
latitude = "39.106667"
longitude = "-94.676392"

## Scrape

#### Headers for subsequent requests

In [4]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

#### Start by getting inline json from the [store directory](https://www.walmart.com/store-directory)

In [5]:
response = requests.get("https://www.walmart.com/store-directory", headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
stores = json.loads(soup.find_all("script", type="application/json")[0].string)

#### Navigate to the object containing a rough directory of each store

In [6]:
stores = json.loads(soup.find_all("script", type="application/json")[0].string)[
    "props"
]["pageProps"]["bootstrapData"]["cv"]["storepages"]["_all_"]["sdStoresPerCityPerState"]

#### Remove the string representation around the JSON

In [7]:
stores = stores.strip('"')

#### Parse the JSON string into a Python dictionary

In [8]:
stores_dict = json.loads(stores)

#### Parse details about each store, with error handling because some rows will be incomplete

In [9]:
# Initialize an empty list to store the data
all_data = []

# Iterate over each state's data
for state, cities in stores_dict.items():
    for city_data in cities:
        city_name = city_data["city"]

        # Check if 'stores' key exists in city_data and if it's iterable
        if "stores" in city_data and isinstance(city_data["stores"], list):
            for store in city_data["stores"]:
                store_info = {
                    "state": state.upper(),
                    "city": city_name,
                    "store_type": store.get("displayName", "N/A"),
                    "store_name": store.get("storeName", "N/A"),
                    "address": store.get("address", "N/A"),
                    "phone": store.get("phone", "N/A"),
                    "postal_code": store.get("postalCode", "N/A"),
                    "store_id": store.get("storeId", "N/A"),
                }
                all_data.append(store_info)
        else:
            # If 'stores' key doesn't exist or is not iterable, append placeholder data
            store_info = {
                "state": state.upper(),
                "city": city_name,
                "store_type": "N/A",
                "store_name": "N/A",
                "address": "N/A",
                "phone": "N/A",
                "postal_code": "N/A",
                "store_id": "N/A",
            }
            all_data.append(store_info)
            sleep(randint(1, 3))

#### Create a store dataframe from the list of dictionaries

In [10]:
df = pd.DataFrame(all_data)

#### The store data is incomple (some addresses missing and no geo coordinates) so we need to hit each store page

In [11]:
df["url"] = "https://www.walmart.com/store/" + df["store_id"].astype(str)
store_urls = df["url"].to_list()

#### Function for looping through urls, reading inline json and storing details

In [12]:
logging.basicConfig(level=logging.INFO)


def scrape_store_details(store_url):
    try:
        response = requests.get(store_url, headers=headers)
        response.raise_for_status()  # Raise exception for 4XX and 5XX status codes
        soup = BeautifulSoup(response.text, "html.parser")
        script_content = soup.find("script", id="__NEXT_DATA__").string
        if script_content:
            store_detail = json.loads(script_content)["props"]["pageProps"][
                "initialData"
            ]["initialDataNodeDetail"]["data"]["nodeDetail"]
            id = store_detail["id"]
            displayName = store_detail["displayName"]
            type = store_detail["type"]
            phoneNumber = store_detail["phoneNumber"]
            name = store_detail["name"]
            open24Hours = store_detail["open24Hours"]
            street = store_detail["address"]["addressLineOne"]
            state = store_detail["address"]["state"]
            city = store_detail["address"]["city"]
            zip_code = store_detail["address"]["postalCode"]
            geo_point = store_detail["geoPoint"]

            store_details_dict = {
                "id": id,
                "displayName": displayName,
                "type": type,
                "phoneNumber": phoneNumber,
                "name": name,
                "open24Hours": open24Hours,
                "street": street,
                "city": city,
                "state": state,
                "zip": zip_code,
                "geoPoint": geo_point,
            }
            return store_details_dict
        else:
            logging.error(f"No script content found for URL: {store_url}")
            return None
    except Exception as e:
        logging.error(f"Error scraping data from {store_url}: {e}")
        return None


store_details = []

for store_url in tqdm(store_urls):
    store_detail = scrape_store_details(store_url)
    if store_detail:
        store_details.append(store_detail)
        sleep(randint(1,3))

  0%|          | 0/4627 [00:00<?, ?it/s]

ERROR:root:Error scraping data from https://www.walmart.com/store/306: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/661: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/7247: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/316: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/4673: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/5062: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/356: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/2739: 'NoneType' object has no attribute 'string'
ERROR:root:Error scraping data from https://www.walmart.com/store/764: 'None

KeyboardInterrupt: 

#### Put the store details into a dataframe

In [None]:
src = pd.DataFrame(store_details)

In [None]:
src[["latitude", "longitude"]] = pd.json_normalize(src["geoPoint"])

In [None]:
df = src.copy()

In [None]:
len(df)

#### Create a mapping of state abbreviations to full state names using the us library

In [None]:
state_mapping = {state.abbr: state.name for state in us.states.STATES}

#### New column of full state names based on abbreviations

In [None]:
df["state_name"] = df["state"].map(state_mapping)

---

## Geography

#### Make it a geodataframe

In [None]:
df_geo = df.copy()

In [None]:
gdf = gpd.GeoDataFrame(
    df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude)
)

---

## Maps

#### US states background

In [None]:
background = (
    alt.Chart(alt.topo_feature(data.us_10m.url, feature="states"))
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .properties(width=800, height=500, title=f"{place_formal} locations")
    .project("albersUsa")
)

#### Location points map

In [None]:
points = (
    alt.Chart(gdf)
    .mark_circle(size=5, color=color)
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
    )
)

point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [None]:
symbols = (
    alt.Chart(gdf)
    .transform_aggregate(
        latitude="mean(latitude)",
        longitude="mean(longitude)",
        count="count()",
        groupby=["state"],
    )
    .mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [None]:
df.to_json(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.json",
    indent=4,
    orient="records",
)

#### CSV

In [None]:
df.to_csv(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.csv", index=False
)

#### GeoJSON

In [None]:
gdf.to_file(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson",
    driver="GeoJSON",
)