# Get Home Depot locations

#### Load Python tools and Jupyter config

In [244]:
import re
import us
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [20]:
# Auto-format cells with black as they run
jupyter_black.load()

# Widen pandas' display limits so wide/long frames and long strings are not
# truncated when inspected in the notebook
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [None]:
# Slug used to build the export file names
place = "home-depot"
# Human-readable brand name used in chart titles and the "brand" column
place_formal = "Home Depot"
# Hex color applied to the map markers
color = "#f96302"

---

## Scrape
> Home Depot has a directory-style store locator with three levels: 1. [States directory](https://www.homedepot.com/l/storeDirectory) > 2. [Stores by state](https://www.homedepot.com/l/CA) > 3. [Individual locations](https://www.homedepot.com/l/Santa-Rosa/CA/Santa-Rosa/95403/1379)

#### Headers for requests

In [22]:
# Request headers sent with every page fetch below; the user-agent is a
# desktop Chrome string (presumably so the site serves the scraper the same
# pages a browser would get - confirm)
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

### 1. States

#### State page content

In [None]:
# Fetch the top-level store directory. The timeout keeps a stalled connection
# from hanging the notebook indefinitely, and raise_for_status() surfaces a
# blocked or failed request here rather than as an empty link list later on.
states_response = requests.get(
    "https://www.homedepot.com/l/storeDirectory", headers=headers, timeout=30
)
states_response.raise_for_status()
states_soup = BeautifulSoup(states_response.text, "html.parser")

#### Read state page links into a list

In [None]:
# Collect the href of every state link found on the directory page
state_urls = [
    link["href"]
    for link in states_soup.find_all(
        "a", class_="store-directory__state-link--rivl5"
    )
]

#### Remove dupes with a list comprehension and enumerate

In [None]:
# Keep only the first occurrence of each URL, preserving the original order:
# a URL survives when it does not appear anywhere earlier in the list
state_urls_unique = [
    url for idx, url in enumerate(state_urls) if url not in state_urls[:idx]
]

### 2. Stores by state

In [223]:
# Initialize an empty list to store all URLs across states
state_page_url_all = []

# List of substrings to exclude from the URLs
exclusions = ["rentals", "services", "garden-center", "designcenter"]

# Assuming state_urls_unique is a list of unique state URLs to iterate over
for su in tqdm(state_urls_unique):
    state_page_response = requests.get(f"{su}", headers=headers)
    state_page_soup = BeautifulSoup(state_page_response.text, "html.parser")

    # Directly appending each valid URL to the main list
    for sps in state_page_soup.find_all(
        "a", class_="store-directory__store-link--rivl5"
    ):
        url = sps["href"]
        if not any(exclusion in url for exclusion in exclusions):
            state_page_url_all.append(url)

  0%|          | 0/54 [00:00<?, ?it/s]

---

### 3. Individual locations

#### Get locations and coordinates from each page and deal with potential errors

In [230]:
# Locates the "window.__APOLLO_STATE__ = {" assignment. The lookahead leaves
# the opening brace unconsumed so JSON decoding can start exactly on it.
_APOLLO_STATE_RE = re.compile(r"window\.__APOLLO_STATE__\s*=\s*(?=\{)")


def _parse_store_page(html, store_id):
    """Pull one store's details out of a store page's embedded Apollo state.

    Args:
        html: Text of the store page response.
        store_id: Last path segment of the store page URL; it is part of the
            ROOT_QUERY key that the store record is looked up under.

    Returns:
        A dict with the keys store_id, store_name, street, city, state, zip,
        latitude, longitude, phone and store_details.

    Raises:
        ValueError: If no script assigns window.__APOLLO_STATE__ or the
            embedded JSON cannot be decoded.
        KeyError, IndexError, TypeError: If the decoded state lacks the
            expected query entry, an entry in its store list, or a field.
    """
    soup = BeautifulSoup(html, "html.parser")
    script_content = next(
        (
            script.text
            for script in soup.find_all("script")
            if "window.__APOLLO_STATE__" in script.text
        ),
        None,
    )
    if script_content is None:
        raise ValueError("no script containing window.__APOLLO_STATE__")

    assignment = _APOLLO_STATE_RE.search(script_content)
    if assignment is None:
        raise ValueError("window.__APOLLO_STATE__ is not assigned an object")

    # raw_decode parses exactly one JSON document starting at the brace and
    # ignores whatever follows it, so a "};" inside the payload cannot
    # truncate the match the way a non-greedy regex capture can.
    apollo_state, _ = json.JSONDecoder().raw_decode(
        script_content, assignment.end()
    )

    json_data = apollo_state["ROOT_QUERY"][
        f'storeSearch({{"lat":"","lng":"","pagesize":"10","storeFeaturesFilter":{{"applianceShowroom":false,"expandedFlooringShowroom":false,"keyCutting":false,"loadNGo":false,"penske":false,"propane":false,"toolRental":false,"wiFi":false}},"storeSearchInput":"{store_id}"}})'
    ]["stores"][0]

    return {
        "store_id": store_id,
        "store_name": json_data["name"],
        "street": json_data["address"]["street"],
        "city": json_data["address"]["city"],
        "state": json_data["address"]["state"],
        "zip": json_data["address"]["postalCode"],
        "latitude": json_data["coordinates"]["lat"],
        "longitude": json_data["coordinates"]["lng"],
        "phone": json_data["phone"],
        "store_details": json_data["storeDetailsPageLink"],
    }


locations_list = []

# One session reuses the connection across the store page requests
with requests.Session() as session:
    session.headers.update(headers)

    for location in tqdm(state_page_url_all):
        # The store id is the last path segment; rstrip guards against a
        # trailing slash producing an empty id
        store_id = location.rstrip("/").split("/")[-1]

        try:
            location_response = session.get(location, timeout=30)
            location_response.raise_for_status()
            store_info = _parse_store_page(location_response.text, store_id)
        except requests.RequestException as e:
            print(f"Store {store_id}: request failed ({e}) - skipping.")
            continue
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # A page with a missing or unexpected structure is skipped as a
            # whole, so one odd page cannot abort the rest of the crawl
            print(
                f"Store {store_id}: could not parse page "
                f"({type(e).__name__}: {e}) - skipping."
            )
            continue

        locations_list.append(store_info)

  0%|          | 0/2002 [00:00<?, ?it/s]

#### Put the list of location dictionaries into a dataframe

In [234]:
# One row per scraped store; columns come from the keys of each store dict
df = pd.DataFrame(locations_list)

In [None]:
# Preview the first rows to sanity-check the scraped fields
df.head()

#### How many stores? 

In [237]:
# Row count = number of stores successfully scraped
len(df)

2002

#### Create a mapping of state abbreviations to full state names using the us library

In [238]:
# us.states.STATES holds only the 50 states, while the crawl above visits 54
# state pages (presumably DC and territories as well - confirm). Include the
# territories and DC so those stores get a state_name instead of NaN.
state_mapping = {
    state.abbr: state.name
    for state in [*us.states.STATES_AND_TERRITORIES, us.states.DC]
}

#### New column of full state names based on abbreviations

In [239]:
# Look up each abbreviation in state_mapping; abbreviations that are not in
# the mapping come out as NaN
df["state_name"] = df["state"].map(state_mapping)

#### Make sure our brand name gets in the dataframe

In [240]:
# Same brand label on every row
df["brand"] = place_formal

---

## Geography

#### Make it a geodataframe

In [241]:
# Work on a copy so df itself is left untouched for the JSON and CSV exports
df_geo = df.copy()

In [242]:
# Build point geometries from the longitude/latitude columns. The values are
# assumed to be WGS84 degrees (confirm), so the frame is tagged EPSG:4326;
# without a CRS the frame cannot be reprojected and the exported file carries
# no reference-system information.
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude),
    crs="EPSG:4326",
)

---

## Maps

#### US states background

In [245]:
# Base layer: US state shapes from the vega us_10m topology, drawn in light
# grey with white borders on an Albers USA projection
states_topo = alt.topo_feature(data.us_10m.url, feature="states")
background = (
    alt.Chart(states_topo)
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .project("albersUsa")
    .properties(width=800, height=500, title=f"{place_formal} locations")
)

#### Location points map

In [246]:
# One small brand-colored dot per store, positioned by its coordinates
store_marks = alt.Chart(gdf).mark_circle(size=5, color=color)
points = store_marks.encode(longitude="longitude:Q", latitude="latitude:Q")

# Layer the dots over the state outlines; dropping the view stroke removes
# the frame around the map
point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [247]:
# Collapse the stores to one row per state: the store count plus the mean
# latitude/longitude of that state's stores, which is where its bubble sits
per_state = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# Bubble area scales with the number of stores in the state
symbols = (
    per_state.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer the bubbles over the state outlines and drop the frame around the map
symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [248]:
# Write the stores as a pretty-printed JSON array, one object per store
json_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV

In [249]:
# Write the stores as CSV without the numeric row index
csv_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [250]:
# Write the geodataframe (store attributes plus point geometry) as GeoJSON
geojson_path = (
    f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
)
gdf.to_file(geojson_path, driver="GeoJSON")