# Get Hmart locations

#### Load Python tools and Jupyter config

In [1]:
import re
import us
import json
import black
import codecs
import requests
import usaddress
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
# Load the jupyter_black extension for this notebook session.
jupyter_black.load()

# Raise pandas' display limits so wide frames, long frames and long strings
# (such as full addresses) are shown without truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [3]:
# Brand identifiers reused by the rest of the notebook.
place_formal = "Hmart"  # display name: chart titles and the `brand` column
place = "hmart"  # slug used to build the export file names
color = "#ea1a1e"  # hex color applied to the map markers

## Read data

#### All the locations

In [4]:
# Request headers carrying a desktop Chrome user-agent string. The value is
# split across adjacent literals purely for readability; Python joins them
# into a single string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

In [5]:
# Fetch the store locator page.
# - Send the `headers` dict defined earlier in the notebook (it was built but
#   never passed to the request).
# - Bound the wait with a timeout so a stalled connection cannot hang the
#   notebook indefinitely.
# - Fail fast on an HTTP error status instead of parsing an error page.
response = requests.get(
    "https://www.hmart.com/ourstores",
    headers=headers,
    timeout=30,
)
response.raise_for_status()

In [6]:
# Parse the decoded response body with the "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [7]:
# Collect the href of every <a class="store-name"> link found on the page.
store_urls = [link["href"] for link in soup.find_all("a", class_="store-name")]

#### Send a GET request to the main store locator URL

In [8]:
# Request the main store locator page.
# - Send the `headers` dict defined earlier in the notebook (it was built but
#   never passed to the request).
# - Bound the wait with a timeout so a stalled connection cannot hang the
#   notebook indefinitely.
# - Fail fast on an HTTP error status instead of parsing an error page.
url = "https://www.hmart.com/ourstores"
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

#### Parse the HTML content of the page

In [9]:
# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(markup=response.content, features="html.parser")

#### Find all scripts containing the store information

In [10]:
# Select every <script> tag whose `xml` attribute equals "space".
script_attrs = {"xml": "space"}
scripts = soup.find_all("script", attrs=script_attrs)

#### Extract JSON data from the second script (index 1)

In [11]:
# Pull the `jsonLocations` object out of the second matching <script> tag and
# flatten its `items` list into `df`, one row per store.
#
# Any failure raises ValueError instead of only printing a message: carrying
# on would leave `df` undefined (or stale from an earlier run of the notebook),
# and every later cell depends on it.

# Output column name -> key in each entry of the embedded `items` list.
store_fields = {
    "store_id": "id",
    "store_name": "name",
    "address": "address",
    "latitude": "lat",
    "longitude": "lng",
    "phone": "phone",
}

json_data = None
if len(scripts) > 1:
    script_text = scripts[1].text.strip()
    # Greedy capture: from the first "{" after `jsonLocations:` to the last
    # "}" on the same line ("." does not match newlines here).
    match = re.search(r"jsonLocations:\s*(\{.*\})", script_text)
    if match:
        json_data = match.group(1)

if not json_data:
    raise ValueError("No JSON data found in the script.")

try:
    # NOTE(review): the replace() presumably strips a surplus closing brace
    # that the greedy capture picks up from the enclosing JS object; confirm
    # against the live page source.
    store_info = json.loads(json_data.replace("}        }", "}"))
except json.JSONDecodeError as e:
    raise ValueError(f"Error decoding JSON data: {e}") from e

# Keep only the fields we need from each store entry.
store_items = store_info.get("items", [])
stores_data = [
    {column: store[key] for column, key in store_fields.items()}
    for store in store_items
]

# Passing `columns` keeps the expected schema even when `items` is empty.
df = pd.DataFrame(stores_data, columns=list(store_fields))

---

## Addresses

#### Clean up a punctuation quirk in a couple of addresses

In [12]:
# Insert a space after every period in the address strings.
# regex=False is explicit so "." is always treated as a literal period: the
# default for `regex` has changed across pandas releases, and in a regular
# expression "." would match any character.
df["address"] = df["address"].str.replace(".", ". ", regex=False)

# Series of cleaned address strings, consumed by the parsing loop.
addresses = df["address"]

#### Define lists to store parsed address components

In [13]:
# One independent accumulator list per parsed address component.
streets, cities, states, zips = [], [], [], []

#### Parse each address and extract components

In [14]:
# Tag each address with usaddress and collect the components we keep.
# A component missing from the tagged result is recorded as an empty string.
for address in addresses:
    tagged_parts, _address_type = usaddress.tag(address)
    parts = dict(tagged_parts)

    house_number = parts.get("AddressNumber", "")
    street_name = parts.get("StreetName", "")
    # NOTE(review): only the number and street name are kept; other tagged
    # parts (e.g. street suffix or direction) are not included - confirm
    # this is intended.
    streets.append(" ".join((house_number, street_name)))

    cities.append(parts.get("PlaceName", ""))
    states.append(parts.get("StateName", ""))
    zips.append(parts.get("ZipCode", ""))

#### Add parsed components to DataFrame

In [15]:
# Attach each list of parsed components as a new column, in this order.
parsed_columns = {
    "street": streets,
    "city": cities,
    "state": states,
    "zip": zips,
}
for column_name, column_values in parsed_columns.items():
    df[column_name] = column_values

#### Drop the original 'address' column now that it has been split into components

In [16]:
# Remove the raw address column; its parts now live in their own columns.
df = df.drop("address", axis="columns")

#### Create a mapping of state abbreviations to full state names using the us library

In [17]:
# Lookup table from state abbreviation to full state name, built from the
# entries of `us.states.STATES`.
state_mapping = {entry.abbr: entry.name for entry in us.states.STATES}

#### New column of full state names based on abbreviations

In [18]:
# Translate each row's state abbreviation into its full name. Values absent
# from `state_mapping` come out as NaN.
state_abbreviations = df["state"]
df["state_name"] = state_abbreviations.map(state_mapping)

#### Make sure our brand name gets in the dataframe

In [19]:
# Tag every row with the formal brand name; the scalar is broadcast to all rows.
df["brand"] = place_formal

---

## Geography

#### Make it a geodataframe

In [20]:
# Work on an independent (deep) copy so the geometry column added next does
# not end up on `df`, which is exported as plain JSON/CSV.
df_geo = df.copy(deep=True)

In [21]:
# Build point geometries from the coordinate columns (x = longitude,
# y = latitude) and wrap the frame as a GeoDataFrame.
# The CRS is declared explicitly so the frame (and the GeoJSON export) carries
# its reference system instead of being CRS-less.
# NOTE(review): EPSG:4326 assumes the site's lat/lng values are WGS84 degrees.
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude),
    crs="EPSG:4326",
)

---

## Maps

#### US states background

In [22]:
# Base layer: US state shapes from the vega_datasets 10m topology, drawn in
# light grey with white borders under the Albers USA projection.
us_states = alt.topo_feature(data.us_10m.url, feature="states")
background = alt.Chart(us_states).mark_geoshape(fill="#e9e9e9", stroke="white")
background = background.properties(
    width=800, height=500, title=f"{place_formal} locations"
)
background = background.project("albersUsa")

#### Location points map

In [23]:
# One small brand-colored dot per store, positioned by its coordinates.
points = alt.Chart(gdf).mark_circle(size=10, color=color)
points = points.encode(longitude="longitude:Q", latitude="latitude:Q")

# Layer the dots over the state background. The final expression is the
# cell's value, which is what the notebook renders.
point_map = alt.layer(background, points)
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [24]:
# Collapse the stores to one record per state: the mean latitude/longitude of
# that state's stores and the number of stores.
state_totals = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# One circle per state, sized by store count and drawn in the brand color.
symbols = (
    state_totals.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer the symbols over the state background. The final expression is the
# cell's value, which is what the notebook renders.
symbol_map = alt.layer(background, symbols)
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [25]:
# Write the table as indented JSON, one object per store.
# The file name is the lower-cased brand slug with spaces turned into "_".
file_stem = place.lower().replace(" ", "_")
json_path = f"data/processed/{file_stem}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV

In [26]:
# Write the table as CSV without the index column.
# The file name is the lower-cased brand slug with spaces turned into "_".
file_stem = place.lower().replace(" ", "_")
csv_path = f"data/processed/{file_stem}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [27]:
# Write the geodataframe (attributes plus point geometry) as GeoJSON.
# The file name is the lower-cased brand slug with spaces turned into "_".
file_stem = place.lower().replace(" ", "_")
geojson_path = f"data/processed/{file_stem}_locations.geojson"
gdf.to_file(geojson_path, driver="GeoJSON")