# Get Dollar Tree locations

#### Load Python tools and Jupyter config

In [1]:
import us
import json
import time
import random
import requests
import urllib.parse
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm, trange

In [2]:
# Notebook setup: auto-format cells with Black.
jupyter_black.load()

# Show wide/long dataframes and long cell values in full rather than truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Lift Altair's cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Brand identifiers: slug used in export file names, display name, map color.
place, place_formal = "dollar-tree", "Dollar Tree"
color = "#008500"

# Run date as YYYY-MM-DD; stamped on every record and on dated export files.
today = pd.Timestamp.today().date().isoformat()

---

## Scrape

#### Get a dataframe of geographically dispersed zip codes

In [4]:
# zips_large = (
#     pd.read_json("../_reference/data/zip_code_demographics_esri.json")
#     .query("population > 5000")
#     .sort_values("population", ascending=False)
#     .reset_index(drop=True)
#     .head(1000)
# )
# zips_large["zipcode"] = zips_large["zipcode"].astype(str).str.zfill(5)

In [5]:
# zips_sample = (
#     pd.read_json("../_reference/data/zip_code_demographics_esri.json")
#     .query("population > 5000")
#     .sort_values("population", ascending=False)
#     .reset_index(drop=True)
#     .tail(11000)
#     .sample(500)
# )
# zips_sample["zipcode"] = zips_sample["zipcode"].astype(str).str.zfill(5)

In [6]:
# zips = pd.concat([zips_sample, zips_large]).reset_index(drop=True)

In [7]:
# Load the representative zip codes and normalize them to 5-character,
# zero-padded strings (restores leading zeros if they were parsed as numbers).
zips = pd.read_json("../_reference/representative_zip_codes.json")
zips = zips.assign(zipcode=zips["zipcode"].astype(str).str.zfill(5))

#### Headers for request

In [8]:
# HTTP headers sent with every store-locator request.
headers = {
    "Accept": ", ".join(
        ["text/javascript", "text/html", "application/xml", "text/xml", "*/*"]
    ),
    "Referer": "https://hosted.where2getit.com/dollartree/",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "sec-ch-ua-platform": '"macOS"',
}

In [9]:
# Query the store locator once per representative zip code and collect the
# stores returned by each search into `dfs` (one dataframe per zip code).

# Fields kept from each <poi> record, in output column order.
poi_columns = [
    "name",
    "address1",
    "address2",
    "city",
    "clientkey",
    "latitude",
    "longitude",
    "phone",
    "postalcode",
    "state",
    "status",
    "temp_closed",
    "timezone",
    "uid",
]

dfs = []
failed_zips = []  # zip codes whose request failed or returned an error response

for _, row in tqdm(zips.iterrows(), total=zips.shape[0]):
    # Extract zip code, longitude, and latitude
    zip_code = row["zipcode"]
    longitude = row["longitude"]
    latitude = row["latitude"]

    # Dynamically create the XML part of the request with the current zip code, longitude, and latitude
    xml_request = f"""<request><appkey>134E9A7A-AB8F-11E3-80DE-744E58203F82</appkey><formdata id="locatorsearch"><dataview>store_default</dataview><limit>250</limit><geolocs><geoloc><addressline>{zip_code}</addressline><longitude>{longitude}</longitude><latitude>{latitude}</latitude></geoloc></geolocs><searchradius>100</searchradius><where><icon><eq></eq></icon><ebt><eq></eq></ebt><crafters_square><eq></eq></crafters_square><freezers><eq></eq></freezers><snack_zone><eq></eq></snack_zone></where></formdata></request>"""

    # URL encode the XML request
    encoded_xml_request = urllib.parse.quote_plus(xml_request)

    # Construct the URL with the encoded XML request
    url = f"https://hosted.where2getit.com/dollartree/ajax?&xml_request={encoded_xml_request}"

    # A single bad response should not abort the whole run: record the zip
    # code, report it, and move on. The timeout keeps a stalled connection
    # from hanging the loop indefinitely.
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.RequestException, ET.ParseError) as error:
        failed_zips.append(zip_code)
        print(f"Skipping {zip_code}: {error}")
        continue

    # Only a <response code="1"> document carries results. Bail out here so a
    # previous iteration's results are never re-used for this zip code.
    if root.tag != "response" or root.attrib.get("code") != "1":
        failed_zips.append(zip_code)
        print(
            f"Skipping {zip_code}: unexpected response code {root.attrib.get('code')!r}"
        )
        continue

    collection = root.find("collection")
    if collection is None:
        continue

    # One dict per store: child tag -> text
    poi_data = [
        {child.tag: child.text for child in poi} for poi in collection.findall("poi")
    ]
    if not poi_data:
        # No stores in this search area
        continue

    # Convert to DataFrame; reindex (rather than column selection) so a field
    # absent from this response becomes an empty column instead of a KeyError
    src = pd.DataFrame(poi_data).reindex(columns=poi_columns)
    dfs.append(src)
    # time.sleep(random.uniform(0.5, 2.0))

  0%|          | 0/1053 [00:00<?, ?it/s]

#### Combine the list of dataframes into a single dataframe

In [10]:
# Stack the per-zip-code results and drop exact duplicate rows (the same store
# can come back from more than one search). `ignore_index=True` on the final
# step leaves a contiguous 0..n-1 index instead of one with gaps where the
# duplicates were removed.
src_df = pd.concat(dfs, ignore_index=True).drop_duplicates(ignore_index=True)

#### How many stores?

In [11]:
# Number of unique store rows
src_df.shape[0]

8159

#### How many are tagged "temp_closed"?

In [12]:
# Rows whose temp_closed flag is the string "1"
len(src_df[src_df["temp_closed"] == "1"])

36

#### How many have no closed flag?

In [13]:
# Rows with no temp_closed value at all
len(src_df[src_df["temp_closed"].isna()])

3444

#### How many have an open flag?

In [14]:
# Rows whose temp_closed flag is the string "0"
len(src_df[src_df["temp_closed"] == "0"])

4679

In [15]:
# Working copy of the scraped data: drop the uid and status columns and give
# the address fields shorter names.
column_names = {"address1": "street", "address2": "location", "postalcode": "zip"}
df = src_df.drop(columns=["uid", "status"]).rename(columns=column_names).copy()

#### Create a mapping of state abbreviations to full state names using the us library

In [16]:
# Map postal abbreviations to full names. By default `us.states.STATES` covers
# only the 50 states, so DC and the territories are added explicitly; otherwise
# stores located there would end up with a missing `state_name`.
state_mapping = {
    state.abbr: state.name
    for state in [*us.states.STATES_AND_TERRITORIES, us.states.DC]
}

#### New column of full state names based on abbreviations

In [17]:
# Look up each store's full state name from its abbreviation; abbreviations
# that are not in the mapping come back as NaN.
state_names = df["state"].map(state_mapping)
df["state_name"] = state_names

#### Make sure our brand name gets in the dataframe

In [18]:
# Same brand label on every row
df = df.assign(brand=place_formal)

#### Add fetch date

In [19]:
# Stamp every row with the date of this run
df = df.assign(updated=today)

---

## Geography

#### Make it a geodataframe

In [20]:
# Independent copy so adding geometry does not touch the tabular `df`
df_geo = df.copy(deep=True)

In [21]:
# Build a point geometry for each store from its longitude/latitude columns.
# The coordinates are presumably plain WGS84 degrees (as store locators
# normally report), so tag the frame as EPSG:4326 rather than leaving it with
# no CRS, which would block any later reprojection or spatial join.
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo["longitude"], df_geo["latitude"]),
    crs="EPSG:4326",
)

---

## Maps

#### US states background

In [22]:
# Base layer shared by both maps: light-gray US states with white borders,
# drawn in the Albers USA projection.
us_states = alt.topo_feature(data.us_10m.url, feature="states")
background = (
    alt.Chart(us_states)
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .project("albersUsa")
    .properties(width=800, height=500, title=f"{place_formal} locations")
)

#### Location points map

In [23]:
# One small dot per store, positioned by its longitude/latitude.
store_dots = alt.Chart(gdf).mark_circle(size=5, color=color)
points = store_dots.encode(longitude="longitude:Q", latitude="latitude:Q")

# Overlay the dots on the states layer and hide the view border.
point_map = alt.layer(background, points)
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [24]:
# Aggregate to one row per state: store count plus the mean longitude/latitude
# of that state's stores, which is where the circle is drawn.
state_totals = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# Circle area encodes the store count.
symbols = (
    state_totals.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Overlay the circles on the states layer and hide the view border.
symbol_map = alt.layer(background, symbols)
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [25]:
# Write the records twice: a "latest" file and a snapshot dated with this run.
json_stem = f"data/processed/{place.lower().replace(' ', '_')}_locations"
for json_path in (f"{json_stem}.json", f"{json_stem}_{today}.json"):
    df.to_json(json_path, indent=4, orient="records")

#### CSV

In [26]:
# Write the records twice: a "latest" file and a snapshot dated with this run.
csv_stem = f"data/processed/{place.lower().replace(' ', '_')}_locations"
for csv_path in (f"{csv_stem}.csv", f"{csv_stem}_{today}.csv"):
    df.to_csv(csv_path, index=False)

#### GeoJSON

In [27]:
# "Latest" GeoJSON export; the same path is written on every run.
geojson_latest = f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
gdf.to_file(geojson_latest, driver="GeoJSON")



In [28]:
# Dated GeoJSON snapshot for this run.
geojson_dated = (
    f"data/processed/{place.lower().replace(' ', '_')}_locations_{today}.geojson"
)
gdf.to_file(geojson_dated, driver="GeoJSON")

