# Get DSW locations

#### Load Python tools and Jupyter config

In [1]:
import us 
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
# Raise pandas display limits: up to 1000 rows and 100 columns are rendered,
# and cell text is never truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = None

# Load the jupyter_black extension (formats code cells with Black).
jupyter_black.load()

In [3]:
# Display name of the brand; used in chart titles and in the `brand` column.
place_formal = "DSW"
# Lower-case slug used to build the export file names.
place = "dsw"
# Marker color for the location maps.
color = "#777777"

---

## Scrape
> DSW has a directory-style store locator, with pages for (1) [states](https://stores.dsw.com/usa.html), (2) [locations within states](https://stores.dsw.com/usa/al.html), and (3) [location detail pages](https://stores.dsw.com/usa/al/birmingham/dsw-designer-shoe-warehouse-brookwood-village.html).

#### Headers for requests

In [4]:
# Desktop-Chrome User-Agent string, kept in its own name so the header mapping
# below stays readable.
_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# HTTP request headers for the scraper.
headers = {"user-agent": _USER_AGENT}

### 1. States directory

#### Read page and parse its contents

In [5]:
# Fetch the top-level directory page that links to the per-state pages.
# - `headers` (defined in the "Headers for requests" cell) is actually sent.
# - `timeout` keeps the cell from hanging forever on a stalled connection.
# - `raise_for_status()` stops on an HTTP error instead of parsing an error
#   page as if it were the directory.
states_response = requests.get(
    "https://stores.dsw.com/usa.html", headers=headers, timeout=30
)
states_response.raise_for_status()
states_soup = BeautifulSoup(states_response.text, "html.parser")

In [6]:
# One URL per state link on the directory page. Only the first six characters
# of each href are kept before ".html" is appended (presumably the "usa/xx"
# state path -- confirm against the live markup).
state_page_urls = [
    "https://stores.dsw.com/" + state_link["href"][:6] + ".html"
    for state_link in tqdm(states_soup.find_all("a", class_="StateList-listLink"))
]

  0%|          | 0/45 [00:00<?, ?it/s]

### 2. Locations within states

In [7]:
# Collect the link target of every "CityList-listLink" anchor on each state page.
location_urls = []

for state_page_url in tqdm(state_page_urls):
    # Send the shared request headers, bound the wait with a timeout, and fail
    # loudly on an HTTP error so a broken state page cannot silently contribute
    # zero locations.
    state_loc_response = requests.get(state_page_url, headers=headers, timeout=30)
    state_loc_response.raise_for_status()
    state_loc_soup = BeautifulSoup(state_loc_response.text, "html.parser")
    for state_loc in state_loc_soup.find_all("a", class_="CityList-listLink"):
        location_urls.append("https://stores.dsw.com/" + state_loc["href"])

  0%|          | 0/45 [00:00<?, ?it/s]

### 3. Location detail pages

In [8]:
# One dict per store detail page, in the order the pages were fetched.
locations_details = []

# URLs whose page had no usable streetAddress <meta> tag and was therefore left
# out of `locations_details`. Inspect this list after the run to see what was
# dropped.
# NOTE(review): these may be city-level listing pages covering several stores;
# if so, those stores are not captured by this scrape -- confirm.
skipped_urls = []

for location_url in tqdm(location_urls):
    # Send the shared request headers and bound the wait with a timeout.
    location_response = requests.get(location_url, headers=headers, timeout=30)
    location_soup = BeautifulSoup(location_response.text, "html.parser")

    # A page without a streetAddress meta tag (or without its `content`
    # attribute) is skipped, as before, but explicitly: no bare `except`, and
    # the URL is recorded rather than silently discarded.
    street_tag = location_soup.find("meta", itemprop="streetAddress")
    street = street_tag.get("content") if street_tag is not None else None
    if street is None:
        skipped_urls.append(location_url)
        continue

    city = location_soup.find("span", class_="c-address-city").text
    state = location_soup.find("abbr", class_="c-address-state").text
    # Named `zip_code` so the builtin `zip` is not shadowed; the output key
    # stays "zip".
    zip_code = location_soup.find("span", class_="c-address-postal-code").text
    phone = location_soup.find("span", class_="c-phone-main-number-span").text
    longitude = location_soup.find("meta", itemprop="longitude")["content"]
    latitude = location_soup.find("meta", itemprop="latitude")["content"]

    locations_details.append(
        {
            "street": street,
            "city": city,
            "state": state,
            "zip": zip_code,
            "phone": phone,
            "longitude": longitude,
            "latitude": latitude,
        }
    )

  0%|          | 0/462 [00:00<?, ?it/s]

In [9]:
# Turn the list of per-store dicts into a table (one row per store).
df = pd.DataFrame(data=locations_details)

In [10]:
# Preview the first five scraped rows.
df.head(5)

Unnamed: 0,street,city,state,zip,phone,longitude,latitude
0,545 Brookwood Village,Birmingham,AL,35209,(205) 238-6245,-86.7742111,33.467652
1,4350 Creekside Avenue,Hoover,AL,35244,(205) 739-0219,-86.815287,33.375847
2,301 The Bridge Street,Huntsville,AL,35806,(256) 327-8550,-86.674362,34.715401
3,3930 B Airport Blvd,Mobile,AL,36608,(251) 460-9129,-88.1460254,30.6771837
4,7228 EastChase Parkway,Montgomery,AL,36117,(334) 270-7967,-86.165871,32.359165


In [11]:
# Number of scraped rows.
df.shape[0]

439

#### Create a mapping of state abbreviations to full state names using the us library

In [12]:
# Map postal abbreviations to full names.
# `us.states.STATES` on its own can omit the District of Columbia (it does in
# us 3.x unless DC statehood is enabled) and never includes the territories, so
# stores there would end up with a missing `state_name`. Build the mapping from
# the states-and-territories list plus DC instead; the 50 states map exactly
# as before.
state_mapping = {
    st.abbr: st.name for st in [*us.states.STATES_AND_TERRITORIES, us.states.DC]
}

#### New column of full state names based on abbreviations

In [13]:
# Look up each row's abbreviation in `state_mapping`; abbreviations that are
# not in the mapping come back as missing values.
full_state_names = df["state"].map(state_mapping)
df["state_name"] = full_state_names

#### Make sure our brand name gets in the dataframe

In [14]:
# Every row gets the same brand label.
brand_label = place_formal
df["brand"] = brand_label

---

## Geography

#### Make it a geodataframe

In [15]:
# Work on an independent (deep) copy so `df` itself is left untouched by the
# geometry step.
df_geo = df.copy(deep=True)

In [16]:
# Build point geometries from the scraped longitude/latitude columns and
# declare the coordinate reference system explicitly. The coordinates are plain
# lon/lat degrees, so EPSG:4326 (WGS 84) is used; without a CRS the
# GeoDataFrame and anything exported from it carries no projection metadata.
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude),
    crs="EPSG:4326",
)

---

## Maps

#### US states background

In [17]:
# Light-grey US state shapes with white borders; base layer for both maps.
us_states_topo = alt.topo_feature(data.us_10m.url, feature="states")

background = (
    alt.Chart(us_states_topo)
    .mark_geoshape(stroke="white", fill="#e9e9e9")
    .project("albersUsa")
    .properties(title=f"{place_formal} locations", width=800, height=500)
)

#### Location points map

In [18]:
# One small circle per store, positioned by its longitude/latitude fields.
store_chart = alt.Chart(gdf)
points = store_chart.mark_circle(color=color, size=10).encode(
    latitude="latitude:Q",
    longitude="longitude:Q",
)

# Draw the points on top of the state outlines; the final expression renders
# the layered chart without the view border.
point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [19]:
# Collapse the stores to one record per state: mean latitude, mean longitude
# and the number of rows in the group.
per_state = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# One circle per state, sized by its store count and placed at the averaged
# coordinates.
symbols = (
    per_state.mark_circle()
    .encode(
        latitude="latitude:Q",
        longitude="longitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer over the state outlines; the final expression renders the chart
# without the view border.
symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [20]:
# Write the table as an indented JSON array with one object per location.
json_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV

In [21]:
# Write the table as CSV, leaving out the row index.
csv_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [22]:
# Write the GeoDataFrame (attributes plus point geometry) as GeoJSON.
geojson_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
gdf.to_file(geojson_path, driver="GeoJSON")