# Get JC Penney locations

#### Load Python tools and Jupyter config

In [1]:
%load_ext lab_black

In [2]:
import us
import json
import requests
import usaddress
import pandas as pd
import altair as alt
import geopandas as gpd
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [3]:
# Let pandas render up to 1000 rows/columns and never truncate cell text
# when frames are displayed in the notebook.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
# `place` is the slug used in export file names; `place_formal` is the
# display name used in chart titles.
place, place_formal = "jcpenney", "JC Penney"

## Read data

#### Get the state directory

In [5]:
# Browser-style request headers passed to every requests.get() call below.
# The user-agent is assembled from its space-separated product tokens to keep
# the line length manageable; the resulting string is unchanged.
headers = {
    "authority": "www.jcpenney.com",
    "origin": "https://www.jcpenney.com",
    "referer": "https://www.jcpenney.com/",
    "user-agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0",
            "Safari/537.36",
        )
    ),
}

#### Read the page and prepare for parsing

In [6]:
# Fetch the top-level locations directory page and parse it.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# parsing failure further down.
response = requests.get(
    "https://www.jcpenney.com/locations/index.html",
    headers=headers,
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

#### Split out the directory of states

In [7]:
# Build one record per entry in the directory list: the entry's link, the
# text before the opening parenthesis (the state name) and the text inside
# the parentheses (the location count, kept as a string).
directory = soup.find("ul", class_="Directory-listLinks")
if directory is None:
    raise ValueError("Directory list not found on the locations index page")

states_list = []

# Walk only the direct child *tags*. Iterating the <ul> itself also yields
# whitespace text nodes, which have no <a> element to read the link from.
for item in directory.find_all(True, recursive=False):
    # partition() tolerates an entry with no "(count)" suffix (count -> "").
    name, _, count = item.text.partition("(")
    states_list.append(
        {
            "url": item.find("a")["href"],
            "state": name.strip(),
            "count": count.replace(")", "").strip(),
        }
    )

#### Into a dataframe

In [8]:
# Tabulate the scraped directory records, one row per directory entry.
states_df = pd.DataFrame.from_records(states_list)

#### Two states have only one location each, so their directory entries link directly to the store page.

In [9]:
# Absolute URLs for the Alaska and Vermont directory entries, which are
# handled separately from the state-level pages.
single_store_mask = states_df["state"].isin(["Alaska", "Vermont"])
locations_later = (
    "https://www.jcpenney.com/locations/" + states_df.loc[single_store_mask, "url"]
).tolist()
locations_later

['https://www.jcpenney.com/locations/ak/anchorage/clothing-stores-anchorage-ak-1831.html',
 'https://www.jcpenney.com/locations/vt/southburlington/clothing-stores-southburlington-vt-530.html']

#### Others link to generic state pages

In [10]:
# Absolute URLs for every directory entry except Alaska and Vermont.
state_page_paths = states_df.loc[
    ~states_df["state"].isin(["Alaska", "Vermont"]), "url"
]
state_urls = ("https://www.jcpenney.com/locations/" + state_page_paths).tolist()

---

#### Go to each state page and grab an inline script with state location details

In [11]:
# Each state page carries its map configuration as JSON inside an inline
# <script> tag; the "locs" entry of that JSON becomes one dataframe per state.
state_locs_df_list = []

for s in tqdm(state_urls):
    # Bound the wait per request and fail on HTTP errors rather than trying
    # to parse an error page.
    state_url_response = requests.get(s, headers=headers, timeout=30)
    state_url_response.raise_for_status()
    state_url_soup = BeautifulSoup(state_url_response.text, "html.parser")
    state_url_script = state_url_soup.find("script", id="js-map-config-dir-map-2")
    if state_url_script is None or not state_url_script.string:
        # Name the offending page instead of failing with an AttributeError.
        raise ValueError(f"No map config script found on state page: {s}")
    state_url_json = state_url_script.string
    state_locs_df = pd.DataFrame(json.loads(state_url_json)["locs"])
    state_locs_df_list.append(state_locs_df)

  0%|          | 0/48 [00:00<?, ?it/s]

#### Similar strategy for Alaska and Vermont

In [12]:
# The directly linked pages carry the same kind of inline map-config JSON, but
# under a different script id. The page URL is attached as a `url` column.
state_locs_df_list_l = []

for location_url in tqdm(locations_later):
    # Bound the wait per request and fail on HTTP errors rather than trying
    # to parse an error page.
    state_url_response_l = requests.get(location_url, headers=headers, timeout=30)
    state_url_response_l.raise_for_status()
    state_url_soup_l = BeautifulSoup(state_url_response_l.text, "html.parser")
    state_url_script_l = state_url_soup_l.find(
        "script", id="js-map-config-dir-map-desktop-map"
    )
    if state_url_script_l is None or not state_url_script_l.string:
        # Name the offending page instead of failing with an AttributeError.
        raise ValueError(f"No map config script found on page: {location_url}")
    state_url_json_l = state_url_script_l.string
    state_locs_df_l = pd.DataFrame(json.loads(state_url_json_l)["locs"]).assign(
        url=location_url
    )
    state_locs_df_list_l.append(state_locs_df_l)

  0%|          | 0/2 [00:00<?, ?it/s]

#### Concatenate the returned lists

In [13]:
# Stack the frames from the directly linked pages and keep only the
# id, address text, coordinate and url columns.
src_l_columns = ["id", "altTagText", "latitude", "longitude", "url"]
src_l = pd.concat(state_locs_df_list_l).loc[:, src_l_columns]

In [14]:
# Stack the per-state frames and keep only the id, address text, coordinate
# and url columns.
src_o_columns = ["id", "altTagText", "latitude", "longitude", "url"]
src_o = pd.concat(state_locs_df_list).loc[:, src_o_columns]

#### Concatenate all the states into one big dataframe

In [15]:
# Combine both sources into a single frame with a fresh 0..n-1 index, and
# rename the raw `altTagText` column to `address`.
all_locations = pd.concat([src_l, src_o], ignore_index=True)
df = all_locations.rename(columns={"altTagText": "address"}).copy()

#### The addresses are pretty messy

In [16]:
# Remove the "JCPenney at " and "Location at " lead-in text from each address.
for lead_in in ("JCPenney at ", "Location at "):
    df["address"] = df["address"].str.replace(lead_in, "")

In [17]:
# The address column, iterated below when parsing each address.
addresses = df.loc[:, "address"]

#### Create empty lists to store parsed components

In [18]:
# Four independent accumulators, one per parsed address component.
streets, cities, states, zipcodes = [], [], [], []

#### Iterate through each address and parse its components

In [19]:
# Split each free-text address into street, city, state and ZIP code with
# usaddress. Every address appends exactly one entry to each of the four
# lists, so they stay aligned with `addresses`.
for address in addresses:
    try:
        parsed_address, _ = usaddress.tag(address)
    except usaddress.RepeatedLabelError:
        # usaddress could not tag this address; store blanks so the lists
        # keep the same length as the dataframe.
        streets.append("")
        cities.append("")
        states.append("")
        zipcodes.append("")
        continue

    # Join only the street components that are present. Concatenating with
    # fixed " " separators left runs of spaces inside the street whenever a
    # component (e.g. the pre-directional) was missing.
    street_parts = [
        parsed_address.get(label, "")
        for label in (
            "AddressNumber",
            "StreetNamePreDirectional",
            "StreetNamePreType",
            "StreetName",
        )
    ]
    street = " ".join(part for part in street_parts if part)

    # Append to respective lists
    streets.append(street)
    cities.append(parsed_address.get("PlaceName", ""))
    states.append(parsed_address.get("StateName", ""))
    zipcodes.append(parsed_address.get("ZipCode", ""))

#### Create new columns in the dataframe

In [20]:
# Attach the parsed components as new columns; the lists were filled in the
# same order as the dataframe's address column.
df["parsed_street"] = streets
df["parsed_city"] = cities
df["parsed_state"] = states
# ZIP codes were collected alongside the other components but never stored.
df["parsed_zipcode"] = zipcodes

---

## Geography

#### Make it a geodataframe

In [21]:
# Work on an independent (deep) copy so `df` itself is left untouched.
df_geo = df.copy(deep=True)

In [22]:
# Build point geometries from the longitude/latitude columns. The CRS is
# declared explicitly so exports and spatial operations know what the
# coordinates mean.
# NOTE(review): EPSG:4326 (WGS 84 degrees) is assumed for the site's
# coordinates — confirm against the source data.
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo["longitude"], df_geo["latitude"]),
    crs="EPSG:4326",
)

---

## Maps

#### US states background

In [23]:
# Base layer: light-grey US state shapes with white borders, drawn in the
# albersUsa projection and titled with the chain's display name.
us_states = alt.topo_feature(data.us_10m.url, feature="states")
background = (
    alt.Chart(us_states)
    .mark_geoshape(stroke="white", fill="#e9e9e9")
    .project("albersUsa")
    .properties(title=f"{place_formal} locations", width=800, height=500)
)

#### Location points map

In [24]:
# One small red circle per row, positioned by its longitude/latitude columns.
points = alt.Chart(gdf).mark_circle(color="red", size=10)
points = points.encode(latitude="latitude:Q", longitude="longitude:Q")

# Layer the circles over the state shapes and remove the view border.
point_map = alt.layer(background, points)
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [25]:
# Aggregate rows by parsed state: each group gets the mean latitude/longitude
# of its rows and a row count.
per_state = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["parsed_state"],
)

# Draw one red circle per group, sized by its count, with a tooltip showing
# the state and the count.
symbols = (
    per_state.mark_circle()
    .encode(
        latitude="latitude:Q",
        longitude="longitude:Q",
        color=alt.value("red"),
        size=alt.Size("count:Q", title="Count by state"),
        tooltip=["parsed_state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer the symbols over the state shapes and remove the view border.
symbol_map = alt.layer(background, symbols)
symbol_map.configure_view(stroke=None)

---

## Exports

#### CSV

In [None]:
# Write the table as CSV, leaving out the dataframe index.
csv_path = f"data/processed/{place}_locations.csv"
df.to_csv(csv_path, index=False)

#### JSON

In [None]:
# Write the table as an indented JSON array with one object per row.
json_path = f"data/processed/{place}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### GeoJSON

In [None]:
# Write the geodataframe as GeoJSON. The geodataframe in this notebook is
# named `gdf`; `locations_gdf` was never defined and raised a NameError.
gdf.to_file(f"data/processed/{place}_locations.geojson", driver="GeoJSON")