# Get Menchie's locations

#### Load Python tools and Jupyter config

In [3]:
%load_ext lab_black

In [165]:
import pandas as pd
import os
import requests
import json

os.environ["USE_PYGEOS"] = "0"
import geopandas as gpd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
import usaddress

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1020
pd.options.display.max_colwidth = None

---

## Scrape

#### Get details about each location from its directory

In [198]:
url = "https://www.menchies.com/all-locations#intl"

In [199]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
location_list = soup.findAll("div", class_="loc-info")

In [200]:
loc_list = []

for loc in location_list:
    # Extracting directions element
    directions = loc.find("p", class_="loc-directions")

    if directions:
        # Check if coords element is not None before using it
        coords_element = directions.find("a")
        coords = (
            coords_element["href"].replace("https://maps.google.com/?daddr=", "")
            if coords_element
            else None
        )

        # Extracting phone element
        phone_element = loc.find("p", class_="loc-phone")
        phone = phone_element.get_text() if phone_element else None

        # Creating loc_dict with checks for coords and phone
        loc_dict = {
            "location": loc.find("a").text,
            "url": "https://www.menchies.com/" + loc.find("a")["href"],
            "address": loc.find("div", class_="loc-address")
            .get_text(separator=", ")
            .strip(),
            "coords": coords if coords is not None else "",
            "phone": phone if phone is not None else "",
        }

        # Append loc_dict to loc_list
        loc_list.append(loc_dict)
    else:
        print(f"Directions not found for a location.")

Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.
Directions not found for a location.


In [215]:
df = pd.DataFrame(loc_list)

In [216]:
len(df)

324

In [217]:
df["street"] = df["address"].str.split(", ", expand=True)[0]
df["unit"] = df["address"].str.split(", ", expand=True)[1]
df["city"] = df["address"].str.split(", ", expand=True)[2]
df["state"] = df["address"].str.split(", ", expand=True)[3]

In [218]:
df[["latitude", "longitude"]] = df["coords"].str.split(", ", expand=True)

In [219]:
def parse_address(address):
    try:
        parsed_address = usaddress.tag(address)[0]
        return {
            "number": parsed_address.get("AddressNumber", ""),
            "street": parsed_address.get("StreetName", ""),
            "unit": parsed_address.get("OccupancyIdentifier", ""),
            "city": parsed_address.get("PlaceName", ""),
            "state": parsed_address.get("StateName", ""),
            "zip": parsed_address.get("ZipCode", ""),
        }
    except Exception as e:
        print(f"Error parsing address: {address}")
        return ""


# Apply the parse_address function to the 'address' column
df["parsed_address"] = df["address"].apply(parse_address)

Error parsing address: 8544 US-42 #100, Suite 100, Florence, KY 41042
Error parsing address: 2004 50th Ave., Unit 117, Red Deer, AB AB T4R 3A2
Error parsing address: 525 Ninth St. East , Cornwall , ON K6H 0A3
Error parsing address: 158 Guelph Street, Unit 1, Georgetown, ON L7G 4A6
Error parsing address: 518 St. Clair Ave. West, (St. Clair just west of Bathurst St.), Toronto, ON M6C 1A2
Error parsing address: 20 Broadleaf Avenue, (Just North of Brock St. N and Taunton Rd.), Whitby, ON L1R 0B5


In [220]:
parsed_address_df = df["parsed_address"].apply(pd.Series)

# Explicitly specify the dtype for each column
column_dtype_mapping = {
    "number": "object",
    "street": "str",
    "unit": "str",
    "city": "str",
    "state": "str",
    "zip": "str",
}

parsed_address_df = parsed_address_df.astype(column_dtype_mapping)

# Replace None values with a default value, e.g., an empty string
parsed_address_df = parsed_address_df.fillna("")

# Concatenate the original DataFrame with the parsed_address_df
df = pd.concat([df, parsed_address_df], axis=1)

# Drop the original "parsed_address" column if needed
df = df.drop(columns=["parsed_address"])

# Display the updated DataFrame
df = df.drop(["coords", "address", "state", "state", 0], axis=1)

In [221]:
df.head()

Unnamed: 0,location,url,phone,street,unit,city,latitude,longitude,number,street.1,unit.1,city.1,zip
0,Hillside Plaza,https://www.menchies.com//locations/frozen-yogurt-hillside-plaza-ak,(907) 929-9977,9000 Lake Otis Parkway,Unit 4,Anchorage,61.139147,-149.834163,9000,Lake Otis,4.0,Anchorage,99507
1,Gilbert Gateway Towne Center,https://www.menchies.com//locations/frozen-yogurt-gilbert-gateway-towne-center-az,(480) 783-2441,5022 S. Power Road,Ste. 108,Gilbert,33.324948,-111.688498,5022,Power,108.0,Gilbert,85212
2,San Tan Village,https://www.menchies.com//locations/frozen-yogurt-san-tan-village-az,(480) 993-3336,3131 S. Market Street,Suite 111,Gilbert,33.297007,-111.748993,3131,Market,111.0,Gilbert,85295
3,Stockton Hill,https://www.menchies.com//locations/frozen-yogurt-stockton-hill-az,(928) 263-6646,3535 Stockton Hills Rd.,Kingman,AZ 86409,35.22653,-114.036959,3535,Stockton Hills,,Kingman,86409
4,The QC District,https://www.menchies.com//locations/frozen-yogurt-the-qc-district--az,(480) 784-3943,21295 S. Ellsworth Loop Road,Suite 104,Queen Creek,33.255009,-111.636543,21295,Ellsworth Loop,104.0,Queen Creek,85142


#### Geodataframe from lon/lat

In [223]:
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
)

ValueError: could not convert string to float: "43°58'56.7"

#### Plot it

In [224]:
gdf.plot()

NameError: name 'gdf' is not defined

In [225]:
df = df.drop(["geometry"], axis=1)

KeyError: "['geometry'] not found in axis"

---

## Exports

In [222]:
gdf.to_file("data/processed/ulta.geojson", driver="GeoJSON")

NameError: name 'gdf' is not defined

In [None]:
df.to_json("data/processed/ulta.json", orient="records", indent=4)

In [None]:
df.to_csv("data/processed/ulta.csv", index=False)