# Get Chipotle locations

#### Load Python tools and Jupyter config

In [1]:
import us 
import re
import json
import black
import random
import requests
import numpy as np
import pandas as pd
import jupyter_black
import altair as alt
from time import sleep
import geopandas as gpd
from random import randint
from bs4 import BeautifulSoup
from vega_datasets import data
from tqdm.notebook import tqdm, trange

In [2]:
# Enable the jupyter_black extension and widen the pandas display limits so
# scraped frames can be inspected without truncation.
jupyter_black.load()
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 1000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Brand settings reused below: `place` builds the export file names,
# `place_formal` appears in chart titles, and `color` fills the map markers.
place_formal = "Chipotle"
place = "chipotle"
color = "#996f14"

---

## Scrape
> Unpack the locations directory: states, cities, locations

#### States

In [4]:
# Fetch the top-level directory page and parse it for the per-state links.
# The timeout keeps the cell from hanging forever on a stalled connection.
states = requests.get("https://locations.chipotle.com/", timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
states.raise_for_status()
states_soup = BeautifulSoup(states.text, "html.parser")

#### Get state names, urls and counts for Chipotle restaurant locations

In [5]:
# Build one record per state link found on the directory landing page.
state_urls = []

for s in states_soup.find_all("a", class_="Directory-listLink"):
    # Strip the parentheses around the count and store it as an int, matching
    # how the per-city counts are stored further down.
    count = int(s["data-count"].replace(")", "").replace("(", ""))
    # Only the first path segment of the href is used, so the URL always
    # points at the state-level directory page.
    state_slug = s["href"].split("/")[0]
    url = f"https://locations.chipotle.com/{state_slug}"
    state_name = s.text
    state_urls_dict = {
        "state_name": state_name,
        "state_url": url,
        "state_locations": count,
    }
    state_urls.append(state_urls_dict)

#### Put it into a dataframe

In [6]:
# One row per state: name, directory url and location count.
states_df = pd.DataFrame(data=state_urls)

#### How many locations in the US?

In [7]:
# Cast the per-state counts to integers, then total them.
location_counts = states_df["state_locations"].astype(int)
location_counts.sum()

3385

#### List comprehension for state urls

In [8]:
# Keep only the directory url from each scraped state record.
state_directory_urls = [record["state_url"] for record in state_urls]

---

#### Cities

In [9]:
# Visit every state directory page and collect one record per city link.
cities_urls = []

# A single session reuses the underlying connection across the per-state
# requests instead of opening a new one each time.
with requests.Session() as session:
    for sdu in tqdm(state_directory_urls):
        # Time out rather than hang, and stop on an HTTP error instead of
        # parsing an error page as if it were a directory listing.
        cities = session.get(sdu, timeout=30)
        cities.raise_for_status()
        cities_soup = BeautifulSoup(cities.text, "html.parser")
        for cp in cities_soup.find_all("a", class_="Directory-listLink"):
            city_url = cp["href"]
            cities_urls_dict = {
                "cities_name": cp.text,
                # The first path segment of the href is the state slug.
                "city_state": city_url.split("/")[0].upper(),
                "cities_url": city_url,
                # Strip the parentheses around the count before casting.
                "cities_locations": int(
                    cp["data-count"].replace(")", "").replace("(", "")
                ),
            }
            cities_urls.append(cities_urls_dict)

  0%|          | 0/49 [00:00<?, ?it/s]

#### Into a dataframe

In [10]:
# One row per city link: name, state, relative url and location count.
cities_df = pd.DataFrame(data=cities_urls)

#### Cities with only one location already have a direct link to specific restaurants

In [11]:
# Independent copy of the rows whose city has exactly one location.
has_single_location = cities_df["cities_locations"] == 1
cities_df_one = cities_df[has_single_location].copy()

In [12]:
# Turn the relative hrefs into absolute urls.
single_base_url = "https://locations.chipotle.com/"
cities_df_one["cities_url"] = single_base_url + cities_df_one["cities_url"]

In [13]:
# Preview the first five single-location cities.
cities_df_one.head(n=5)

Unnamed: 0,cities_name,city_state,cities_url,cities_locations
0,Athens,AL,https://locations.chipotle.com/al/athens/1289-us-highway-72-e,1
1,Auburn,AL,https://locations.chipotle.com/al/auburn/346-w-magnolia-ave,1
3,Cullman,AL,https://locations.chipotle.com/al/cullman/1821-cherokee-ave-sw,1
4,Daphne,AL,https://locations.chipotle.com/al/daphne/914-van-ave,1
5,Decatur,AL,https://locations.chipotle.com/al/decatur/1109-beltline-rd-se,1


#### Cities with multiple locations need another round of scraping to get those urls

In [14]:
# Independent copy of the rows whose city has more than one location.
has_many_locations = cities_df["cities_locations"] > 1
cities_df_plural = cities_df[has_many_locations].copy()

In [15]:
# Absolute url of each multi-location city's directory page.
plural_base_url = "https://locations.chipotle.com/"
cities_df_plural["plural_cities_url"] = plural_base_url + cities_df_plural["cities_url"]

In [16]:
# De-duplicated city directory urls in first-seen order, with a two-item preview.
plural_cities_directory_urls = (
    cities_df_plural["plural_cities_url"].drop_duplicates().tolist()
)
plural_cities_directory_urls[:2]

['https://locations.chipotle.com/al/birmingham',
 'https://locations.chipotle.com/al/hoover']

#### Scrape the plural cities

In [17]:
# Visit each multi-location city page and collect one record per restaurant
# teaser link. "cities_name" holds the teaser link text here; the key names
# mirror the single-location frame so the two can be concatenated later.
plural_cities_urls = []

# A single session reuses the underlying connection across the city requests.
with requests.Session() as session:
    for psdu in tqdm(plural_cities_directory_urls):
        # Time out rather than hang, and stop on an HTTP error instead of
        # parsing an error page.
        plural_cities = session.get(psdu, timeout=30)
        plural_cities.raise_for_status()
        plural_cities_soup = BeautifulSoup(plural_cities.text, "html.parser")
        for pcp in plural_cities_soup.find_all("a", class_="Teaser-titleLink"):
            # The hrefs are relative (a "../" prefix). Stripping every leading
            # "." and "/" handles zero, one or several "../" segments, where
            # split("../")[1] would raise IndexError or return "".
            plural_city_url = pcp["href"].lstrip("./")

            plural_cities_urls_dict = {
                "cities_name": pcp.text,
                # First segment of the cleaned path is the state slug.
                "city_state": plural_city_url.split("/")[0].upper(),
                "cities_url": plural_city_url,
            }
            plural_cities_urls.append(plural_cities_urls_dict)

  0%|          | 0/514 [00:00<?, ?it/s]

In [18]:
# One row per restaurant link found on the multi-location city pages.
plural_cities_df = pd.DataFrame(data=plural_cities_urls)

In [19]:
# Turn the relative restaurant paths into absolute urls.
restaurant_base_url = "https://locations.chipotle.com/"
plural_cities_df["cities_url"] = restaurant_base_url + plural_cities_df["cities_url"]

In [20]:
# Preview the last five restaurant links.
plural_cities_df.tail(n=5)

Unnamed: 0,cities_name,city_state,cities_url
1959,Chipotle East Towne,WI,https://locations.chipotle.com/wi/madison/4628-e-washington-ave
1960,Chipotle U of Wisconsin,WI,https://locations.chipotle.com/wi/madison/658-state-st
1961,Chipotle West Towne,WI,https://locations.chipotle.com/wi/madison/7066-sligo-dr
1962,Chipotle South 27th Street,WI,https://locations.chipotle.com/wi/milwaukee/3232-s-27th-st
1963,Chipotle Ogden Avenue,WI,https://locations.chipotle.com/wi/milwaukee/600-e-ogden-ave


In [21]:
# Stack the multi-location and single-location rows under a fresh 0..n-1 index.
cities_final_df = pd.concat([plural_cities_df, cities_df_one], ignore_index=True)

In [22]:
# Row count of the combined frame.
cities_final_df.shape[0]

3385

---

#### Locations

In [23]:
# De-duplicated restaurant page urls, in first-seen order.
locations_urls = cities_final_df["cities_url"].drop_duplicates().tolist()

In [24]:
# Spot-check two urls starting at position 1964.
# NOTE(review): presumably where the single-location rows begin — confirm after a re-scrape.
peek_start = 1964
locations_urls[peek_start : peek_start + 2]

['https://locations.chipotle.com/al/athens/1289-us-highway-72-e',
 'https://locations.chipotle.com/al/auburn/346-w-magnolia-ave']

In [25]:
def _required_text(soup, tag, css_class):
    """Return the stripped text of a mandatory page element.

    Raises:
        ValueError: if the page has no ``tag`` element carrying ``css_class``.
    """
    node = soup.find(tag, class_=css_class)
    if node is None:
        raise ValueError(f"missing <{tag} class={css_class!r}>")
    return node.text.strip()


def _parse_location(soup, url):
    """Build one restaurant record from a parsed location page.

    Coordinates and address parts are mandatory; phone and store id fall back
    to an empty string when the page does not provide them.

    Raises:
        ValueError: if a mandatory element is absent.
        KeyError: if a coordinate meta tag has no ``content`` attribute.
    """
    latitude_tag = soup.find("meta", itemprop="latitude")
    longitude_tag = soup.find("meta", itemprop="longitude")
    if latitude_tag is None or longitude_tag is None:
        raise ValueError("missing latitude/longitude meta tags")

    # Optional: not every page carries a phone link.
    phone_tag = soup.find("a", class_="Phone-link")
    phone = phone_tag.text.strip() if phone_tag is not None else ""

    # Optional: the store id is whatever follows "restaurant=" in the
    # nutrition link, when that link and marker are present.
    nutrition_tag = soup.find("a", class_="Nutrition-cta")
    nutrition_href = nutrition_tag.get("href", "") if nutrition_tag is not None else ""
    href_parts = nutrition_href.split("restaurant=")
    store_id = href_parts[1] if len(href_parts) > 1 else ""

    # Key order sets the column order of the resulting frame and exports.
    return {
        "store_id": store_id,
        "street": _required_text(soup, "span", "c-address-street-1"),
        "city": _required_text(soup, "span", "c-address-city"),
        "state": _required_text(soup, "abbr", "c-address-state"),
        "zip": _required_text(soup, "span", "c-address-postal-code"),
        "phone": phone,
        "url": url.replace("https://locations.chipotle.com/", ""),
        # Coordinates are kept as the raw strings found in the meta tags.
        "latitude": latitude_tag["content"],
        "longitude": longitude_tag["content"],
    }


# Successfully parsed restaurants, and the urls that could not be fetched/parsed
location_records = []
failed_urls = []

# A single session reuses the underlying connection across all requests.
with requests.Session() as session:
    for url in tqdm(locations_urls):
        try:
            # Time out rather than hang on one stalled request.
            response = session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            location_records.append(_parse_location(soup, url))
        except (requests.RequestException, KeyError, ValueError) as err:
            # One bad page should not discard every page already scraped;
            # record it so it can be inspected or retried afterwards.
            failed_urls.append(url)
            print(f"Skipping {url}: {err}")

# Create DataFrame; explicit columns keep the schema stable even if nothing
# was scraped.
df = pd.DataFrame(
    location_records,
    columns=[
        "store_id",
        "street",
        "city",
        "state",
        "zip",
        "phone",
        "url",
        "latitude",
        "longitude",
    ],
)

  0%|          | 0/3385 [00:00<?, ?it/s]

In [42]:
# Preview the first five scraped restaurants.
df.head(n=5)

Unnamed: 0,store_id,street,city,state,zip,phone,url,latitude,longitude,state_name
0,2642.0,300 20th St S,Birmingham,AL,35233,(205) 326-8572,al/birmingham/300-20th-st-s,33.509721495414745,-86.80275567068401,Alabama
1,2894.0,3220 Morrow Rd,Birmingham,AL,35235,(205) 655-3734,al/birmingham/3220-morrow-rd,33.59558141391436,-86.64743684970284,Alabama
2,1131.0,4719 Highway 280,Birmingham,AL,35242,(205) 991-4846,al/birmingham/4719-highway-280,33.42258214624579,-86.69827946502971,Alabama
3,,5342 Highway 280,Birmingham,AL,35242,(659) 272-0321,al/birmingham/5342-highway-280,33.4190918,-86.6744839,Alabama
4,3605.0,1051 Amber Dr,Hoover,AL,35244,(205) 203-1028,al/hoover/1051-amber-dr,33.3785141,-86.82327269999999,Alabama


In [27]:
# Number of scraped restaurants.
df.shape[0]

3385

#### Create a mapping of state abbreviations to full state names using the us library

In [30]:
# By default us.states.STATES covers only the 50 states, so the District of
# Columbia is added explicitly; otherwise any "DC" rows would map to NaN.
state_mapping = {
    region.abbr: region.name for region in [*us.states.STATES, us.states.DC]
}

#### New column of full state names based on abbreviations

In [31]:
# Look up the full name for each abbreviation; unmapped abbreviations become NaN.
full_state_names = df["state"].map(state_mapping)
df["state_name"] = full_state_names

In [32]:
# Preview the first five rows, now with the full state name.
df.iloc[:5]

Unnamed: 0,store_id,street,city,state,zip,phone,url,latitude,longitude,state_name
0,2642.0,300 20th St S,Birmingham,AL,35233,(205) 326-8572,al/birmingham/300-20th-st-s,33.509721495414745,-86.80275567068401,Alabama
1,2894.0,3220 Morrow Rd,Birmingham,AL,35235,(205) 655-3734,al/birmingham/3220-morrow-rd,33.59558141391436,-86.64743684970284,Alabama
2,1131.0,4719 Highway 280,Birmingham,AL,35242,(205) 991-4846,al/birmingham/4719-highway-280,33.42258214624579,-86.69827946502971,Alabama
3,,5342 Highway 280,Birmingham,AL,35242,(659) 272-0321,al/birmingham/5342-highway-280,33.4190918,-86.6744839,Alabama
4,3605.0,1051 Amber Dr,Hoover,AL,35244,(205) 203-1028,al/hoover/1051-amber-dr,33.3785141,-86.82327269999999,Alabama


---

## Geography

#### Make it a geodataframe

In [33]:
# Work on an independent copy so the geometry column is not added to `df`.
df_geo = df.copy(deep=True)

In [34]:
# Build point geometries (x = longitude, y = latitude) and attach them.
location_points = gpd.points_from_xy(df_geo["longitude"], df_geo["latitude"])
gdf = gpd.GeoDataFrame(df_geo, geometry=location_points)

In [35]:
# Declare the coordinates as EPSG:4326 (WGS84 lon/lat) and keep a separate copy.
gdf_wgs84 = gdf.set_crs("EPSG:4326")
locations_gdf = gdf_wgs84.copy()

---

## Maps

#### US states background

In [36]:
# Light-grey state outlines from the vega_datasets US topology, in the
# Albers USA projection, used as the base layer for both maps.
us_states_topo = alt.topo_feature(data.us_10m.url, feature="states")
background = (
    alt.Chart(us_states_topo)
    .mark_geoshape(fill="#e9e9e9", stroke="white")
    .project("albersUsa")
    .properties(width=800, height=500, title=f"{place_formal} locations")
)

#### Location points map

In [37]:
# One small dot per restaurant, positioned by its longitude/latitude.
location_dots = alt.Chart(gdf).mark_circle(size=10, color=color)
points = location_dots.encode(longitude="longitude:Q", latitude="latitude:Q")

# Layer the dots over the state outlines and drop the view border.
point_map = alt.layer(background, points)
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [38]:
# Collapse the restaurants to one row per state: the mean coordinates place
# the symbol and the row count drives its size.
state_totals = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)
symbols = (
    state_totals.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer the sized circles over the state outlines and drop the view border.
symbol_map = alt.layer(background, symbols)
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [39]:
from pathlib import Path

# Create the export directory first so a fresh checkout does not fail on write.
Path("data/processed").mkdir(parents=True, exist_ok=True)

# One JSON object per restaurant, pretty-printed.
df.to_json(
    f"data/processed/{place.lower().replace(' ', '_')}_locations.json",
    indent=4,
    orient="records",
)

#### CSV

In [40]:
# Write the flat table without the pandas index column.
csv_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [41]:
# Write the CRS-tagged point layer as GeoJSON.
geojson_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
locations_gdf.to_file(geojson_path, driver="GeoJSON")