# Get P. Terry's locations

#### Load Python tools and Jupyter config

In [154]:
%load_ext lab_black

In [1]:
import re
import json
import requests
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange

In [2]:
# Widen pandas' display limits so scraped rows and long text cells are shown
# in full instead of being truncated in notebook output.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", None)

## Read data

#### All the links from the locations page

In [3]:
# Download the locations landing page. The timeout keeps a stalled
# connection from hanging the notebook indefinitely.
link_response = requests.get("https://pterrys.com/locations", timeout=30)

# Stop here on an HTTP error; otherwise an error page would be parsed and
# silently produce an empty list of links.
link_response.raise_for_status()

# Collect every anchor whose href contains "locations".
link_html = BeautifulSoup(link_response.text, "html.parser")
links = link_html.select("a[href*=locations]")

In [4]:
from urllib.parse import urljoin

# Resolve each href against the site root. urljoin leaves absolute URLs
# untouched and inserts the missing "/" for relative paths, whereas plain
# string concatenation would produce a malformed URL in both of those cases.
links_list = [urljoin("https://pterrys.com", link["href"]) for link in links]

#### Remove duplicates and select only restaurants (not the location landing pages for Austin and San Antonio)

In [5]:
# De-duplicate, then sort: a bare set has no stable iteration order, and the
# cells below slice this list by position, so an unsorted list would make them
# scrape different pages from one kernel run to the next.
unique_urls = sorted(set(links_list))

# Keep only URLs containing a digit; the city landing pages have none.
filtered_urls = [url for url in unique_urls if any(char.isdigit() for char in url)]

#### Get details from each page

In [90]:
# Scrape name, address and coordinates from a sample of location pages and
# collect one dict per page in locations_list.
locations_list = []

for u in filtered_urls[1:3]:
    location_response = requests.get(u, timeout=30)
    soup = BeautifulSoup(location_response.text, "html.parser")

    # Start every page with empty coordinates. Without this reset, a page with
    # no usable map link would raise NameError on the first iteration or
    # silently inherit the previous page's coordinates on later ones.
    latitude = None
    longitude = None

    # Find the address section
    address_section = soup.find("h2", string="Address")
    if address_section:
        # First bit.ly link that follows the "Address" heading
        google_maps_bitly_link = address_section.find_next(
            "a", href=re.compile(r"http://bit.ly/")
        )

        if google_maps_bitly_link:
            # Follow the redirection to get the actual Google Maps link
            response = requests.head(
                google_maps_bitly_link["href"], allow_redirects=True, timeout=30
            )
            actual_google_maps_link = response.url

            # Extract latitude and longitude from the "@<lat>,<lng>" part of
            # the expanded Google Maps link
            match = re.search(
                r"@([-+]?\d*\.\d+),([-+]?\d*\.\d+)", actual_google_maps_link
            )

            if match:
                latitude, longitude = map(float, match.groups())
                print("Latitude:", latitude)
                print("Longitude:", longitude)
            else:
                print("Latitude and Longitude not found in the Google Maps link.")
        else:
            print("Google Maps bit.ly link not found in the address section.")
    else:
        print("Address section not found in the HTML.")

    # Select the paragraphs once instead of re-running the selector per field.
    paragraphs = soup.select("div.itemInnerContent p")

    locations_dict = {
        "name": soup.find_all("h2", class_="contentTitle")[0].text,
        # The original literal listed "address" twice (paragraphs 0 and 1);
        # the later entry always won, so paragraph 1 is kept here.
        # NOTE(review): confirm paragraph 1 is the street line on every page.
        "address": paragraphs[1].text,
        "city_state_zip": paragraphs[2].text,
        # Phone is not captured yet.
        "latitude": latitude,
        "longitude": longitude,
    }

    locations_list.append(locations_dict)

Google Maps bit.ly link not found in the address section.
Latitude: 30.328228
Longitude: -97.727758


In [106]:


def extract_address_and_coords(address_section):
    """Pull address text out of one parsed page section.

    Two layouts are recognised:

    * The section contains a ``<ul class="unstyledList">`` with at least two
      ``<li>`` items: the first two items are stripped and joined with a space
      into ``"address"``; ``"city_state_zip"`` is ``None``.
    * The section has no such list: its HTML is matched against an
      ``Address</h2> ... <p>line one<br/>line two</p>`` pattern. Any text
      between the heading and the paragraph, plus line one, become
      ``"address"``; line two becomes ``"city_state_zip"``.

    Coordinates are not extracted here; ``"latitude"`` and ``"longitude"``
    are always ``None`` in the returned dict.

    Args:
        address_section: Parsed element offering ``find`` and a ``str()``
            rendering of its HTML (callers pass a BeautifulSoup tag).

    Returns:
        A dict with keys ``latitude``, ``longitude``, ``address`` and
        ``city_state_zip``, or ``None`` when neither layout matches.
    """
    bullet_list = address_section.find("ul", class_="unstyledList")

    if not bullet_list:
        # Paragraph layout: work on the raw HTML of the section.
        found = re.search(
            r'Address\s*<\/h2>([^<]*)<p>([^<]*)<br\s*\/>([^<]*)<\/p>',
            str(address_section),
        )
        if not found:
            return None

        lead_text, line_one, line_two = (part.strip() for part in found.groups())
        return {
            "latitude": None,
            "longitude": None,
            "address": f"{lead_text} {line_one}",
            "city_state_zip": line_two,
        }

    # List layout: only the first two items are used.
    items = bullet_list.find_all("li")
    if len(items) < 2:
        return None

    first_item, second_item = (item.text.strip() for item in items[:2])
    return {
        "latitude": None,
        "longitude": None,
        "address": f"{first_item} {second_item}",
        "city_state_zip": None,
    }

# Try the extractor on a small sample of location pages and print every
# section it manages to parse.
for u in filtered_urls[1:6]:
    location_response = requests.get(u)
    soup = BeautifulSoup(location_response.text, "html.parser")

    # Every "itemInnerContent" div is a candidate section.
    address_sections = soup.find_all("div", class_="itemInnerContent")

    for address_section in address_sections:
        extracted_info = extract_address_and_coords(address_section)

        # Skip sections the extractor could not parse.
        if not extracted_info:
            continue
        print(extracted_info)


{'latitude': None, 'longitude': None, 'address': 'Drive-Thru hours: Sun-Thurs: 7:00am-12:00am', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': '512-487-5103 ', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': '5900 N. Lamar Blvd. Austin, TX 78705', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': ' Mon-Thur: 7:00am-11:00pm', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': '512-814-0129 no call in orders please Got Feedback?Complete Our Survey or Contact Us', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': '(512) 256-5309no call in orders pleaseGot Feedback?Complete Our Survey or Contact Us (512) 256-5309', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': '3311 Ranch Road 620 S. Austin TX, 78738', 'city_state_zip': None}
{'latitude': None, 'longitude': None, 'address': ' Mon-Thurs: 7:00am-10:00pm', 'city_state_zip': None}
{'latitude': None, 

In [157]:
# Fetch and parse one known location page for manual inspection.
page_url = "https://pterrys.com/locations/austin/10-koenig-n-lamar"
response = requests.get(page_url)
soup = BeautifulSoup(response.text, features="html.parser")

'5900 N. Lamar Blvd.|Austin, TX 78752|<a href="http://bit.ly/2gemV9e'

---

## Exports

#### CSV

In [15]:
# Write the locations table to CSV; index=False leaves the row index out.
# NOTE(review): `df` is not defined in the visible cells — presumably a
# DataFrame built from locations_list; confirm before running.
df.to_csv("data/processed/p_terrys_locations.csv", index=False)

#### JSON

In [16]:
# Write the same table as indented JSON using the "records" orientation.
# NOTE(review): `df` is not defined in the visible cells — confirm where it
# is created.
df.to_json("data/processed/p_terrys_locations.json", indent=4, orient="records")

#### GeoJSON

In [17]:
# Write the geographic version of the data as GeoJSON.
# NOTE(review): `gdf` is not defined in the visible cells — presumably a
# GeoDataFrame built from the latitude/longitude columns; confirm.
gdf.to_file("data/processed/p_terrys_locations.geojson", driver="GeoJSON")