# Lab 07 Web Scraping
## Author: Zixin Feng, PhD student in UBDC

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import ast
from pandas import json_normalize
from tqdm import tqdm
import folium

## Starbucks store locator

- Website: https://www.starbucks.co.uk/store-locator?types=starbucks&latLng=55.8625388%2C-4.284226000000002&zoom=12

- Investigate the website and the feature of the URLs.
- Use `requests` to get the latitudes, longitudes, addresses, unique IDs, store names, and opening hours of all the Starbucks stores, and collect the scraped data in a DataFrame. 
- Visualise the stores on a map. 

The boundaries to scrape:

- north_bound = 60.8590 N
- south_bound = 54.6356 N
- west_bound = -7.385 (i.e. 7.385 W)
- east_bound = 1.7834 E

Since we need to scrape data from different parts of the map, and each map view's URL is characterised by a unique centroid coordinate, we first need to generate a list of centroid coordinates covering the large area before we can build the URLs. 

In [2]:
# Bounding box of the area to scrape, in decimal degrees
north_bound = 60.8590
south_bound = 54.6356
west_bound = -7.385
east_bound = 1.7834

# Number of grid cells along each axis
divisions_lat = 40
divisions_lon = 10

# Height and width of a single grid cell, in degrees
step_lat = (north_bound - south_bound) / divisions_lat
step_lon = (east_bound - west_bound) / divisions_lon

# Centroid of every grid cell as a (latitude, longitude) tuple. Cells are
# visited row by row from south to north and, within a row, from west to east;
# the "+ 0.5" moves from the cell's south-west corner to its centre.
centroids = [
    (south_bound + step_lat * (row + 0.5), west_bound + step_lon * (col + 0.5))
    for row in range(divisions_lat)
    for col in range(divisions_lon)
]

len(centroids), centroids[:5]

(400,
 [(54.7133925, -6.9265799999999995),
  (54.7133925, -6.00974),
  (54.7133925, -5.0929),
  (54.7133925, -4.17606),
  (54.7133925, -3.25922)])

In [3]:
# One flat record per store returned by the API. Neighbouring centroids can
# return the same store, so this list may contain duplicates; no
# de-duplication is performed in this cell.
stores_data = []

# The store-locator API takes the search centre and radius as query parameters.
# The three fragments below are joined around the latitude and longitude values.
base_url1 = 'https://www.starbucks.co.uk/api/v2/stores/?filter%5Bcoordinates%5D%5Blatitude%5D='
base_url2 = '&filter%5Bcoordinates%5D%5Blongitude%5D='
base_url3 = '&filter%5Bradius%5D=46.6'

# Seconds to wait for the server before giving up on a request. Without a
# timeout a single stalled connection would block the whole scrape forever.
request_timeout = 30

# A single Session reuses the underlying connection instead of opening a new
# one for every centroid.
with requests.Session() as session:
    for centroid in tqdm(centroids):
        # Format the request URL with the current centroid (latitude, longitude)
        url = f"{base_url1}{centroid[0]}{base_url2}{centroid[1]}{base_url3}"
        response = session.get(url, timeout=request_timeout)
        # Abort on any HTTP error status rather than silently skipping an area
        response.raise_for_status()
        data = response.json()

        # A missing or empty 'data' entry means no stores for this centroid
        for store in data.get('data') or []:
            attributes = store['attributes']
            address = attributes['address']
            coordinates = attributes['coordinates']
            stores_data.append({
                'storeNumber': attributes['storeNumber'],
                'name': attributes['name'],
                'streetAddress': address['streetAddressLine1'],
                'city': address['city'],
                'postalCode': address['postalCode'],
                'latitude': coordinates['latitude'],
                'longitude': coordinates['longitude'],
                'isOpen': attributes['isOpen'],
                # Keep only the feature names, as a list per store
                'features': [feature['name'] for feature in attributes['features']],
            })

stores_df = pd.DataFrame(stores_data)

100%|██████████| 400/400 [04:55<00:00,  1.35it/s]


In [4]:
# Build the frame from the raw scraped records
stores_df = pd.DataFrame(stores_data)

# The same store can be returned for several centroids, so keep only the first
# row seen for each 'storeNumber'.
stores_df_unique = stores_df.drop_duplicates(subset='storeNumber', keep='first')

# Number of distinct stores found
print(stores_df_unique.shape[0])

stores_df_unique

214


Unnamed: 0,storeNumber,name,streetAddress,city,postalCode,latitude,longitude,isOpen,features
0,81125-310421,Magherafelt - Meadowlane SC,Unit 10 Meadowlane Shopping Centre,Magherafelt,BT456PR,54.75457,-6.61099,True,[Mobile Order and Pay]
1,23407-230067,Londonderry Crescent Link (7B),Unit 7B Crescent Link Retail Park,Londonderry,BT47 6SA,54.99889,-7.27455,True,"[Wireless Hot-Spot, Redeem Rewards, Mobile Ord..."
2,23413-230089,Londonderry Foyleside SC (Unit,Londenderry Foyleside,Londonderry,BT48 6XY,54.99450,-7.31838,True,"[Wireless Hot-Spot, Redeem Rewards, Mobile Ord..."
3,58739-292992,Richmond,Ferryquay Street Derry,Antrim,BT48 6QP,54.99600,-7.32066,True,[Mobile Order and Pay]
4,23410-236917,Antrim Junction 1,Unit 50 Junction One Retail Centre,Antrim,BT41 4JG,54.73045,-6.23192,True,"[Wireless Hot-Spot, Redeem Rewards, Mobile Ord..."
...,...,...,...,...,...,...,...,...,...
2392,54829-288559,Inverness - Rose St,Rose Street,Inverness,IV1 1NQ,57.48142,-4.22664,True,"[Wireless Hot-Spot, Redeem Rewards, Oven - War..."
2393,70443-305353,Inverness-New Look-Inverness R,Unit 1B West Seafield,Inverness,IV2 7GD,57.48248,-4.17646,True,"[Wireless Hot-Spot, Redeem Rewards, Oven - War..."
2432,56614-290497,Peterhead Buchan Way DT,Buchan Way,Peterhead,AB42 3GT,57.48563,-1.80407,True,"[Drive-Thru Ordering, Mobile Order and Pay]"
2465,57268-291315,Elgin - A96 DT,A96,Elgin,IV30 8QN,57.64026,-3.26137,True,"[Wireless Hot-Spot, Redeem Rewards, Drive-Thru..."


In [5]:
# Base map with a hard-coded starting centre and zoom level
m = folium.Map(location=[55, -2], zoom_start=6)

# Walk the three columns needed for plotting in parallel
store_columns = zip(
    stores_df_unique['latitude'],
    stores_df_unique['longitude'],
    stores_df_unique['name'],
)

# One filled green circle per unique store, with the store name as its popup
for lat, lng, popup_text in store_columns:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=1,
        popup=folium.Popup(popup_text, parse_html=True),
    )
    marker.add_to(m)

# Display the map as the cell output
m