The bike supply chain has taken a major hit over the past two years: demand has been high and supply low, so finding a bike in stock has been challenging.

This script removes some of that headache by scanning the web pages of one bike manufacturer (Canyon) for specific models in stock and sending an alert via email. The specific bikes we're looking for are the Canyon Endurace CF SL in size small, or any small road bike in the Outlet (previous years' models).

In [12]:
import pandas as pd
import numpy as np
import requests
import re
from datetime import datetime
import smtplib, ssl
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import os 
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient


### Find all relevant URLs to parse 
1. Search for CF SL bikes, which come in multiple configurations, each on a separate HTML page
2. Search for any road bike on the outlet page

In [14]:
# Listing page for road bikes in the Canyon US outlet (previous years' models).
outlet_page_root_url = 'https://www.canyon.com/en-us/outlet-bikes/road-bikes/'
# Overview page for the Endurace CF SL range; the fragment points at its product section.
endurance_cf_sl_root_url = 'https://www.canyon.com/en-us/road-bikes/endurance-bikes/endurace/cf-sl/#sections-products'

In [15]:
def get_links_on_page(root_url, timeout=30):
    """Return every link target referenced by an ``<a href>`` tag on a page.

    Args:
        root_url: URL of the page to download and scan.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking indefinitely.

    Returns:
        A set of URL strings. Relative hrefs are resolved against
        ``root_url``; hrefs that are already absolute are kept unchanged.
    """
    # Imported locally so the function does not depend on an extra
    # module-level import being present.
    from urllib.parse import urljoin

    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    page = requests.get(root_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    links = set()
    for a in soup.find_all('a'):
        link = a.get('href')
        # Anchors without an href attribute yield None and are ignored.
        if link is not None:
            # urljoin only prepends the site for relative hrefs, so links to
            # other hosts are no longer glued behind the Canyon domain.
            links.add(urljoin(root_url, link))
    return links


In [16]:
# Collect the product page of every Endurace CF SL variant: each variant has
# its own page, and all of them are linked from the range overview page.
urls = []
all_cf_sl_links = get_links_on_page(endurance_cf_sl_root_url)
urls.extend(link for link in all_cf_sl_links if 'endurace-cf-sl' in link)


In [17]:
# Add the outlet product pages (previous years' models we still care about).
# Only links below the outlet road-bike path that mention "html" are kept.
all_outlet_links = get_links_on_page(outlet_page_root_url)
urls += [
    link
    for link in all_outlet_links
    if '/en-us/outlet-bikes/road-bikes/' in link and 'html' in link
]

In [18]:
# Notebook display of the collected product URLs.
urls

['https://www.canyon.com/en-us/road-bikes/endurance-bikes/endurace/cf-sl/endurace-cf-sl-8-disc/2963.html?dwvar_2963_pv_rahmenfarbe=GY%2FBK',
 'https://www.canyon.com/en-us/road-bikes/endurance-bikes/endurace/cf-sl/endurace-cf-sl-8-disc-di2/2964.html?dwvar_2964_pv_rahmenfarbe=GY%2FBK',
 'https://www.canyon.com/en-us/road-bikes/endurance-bikes/endurace/cf-sl/endurace-cf-sl-8-disc-etap/3368.html?dwvar_3368_pv_rahmenfarbe=BU%2FBK',
 'https://www.canyon.com/en-us/road-bikes/endurance-bikes/endurace/cf-sl/endurace-cf-sl-7-disc/2962.html?dwvar_2962_pv_rahmenfarbe=YE%2FBK',
 'https://www.canyon.com/en-us/outlet-bikes/road-bikes/endurace-cf-sl-disc-8.0-sl/50005034.html',
 'https://www.canyon.com/en-us/outlet-bikes/road-bikes/ultimate-cf-evo-frameset-electrical/1190.html?dwvar_1190_pv_rahmenfarbe=BK%2FBN',
 'https://www.canyon.com/en-us/outlet-bikes/road-bikes/canyon-ultimate-cf-evo-frameset-electric/2174.html?dwvar_2174_pv_rahmenfarbe=BK%2FBN',
 'https://www.canyon.com/en-us/outlet-bikes/road-b

# Parsing page

### Bike sizing
- Bike sizes are given in the <i>productConfiguration__variantType js-productConfigurationVariantType</i> div class
- Sizes can be ['2XS', 'XS', 'S', 'M', 'L', 'XL', '2XL'] 


### Bike availability

- This is indicated by the <i>productConfiguration__availabilityMessage</i> div class 

### A bike is available if: 

-  "Low Stock" is displayed 
- "Only N left in stock" is displayed

### A bike is soon to be available if:
- "Coming soon" is displayed with the dates

### A bike is not available if:
- "Sold out" is displayed

In [19]:
def get_page_contents(url, timeout=30):
    """Download a page and return its HTML flattened onto a single line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking indefinitely.

    Returns:
        The response body as a string in which both real newlines and
        literal backslash-n sequences have been replaced by a space.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    contents = requests.get(url, headers=headers, timeout=timeout).text
    # First the two-character sequence "\n" (backslash + n), then actual
    # line breaks, so later substring searches are not split across lines.
    return contents.replace("\\n", " ").replace('\n', " ")

In [20]:
def get_bike_model_name(url):
    """Locate the model slug in a Canyon product URL.

    The slug is the single path segment that directly follows either
    ``/outlet-bikes/road-bikes/`` or ``/endurace/cf-sl/``; the outlet form is
    tried first.

    Args:
        url: Product page URL.

    Returns:
        The ``re.Match`` whose ``group(1)`` is the model slug, or ``None``
        when the URL has neither form.
    """
    for section in (r'/outlet-bikes/road-bikes/', r'/endurace/cf-sl/'):
        # [^/]+ keeps the capture inside one path segment; a greedy ".*"
        # would run on to the last "/" anywhere in the URL (e.g. one inside
        # the query string).
        found = re.search(section + r'([^/]+)/', url)
        if found:
            return found
    return None

In [21]:
def get_size_info(content, size_html_pattern1, size_html_pattern2, size_options):
    """Extract the size label that follows each size marker in the page.

    Args:
        content: Page HTML as a string.
        size_html_pattern1: Regular expression (normally a literal opening
            tag) that immediately precedes a size label.
        size_html_pattern2: Alternative marker, handled the same way.
        size_options: Known size labels; a candidate is accepted when it
            contains at least one of them.

    Returns:
        The accepted labels in page order, with all whitespace removed.
        Rejected candidates are reported with a printed "invalid size" and
        left out of the result.
    """
    # Record where each marker starts and where the text after it begins.
    # Using the match end (not start + len(pattern)) stays correct when a
    # pattern contains regex syntax, and avoids a list lookup per marker.
    marker_spans = []
    for pattern in (size_html_pattern1, size_html_pattern2):
        marker_spans.extend(m.span() for m in re.finditer(pattern, content))
    marker_spans.sort()

    output_sizes = []
    for _, text_start in marker_spans:
        window = content[text_start:text_start + 100]
        # Keep only what sits before the closing tag so that markup which
        # follows the label inside the window is not appended to it.
        label_text = window.split("</div>", 1)[0]
        # split()/join drops spaces, tabs and any other whitespace.
        size_string = "".join(label_text.split())
        if any(size_option in size_string for size_option in size_options):
            output_sizes.append(size_string)
        else:
            print("invalid size")

    return output_sizes

In [22]:
def get_availability_info(content, stock_string_pattern):
    """Classify the stock status announced after each availability marker.

    Args:
        content: Page HTML as a string.
        stock_string_pattern: Regular expression (normally a literal
            opening-tag prefix) that marks the start of an availability
            message.

    Returns:
        One entry per marker, in page order: "Coming Soon", "Sold Out",
        "In-Stock", or "" when none of the known phrases is found.
    """
    markers = list(re.finditer(stock_string_pattern, content))
    output_availability = []
    for position, marker in enumerate(markers):
        text_start = marker.end()
        text_end = text_start + 1000
        # Stop at the next marker: text beyond it belongs to another
        # variant and must not decide the status of this one.
        if position + 1 < len(markers):
            text_end = min(text_end, markers[position + 1].start())

        # Collapse whitespace runs to one space so phrases such as
        # "low  stock" still match instead of being glued into "lowstock".
        stock_string = " ".join(content[text_start:text_end].lower().split())

        if "coming soon" in stock_string:
            stock_status = "Coming Soon"
        elif "sold out" in stock_string:
            stock_status = "Sold Out"
        elif any(
            phrase in stock_string
            for phrase in ("low stock", "left in stock", "in-stock", "saleprice")
        ):
            stock_status = "In-Stock"
        else:
            stock_status = ""

        output_availability.append(stock_status)

    return output_availability

In [23]:
def make_dataframe(model, bike_sizes, bike_availabilities, url):
    """Build the per-model availability table, one row per bike size.

    Args:
        model: Model name, repeated on every row.
        bike_sizes: Size labels; their count fixes the number of rows.
        bike_availabilities: Status strings, positionally paired with
            ``bike_sizes``.
        url: Product page URL, repeated on every row.

    Returns:
        A DataFrame with the columns model, bike_size, availability,
        expected_arrival (left unfilled), date_checked (time of the call),
        url and isCurrentlyAvailable (True for "Low Stock" or "In-Stock").
    """
    column_names = ['model', 'bike_size', 'availability', 'expected_arrival', 'date_checked', 'url']
    frame = pd.DataFrame(index=range(len(bike_sizes)), columns=column_names)

    frame['model'] = model
    frame['bike_size'] = bike_sizes
    frame['availability'] = bike_availabilities
    frame['date_checked'] = datetime.now()
    frame['isCurrentlyAvailable'] = False
    frame['url'] = url

    # Flip the flag only on rows whose status means the bike can be bought now.
    purchasable = frame['availability'].isin(['Low Stock', 'In-Stock'])
    frame.loc[purchasable, 'isCurrentlyAvailable'] = True
    return frame


In [24]:
# Opening tags that directly precede a size label; two variants of the class
# attribute are searched for. Both strings are passed to re.finditer.
size_string_pattern1 = '<div class="productConfiguration__variantType js-productConfigurationVariantType">'
size_string_pattern2 = '<div class="productConfiguration__variantType">' 
# Size labels that get_size_info accepts.
size_options = ['2XS', 'XS', 'S', 'M', 'L', 'XL', '2XL']
# Start of the availability message tag. The class attribute is left open here
# (presumably so extra classes after the name still match; verify against the page).
stock_string_pattern = '<div class="productConfiguration__availabilityMessage'


In [25]:
# Scrape every collected product page and stack the per-model tables.
# A page that cannot be interpreted is reported and skipped so that one bad
# page does not abort the whole scan.
all_outputs = []
for bike_url in urls:
    model_match = get_bike_model_name(bike_url)
    if model_match is None:
        # The URL passed the link filter but has no recognisable model segment.
        print("Skipping unrecognised URL: " + bike_url)
        continue
    model = model_match.group(1)
    print("Processing model: " + model)
    contents = get_page_contents(bike_url)
    output_sizes = get_size_info(contents, size_string_pattern1, size_string_pattern2, size_options)
    output_availability = get_availability_info(contents, stock_string_pattern)
    if len(output_sizes) != len(output_availability):
        # Sizes and statuses are paired by position; with unequal counts
        # make_dataframe would raise, so this page is left out instead.
        print("Data not matching")
        continue

    each_output_df = make_dataframe(model, output_sizes, output_availability, bike_url)
    all_outputs.append(each_output_df)

if all_outputs:
    output_df = pd.concat(all_outputs)
else:
    # pd.concat raises on an empty list; fall back to an empty table with
    # the same columns so the cells that filter on them still work.
    output_df = pd.DataFrame(
        columns=['model', 'bike_size', 'availability', 'expected_arrival',
                 'date_checked', 'url', 'isCurrentlyAvailable']
    )
display(output_df)


Processing model: endurace-cf-sl-8-disc
Processing model: endurace-cf-sl-8-disc-di2
Processing model: endurace-cf-sl-8-disc-etap
Processing model: endurace-cf-sl-7-disc
Processing model: endurace-cf-sl-disc-8.0-sl
Processing model: ultimate-cf-evo-frameset-electrical
Processing model: canyon-ultimate-cf-evo-frameset-electric
Processing model: endurace-cf-slx-disc-8.0-etap
Processing model: endurace-cf-slx-disc-9.0-di2
Processing model: endurace-cf-slx-disc-8.0-etap
Processing model: frs-aro-cf-slx-el-19-xl-bk


Unnamed: 0,model,bike_size,availability,expected_arrival,date_checked,url,isCurrentlyAvailable
0,endurace-cf-sl-8-disc,2XS,Sold Out,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,False
1,endurace-cf-sl-8-disc,XS,Sold Out,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,False
2,endurace-cf-sl-8-disc,S,Sold Out,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,False
3,endurace-cf-sl-8-disc,M,Sold Out,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,False
4,endurace-cf-sl-8-disc,L,Sold Out,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,False
5,endurace-cf-sl-8-disc,XL,In-Stock,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,True
6,endurace-cf-sl-8-disc,2XL,In-Stock,,2022-03-03 22:14:56.637964,https://www.canyon.com/en-us/road-bikes/endura...,True
0,endurace-cf-sl-8-disc-di2,2XS,Sold Out,,2022-03-03 22:14:58.103443,https://www.canyon.com/en-us/road-bikes/endura...,False
1,endurace-cf-sl-8-disc-di2,XS,Sold Out,,2022-03-03 22:14:58.103443,https://www.canyon.com/en-us/road-bikes/endura...,False
2,endurace-cf-sl-8-disc-di2,S,Sold Out,,2022-03-03 22:14:58.103443,https://www.canyon.com/en-us/road-bikes/endura...,False


In [26]:
# Keep only the rows for the wanted frame size that can be bought right now;
# an alert email is warranted when at least one such row exists.
SIZE_WANTED = 'S'
size_matches = output_df['bike_size'].eq(SIZE_WANTED)
purchasable_now = output_df['isCurrentlyAvailable'].eq(True)
in_stock_models = output_df[size_matches & purchasable_now]
send_email = len(in_stock_models.index) > 0
send_email

False

In [35]:

# Read the mail credentials from Azure Key Vault so that no secret value is
# written in the notebook itself.
# NOTE(review): avoid ending this cell with an expression that evaluates to a
# secret, since the notebook would echo it in the cell output.
keyVaultName = "bikealertkeyvault"
KVUri = f"https://{keyVaultName}.vault.azure.net"

# Names of the vault entries (not the secret values).
email_password_secret_name = "emailSenderPassword"
sender_email_address_secret_name = 'SenderEmailAddress'
# TODO: change this later to Henry's email, and add it as a secret to the vault.
receiver_email_address_secret_name = 'ScottsEmailAddress'

credential = DefaultAzureCredential()
client = SecretClient(vault_url=KVUri, credential=credential)

# Fetched in this order: sender password, sender address, receiver address.
sender_email_password, sender_email_address, receiver_email_address = (
    client.get_secret(secret_name).value
    for secret_name in (
        email_password_secret_name,
        sender_email_address_secret_name,
        receiver_email_address_secret_name,
    )
)

'bikeTester999'

### Send email report

In [37]:

# Build the alert email: a multipart message whose single part is an HTML
# rendering of the in-stock table.
msg = MIMEMultipart()
msg['Subject'] = "Canyon Bike Alert"
msg['From'] = sender_email_address
# Set the recipient header as well; without it the delivered message shows
# no addressee even though the SMTP envelope carries one.
msg['To'] = receiver_email_address


html = """\
<html>
  <head></head>
  <body>
    {0}
  </body>
</html>
""".format(in_stock_models.to_html())

part1 = MIMEText(html, 'html')
msg.attach(part1)

In [39]:
# Deliver the alert only when a wanted bike was found; otherwise just report.
if not send_email:
    print("Size not available")
else:
    # Port used for SMTP over SSL.
    port = 465
    # Default context: certificate validation with the system's trusted CAs.
    context = ssl.create_default_context()

    with smtplib.SMTP_SSL("smtp.gmail.com", port, context=context) as smtp_session:
        smtp_session.login(sender_email_address, sender_email_password)
        smtp_session.sendmail(
            sender_email_address,
            receiver_email_address,
            msg.as_string(),
        )


In [None]:
###