In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time


In [3]:
def get_html_content(url, driver=None, interval=3):
    """Load ``url`` in a Selenium driver and return the rendered page source.

    Args:
        url: Address passed to ``driver.get``.
        driver: Optional existing WebDriver to reuse. When ``None``, a
            temporary headless Chrome driver is created through
            ``get_seleium_driver()`` and quit before this function returns.
        interval: Extra seconds to sleep on top of the fixed 3-second wait;
            a falsy value skips the extra sleep.

    Returns:
        The value of ``driver.page_source`` read after the waits.
    """
    # Only a driver created here is ours to close; a caller-supplied driver is
    # left open so the caller can keep using it for further pages.
    init_own_driver = driver is None
    if init_own_driver:
        # Use the shared factory rather than duplicating the Chrome setup.
        driver = get_seleium_driver()

    try:
        # Open the URL
        driver.get(url)

        # Fixed settle time, then the caller-tunable extra wait
        time.sleep(3)  # WebDriverWait could replace this for more complex scenarios
        if interval:
            time.sleep(interval)

        # Page source (HTML content) as rendered at this point
        return driver.page_source
    finally:
        if init_own_driver:
            # Close the temporary WebDriver even if loading the page failed
            driver.quit()
        

def get_seleium_driver():
    """Create and return a new headless Chrome WebDriver.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it. No explicit chromedriver path is passed to ``Service()``.
    """
    headless_options = Options()
    # No browser UI
    headless_options.add_argument("--headless")
    return webdriver.Chrome(service=Service(), options=headless_options)

In [4]:
# Fetch the rendered HTML of the docs landing page. No driver is passed, so
# get_html_content uses a temporary driver that is quit before it returns.
url = 'https://developer.webex.com/docs'
html_content = get_html_content(url)

In [5]:
from askharrison.crawl.hrefCrawler import HrefCrawler

# Instantiate the crawler; it is used below to extract hrefs from page HTML
crawler = HrefCrawler()

In [6]:
# Collect the hrefs found in the landing page HTML.
# NOTE(review): _extract_hrefs is underscore-prefixed (private by convention);
# presumably base_url is used to resolve relative links — confirm in HrefCrawler.
href_urls = crawler._extract_hrefs(html_content, base_url='https://developer.webex.com/docs')
len(href_urls)

233

In [7]:
from urllib.parse import urlparse


def _is_webex_developer_url(url, domain='developer.webex.com'):
    """Return True when the hostname of ``url`` is ``domain`` or a subdomain of it."""
    try:
        host = urlparse(url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed URLs; treat those as foreign links
        return False
    return host == domain or host.endswith('.' + domain)


# remove any non developer.webex.com links
# The hostname is compared rather than testing for a substring, because a
# substring test also keeps foreign links that merely mention the domain in
# their path or query string.
href_urls = [url for url in href_urls if _is_webex_developer_url(url)]
len(href_urls)

200

In [8]:
# One long-lived headless driver, passed to get_html_content by the crawl loops
# below so the same browser is reused for every page.
# NOTE(review): no quit() call for this driver is visible in this notebook — confirm it gets closed.
seleium_driver = get_seleium_driver()

In [9]:
from tqdm import tqdm

# Crawl every docs URL with the shared driver. Pages that raise are recorded
# in error_urls and the loop carries on with the next URL.
href_content_dict, error_urls = {}, []

for href_url in tqdm(href_urls):
    try:
        href_content_dict[href_url] = get_html_content(href_url, seleium_driver, interval=3)
    except Exception as e:
        error_urls.append(href_url)
        print(f'Error: {e}')

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [28:30<00:00,  8.55s/it] 


In [10]:
# Number of pages fetched vs. number of URLs that raised during the crawl
len(href_content_dict), len(error_urls)

(200, 0)

In [11]:
# Spot-check the raw HTML of one crawled page (index 100 needs at least 101 entries)
list(href_content_dict.values())[100]



In [12]:
from askharrison.crawl.html_to_text import html_to_text
from bs4 import BeautifulSoup

In [13]:
def remove_elements_with_class(html_content, class_name):
    """Parse ``html_content`` and strip every element matching ``class_name``.

    Args:
        html_content: HTML markup, parsed with the ``'html.parser'`` backend.
        class_name: Value passed to ``find_all(class_=...)``.
            NOTE(review): per the Beautiful Soup docs, a string containing
            spaces is matched against the exact class attribute value —
            confirm that is the intent for multi-class arguments.

    Returns:
        The BeautifulSoup object (not a string) with the matched elements
        decomposed; call ``str()`` on it to get markup back.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    for node in document.find_all(class_=class_name):
        node.decompose()
    return document

In [14]:
# Convert every crawled docs page to text with the side navigation removed,
# and collect the API-reference links found in each page body.
href_markdown_content_dict = {}
not_found_urls = []  # never appended to in this cell; kept because a later cell reports its length
api_href_urls = []

for href_url, href_content in tqdm(href_content_dict.items()):
    # Strip the side navigation first so its links and text are ignored.
    soup_without_sidebar = remove_elements_with_class(href_content, 'side-nav__wrapper side-nav-container')
    # Serialise once and reuse for both link extraction and text conversion.
    body_html = str(soup_without_sidebar)

    found_href = crawler._extract_hrefs(body_html, base_url='https://developer.webex.com/docs')
    # find only href with https://developer.webex.com/docs/api
    # (substring match: any URL containing this prefix is kept)
    found_href = [url for url in found_href if 'https://developer.webex.com/docs/api' in url]
    api_href_urls += found_href

    href_markdown_content_dict[href_url] = html_to_text(body_html)


100%|██████████| 200/200 [00:41<00:00,  4.79it/s]


In [19]:
# API links found in page bodies that were not part of the first crawl list
api_urls = set(api_href_urls) - set(href_urls)
len(api_urls)

902

In [16]:
# Converted page count; not_found_urls is initialised empty and not appended to by the conversion cell
len(href_markdown_content_dict), len(not_found_urls)

(200, 0)

In [20]:
# Display the set of newly discovered API URLs
api_urls

{'https://developer.webex.com/docs/api-and-sdk-support-policy',
 'https://developer.webex.com/docs/api/basics',
 'https://developer.webex.com/docs/api/getting-started',
 'https://developer.webex.com/docs/api/guides/compliance',
 'https://developer.webex.com/docs/api/guides/xapi',
 'https://developer.webex.com/docs/api/v1//meetings/sessionTypes',
 'https://developer.webex.com/docs/api/v1/admin-audit-events/list-admin-audit-event-categories',
 'https://developer.webex.com/docs/api/v1/admin-audit-events/list-admin-audit-events',
 'https://developer.webex.com/docs/api/v1/attachment-actions/create-an-attachment-action',
 'https://developer.webex.com/docs/api/v1/attachment-actions/get-attachment-action-details',
 'https://developer.webex.com/docs/api/v1/authorizations/delete-authorization',
 'https://developer.webex.com/docs/api/v1/authorizations/delete-authorization-of-org-and-client-id',
 'https://developer.webex.com/docs/api/v1/authorizations/list-authorizations-for-a-user',
 'https://dev

In [22]:
import json
from pathlib import Path

# Create the output directory up front so the export cannot fail with
# FileNotFoundError after the long crawl has already completed.
output_dir = Path("../data/cisco_developer")
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / "href_markdown_content_dict.json", "w") as f:
    json.dump(href_markdown_content_dict, f)

# export the list of api urls
# (sorted, because api_urls is a set and its iteration order is not stable between runs)
with open(output_dir / "api_urls.json", "w") as f:
    json.dump(sorted(api_urls), f)

In [23]:
# get the contents of the api urls
# Crawl the API URLs with the shared driver. Pages that raise are recorded in
# error_urls (reset here) and the loop carries on with the next URL.
api_content_dict, error_urls = {}, []

for api_url in tqdm(api_urls):
    try:
        api_content_dict[api_url] = get_html_content(api_url, seleium_driver, interval=3)
    except Exception as e:
        error_urls.append(api_url)
        print(f'Error: {e}')

100%|██████████| 902/902 [2:09:59<00:00,  8.65s/it]  


In [24]:
# Convert every crawled API page to text with the side navigation removed.
api_markdown_content_dict = {}
not_found_urls = []  # never appended to in this cell; kept because a later cell reports its length
for api_url, api_content in tqdm(api_content_dict.items()):
    # Only the sidebar-free markup is converted; the page is parsed once,
    # inside remove_elements_with_class.
    soup_without_sidebar = remove_elements_with_class(api_content, 'side-nav__wrapper side-nav-container')
    api_markdown_content_dict[api_url] = html_to_text(str(soup_without_sidebar))

100%|██████████| 902/902 [03:43<00:00,  4.03it/s]


In [25]:
# Converted API page count; not_found_urls is initialised empty and not appended to by the conversion cell
len(api_markdown_content_dict), len(not_found_urls)

(902, 0)

In [26]:
# Take the first (url, text) pair as a sample to inspect
key, value = list(api_markdown_content_dict.items())[0]

In [27]:
# URL of the sampled page
key

'https://developer.webex.com/docs/api/v1/features-call-queue/create-a-selective-call-forwarding-rule-for-a-call-queue'

In [28]:
# Converted text of the sampled page
print(value)

[__](/)

[Documentation](/docs)[ Blog](/blog)[Support](/support)Resources

 __

[Log in](/login)[ Sign up](/signup)

 __

[Log in](/login)[ Sign up](/signup)

Documentation __

[Blog](/blog)[ Support](/support)

Resources __

Build

[Getting Started](/docs)[Platform Introduction](/docs/platform-introduction)

 __

Embedded Apps

[What's New](/docs/embedded-apps-whats-new)[Overview](/docs/embedded-
apps)[Developer Guide](/docs/embedded-apps-guide)[Sidebar API Quick
Start](/docs/embedded-apps-framework-sidebar-api-quick-start)[Submission
Checklist for Embedded Apps](/docs/app-hub-submission-checklist-for-embedded-
apps)

 __

Design Guidelines

[Messaging](/docs/embedded-apps-design-guidelines-for-
spaces)[Meetings](/docs/embedded-apps-design-guidelines-for-
meetings)[Devices](/docs/api/guides/embedded-apps-for-webex-devices-developer-
guidelines)[Sidebar](/docs/api/guides/embedded-apps-design-and-experience-
guidelines-for-sidebar)

[API Reference](/docs/embedded-apps-api-reference)

[B

In [29]:
# export the api markdown content
with open("../data/cisco_developer/api_markdown_content_dict.json", "w") as f:
    json.dump(api_markdown_content_dict, f)