### WEB SCRAPING ON  https://apidocs.document360.com/apidoc

- **Selenium** is used for content extraction as certain elements on the webpage are dynamic.

- The contents such as API endpoints, http request, code samples, page title are scraped and stored in a CSV file for subsequent implementation.


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import pandas as pd
import os


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

### SCRAPING DOCUMENT360 APIDOCS

All the pages of the website are scraped by extracting API endpoints and its respective code samples

In [4]:
def scrape(site):
    # storing all visited urls in a set to prevent duplicates
    visited = set()
    code_samples = []

    # creating a queue to keep track of urls to scrape
    queue = [site]

    while queue:
        current_url = queue.pop(0)
        if current_url in visited:
            continue

        visited.add(current_url)

        try:
            # making a request to the current url
            r = requests.get(current_url, timeout=5)
        except requests.exceptions.RequestException as e:
            # handling exceptions during request
            print(f"Error connecting to {current_url}: {e}")
            continue

        soup = BeautifulSoup(r.text, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            full_url = urljoin(site, href)
            # checking if the url belongs to the same domain
            if site in full_url and full_url not in visited:
                queue.append(full_url)
                driver.get(full_url)
                # Get the page source after JavaScript execution
                page_source = driver.page_source
                
                # Find elements with class names
                http_req = driver.find_elements(By.CLASS_NAME, "api-http-method")   # to extract http request type
                api_endpoint = driver.find_elements(By.CLASS_NAME, "api-url")  # to extract api endpoints
                code_mirror_lines = driver.find_elements(By.CLASS_NAME, "CodeMirror-line") # to extract code samples for each api endpoints
                title = driver.find_elements(By.CLASS_NAME, "content_block_article_head") # to extract the title of each page
                        
                code_content=''
                if code_mirror_lines:
                    # Loop through the elements and extract information
                    for i in range(len(code_mirror_lines)):
                        if i < len(http_req): 
                            http = http_req[i].text 
                        if i < len(api_endpoint):
                            api_ep = api_endpoint[i].text  
                        if i < len(title):
                            page_title = title[i].text
                          
                        code_content = code_content + code_mirror_lines[i].text +'\n'
                
                    code_samples.append({"url":full_url, "Title":page_title, "http_Request": http, "API_endpoint": api_ep, "Code": code_content})
                print(code_samples)
            print('--'*70)
                
    return code_samples
    



In [5]:
code_samples = scrape("https://apidocs.document360.com/apidocs/")

--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------
[]
----------

In [6]:
len(code_samples)

79

In [7]:
doc360_api_docs = pd.DataFrame(code_samples)
doc360_api_docs.head()

Unnamed: 0,url,Title,http_Request,API_endpoint,Code
0,https://apidocs.document360.com/apidocs/get-ar...,Gets an article,GET,/v2/Articles/{articleId}/{langCode},curl --request GET \\n --url 'https://apihub....
1,https://apidocs.document360.com/apidocs/gets-a...,Gets all version languages in the project,GET,/v2/Language/{projectVersionId},curl --request GET \\n --url https://apihub.d...
2,https://apidocs.document360.com/apidocs/get-st...,Get the status of import,GET,/v2/Project/Import/{importId},curl --request GET \\n --url https://apihub.d...
3,https://apidocs.document360.com/apidocs/import...,Import documentation,POST,/v2/Project/Import,"{\n ""source_documentation_url"": ""string"",\n ""p..."
4,https://apidocs.document360.com/apidocs/export...,Start a new export,POST,/v2/Project/Export,"\n{\n ""entity"": ""string"",\n ""version_id"": [\n ..."


In [8]:
#storing the scraped content in a csv file
doc360_api_docs.to_csv("./data/document360_api_docs.csv")