In [None]:
pip install requests beautifulsoup4

Next, we need to ensure the glossary has been uploaded as glossary.htm; its default location should be /content/... within Colab.

We had trouble parsing the live page with Beautiful Soup, so here we save the latest version as glossary.htm and upload it via the folder icon on the left, then the upload icon.

We'll work from a local copy of the glossary; we had issues parsing the glossary and generating a list of URLs via BeautifulSoup without a local copy.

In [23]:
import requests

# URL of the OWASP Glossary page
glossary_url = "https://cheatsheetseries.owasp.org/Glossary.html"

# Send a GET request to the URL. requests applies no timeout by default, so an
# explicit one (in seconds) keeps a stalled connection from hanging the cell.
response = requests.get(glossary_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Save the content of the page to a local file, written as UTF-8
    with open('/content/glossary.htm', 'w', encoding='utf-8') as file:
        file.write(response.text)
    print("Glossary page downloaded and saved as /content/glossary.htm")
else:
    # Any non-200 status is reported and nothing is written to disk
    print(f"Failed to download the glossary page. Status code: {response.status_code}")


Glossary page downloaded and saved as /content/glossary.htm


Scrapes the saved copy of the [OWASP Glossary page](https://cheatsheetseries.owasp.org/Glossary.html) to find all links to cheat sheets and writes them to a .txt file.

In [19]:
from bs4 import BeautifulSoup
from google.colab import files

def get_cheat_sheet_links(local_html_path):
    """Collect the first hyperlink of every paragraph in a saved HTML page.

    Args:
        local_html_path: Path of the HTML file to parse.

    Returns:
        A list of href strings in document order: for each <p> element, the
        href of its first <a> descendant. Paragraphs without a link, or whose
        first link has no href attribute, are skipped. If the file cannot be
        read or parsed, the error is printed and an empty list is returned.
    """
    try:
        # Read raw bytes and let BeautifulSoup detect the document encoding,
        # instead of decoding with whatever the platform default happens to be.
        with open(local_html_path, 'rb') as file:
            soup = BeautifulSoup(file, 'html.parser')

        links = []
        for p_tag in soup.find_all('p'):
            # Only the first anchor inside each paragraph is considered
            a_tag = p_tag.find('a')
            if a_tag and 'href' in a_tag.attrs:
                full_url = a_tag['href']
                print("Found URL:", full_url)  # Debugging print statement
                links.append(full_url)
        return links
    except Exception as e:
        # Best effort: report the problem and hand back an empty result
        print(f"Error reading local HTML file: {e}")
        return []

# Location of the saved glossary page (adjust this for environments other than Colab)
local_html_path = '/content/glossary.htm'
cheat_sheet_links = get_cheat_sheet_links(local_html_path)

if cheat_sheet_links:
    # Persist the links, one URL per line, inside the Colab filesystem
    file_path = '/content/cheat_sheet_urls.txt'
    with open(file_path, 'w') as url_file:
        url_file.writelines(link + '\n' for link in cheat_sheet_links)

    print("URL extraction completed. Data written to " + file_path)

    # Hand the file to the browser so it is downloaded to the local machine
    files.download(file_path)
else:
    # Nothing was extracted, so there is nothing to write or download
    print("No links were found in the local HTML file.")


Found URL: https://cheatsheetseries.owasp.org/Glossary.html#a
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Attack_Surface_Analysis_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/AJAX_Security_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Access_Control_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Testing_Automation_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Authentication_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Abuse_Case_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Bean_Validation_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Content_Security_Policy_Cheat_Sheet.html
Found URL: https://cheatsheetseries.owasp.org/cheatsheets/Clickjacking_Defense_Cheat_Sheet.html
F

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
import requests
from bs4 import BeautifulSoup
import json
from google.colab import files

def extract_cheat_sheet_details(url, timeout=30):
    """Download one cheat sheet page and flatten it into a concept/application record.

    Args:
        url: Address of the cheat sheet page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with two string entries:
          * "concept": stripped text of the first <h1>, or "Unknown Concept"
            when the page has none.
          * "application": every <h2>/<h3>/<h4> section rendered as
            "<Heading> body text " and concatenated in document order.
        If anything fails (network error, HTTP error status, parsing error) the
        error is printed and a record whose "concept" is "Error" is returned.
    """
    section_tags = ['h2', 'h3', 'h4']
    try:
        # requests has no default timeout; without one a stalled server blocks forever
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting the concept (look the heading up only once)
        title_tag = soup.find('h1')
        concept = title_tag.get_text().strip() if title_tag else "Unknown Concept"

        # Extracting and formatting application content
        sections = []
        for header in soup.find_all(section_tags):
            # A section's body is every sibling element up to the next heading
            body_parts = []
            for sibling in header.next_siblings:
                if sibling.name in section_tags:
                    break
                if sibling.name:  # nodes without a tag name are ignored
                    body_parts.append(sibling.get_text(separator=' ', strip=True))
            section_title = f"<{header.get_text().strip()}> "
            section_content = ' '.join(body_parts).strip()
            sections.append(section_title + section_content + " ")
        application = "".join(sections)

        return {
            "concept": concept,
            "application": application
        }
    except Exception as e:
        # Best effort: one bad page must not abort the whole scraping run
        print(f"Error processing {url}: {e}")
        return {
            "concept": "Error",
            "application": f"Error processing {url}: {str(e)}"
        }

# Load the previously extracted URL list (one URL per line) from the Colab filesystem
with open('/content/cheat_sheet_urls.txt', 'r') as url_file:
    urls = list(url_file)

# Keep only the entries that point at an individual cheat sheet page
valid_urls = [
    line.strip()
    for line in urls
    if line.startswith("https://cheatsheetseries.owasp.org/cheatsheets/")
]

# Scrape every cheat sheet in turn and collect the resulting records
cheat_sheet_details = []
for url in valid_urls:
    print(f"Processing: {url}")
    cheat_sheet_details.append(extract_cheat_sheet_details(url))

# Serialise the collected records as indented JSON in the Colab environment
with open('/content/cheat_sheet_details.json', 'w') as json_file:
    json_file.write(json.dumps(cheat_sheet_details, indent=4))

print("Scraping completed. Data written to /content/cheat_sheet_details.json")

# Trigger a browser download of the JSON file (Google Colab only)
files.download('/content/cheat_sheet_details.json')


Processing: https://cheatsheetseries.owasp.org/cheatsheets/Attack_Surface_Analysis_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/AJAX_Security_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Access_Control_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Testing_Automation_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authentication_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Abuse_Case_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Bean_Validation_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Content_Security_Policy_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Clickjacking_Defense_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheats

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
import requests
from bs4 import BeautifulSoup
import json
import re

def clean_text(text):
    """Delete scraping artefacts and typographic punctuation from a string.

    The characters removed are the pilcrow (U+00B6), cross mark (U+274C),
    en dash (U+2013), horizontal ellipsis (U+2026), no-break space (U+00A0)
    and the curly single/double quotes (U+2018, U+2019, U+201C, U+201D).

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every listed character deleted.
    """
    # Regular expression to match unwanted characters
    regex_pattern = r'[\u00b6\u274c\u2013\u2026\u00a0\u2018\u2019\u201c\u201d]'
    # Remove the matched characters
    return re.sub(regex_pattern, '', text)

def extract_cheat_sheet_details(url, timeout=30):
    """Download one cheat sheet page and flatten it into a cleaned concept/application record.

    Args:
        url: Address of the cheat sheet page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with two string entries, both with the pilcrow (U+00B6) and
        cross mark (U+274C) characters removed:
          * "concept": stripped text of the first <h1>, or "Unknown Concept"
            when the page has none.
          * "application": every <h2>/<h3>/<h4> section rendered as
            "<Heading> body text " and concatenated in document order.
        If anything fails (network error, HTTP error status, parsing error) the
        error is printed and a record whose "concept" is "Error" is returned.
    """
    section_tags = ['h2', 'h3', 'h4']
    unwanted_chars = r'[\u00b6\u274c]'
    try:
        # requests has no default timeout; without one a stalled server blocks forever
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting the concept (look the heading up only once)
        title_tag = soup.find('h1')
        concept = title_tag.get_text().strip() if title_tag else "Unknown Concept"
        concept = re.sub(unwanted_chars, '', concept)  # Remove specific unwanted characters

        # Extracting and formatting application content
        sections = []
        for header in soup.find_all(section_tags):
            # A section's body is every sibling element up to the next heading
            body_parts = []
            for sibling in header.next_siblings:
                if sibling.name in section_tags:
                    break
                if sibling.name:  # nodes without a tag name are ignored
                    body_parts.append(sibling.get_text(separator=' ', strip=True))
            section_title = f"<{header.get_text().strip()}> "
            section_content = ' '.join(body_parts).strip()
            sections.append(section_title + section_content + " ")

        # Remove specific unwanted characters from the assembled text
        application = re.sub(unwanted_chars, '', "".join(sections))

        return {
            "concept": concept,
            "application": application
        }
    except Exception as e:
        # Best effort: one bad page must not abort the whole scraping run
        print(f"Error processing {url}: {e}")
        return {
            "concept": "Error",
            "application": f"Error processing {url}: {str(e)}"
        }

# Pull in the saved URL list; each line of the file holds one URL
with open('/content/cheat_sheet_urls.txt', 'r') as source:
    urls = source.readlines()

# Discard anything that is not a link to an individual cheat sheet
valid_urls = []
for raw_line in urls:
    if raw_line.startswith("https://cheatsheetseries.owasp.org/cheatsheets/"):
        valid_urls.append(raw_line.strip())

# Visit each cheat sheet and accumulate one record per page
cheat_sheet_details = []
for url in valid_urls:
    print(f"Processing: {url}")
    record = extract_cheat_sheet_details(url)
    cheat_sheet_details.append(record)

# Store all records as indented JSON in the Colab environment
with open('/content/cheat_sheet_details.json', 'w') as sink:
    json.dump(cheat_sheet_details, sink, indent=4)

print("Scraping completed. Data written to /content/cheat_sheet_details.json")

# Start a browser download of the JSON file; `files` is the google.colab
# helper imported in an earlier cell
files.download('/content/cheat_sheet_details.json')


Processing: https://cheatsheetseries.owasp.org/cheatsheets/Attack_Surface_Analysis_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/AJAX_Security_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Access_Control_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authorization_Testing_Automation_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Authentication_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Abuse_Case_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Bean_Validation_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Content_Security_Policy_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheatsheets/Clickjacking_Defense_Cheat_Sheet.html
Processing: https://cheatsheetseries.owasp.org/cheats

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>