Import the packages

In [1]:
import pandas as pd
import sqlite3
import numpy as np
import json
import requests
import io
from bs4 import BeautifulSoup

In [2]:
with open("configuration.json") as config_file:
    config = json.load(config_file)

All the URLS for each of the port pages.

In [2]:
url_vbo = "https://helpcenter.veeam.com/docs/vbo365/guide/vbo_used_ports.html"
url_vbr = "https://helpcenter.veeam.com/docs/backup/vsphere/used_ports.html"
url_hyperv = "https://helpcenter.veeam.com/docs/backup/hyperv/used_ports.html"
url_ahv = "https://helpcenter.veeam.com/docs/vbahv/userguide/used_ports.html"
url_olvm_rhel = "https://helpcenter.veeam.com/docs/vbrhv/userguide/used_ports.html"
url_proxmox = "https://helpcenter.veeam.com/docs/vbproxmoxve/userguide/used_ports.html"
url_vcc = "https://helpcenter.veeam.com/docs/backup/cloud/ports.html"
url_vro = "https://helpcenter.veeam.com/docs/vro/userguide/ports.html"

url_vbaws = "https://helpcenter.veeam.com/docs/vbaws/guide/ports.html"
url_vbaz = "https://helpcenter.veeam.com/docs/vbazure/guide/ports.html"
url_gcp = "https://helpcenter.veeam.com/docs/vbgc/guide/ports.html"
url_windows = "https://helpcenter.veeam.com/docs/agentforwindows/userguide/ports.html"
url_linux = "https://helpcenter.veeam.com/docs/agentforlinux/userguide/used_ports.html"
url_agent_man = "https://helpcenter.veeam.com/docs/backup/agents/used_ports.html"
url_vone = "https://helpcenter.veeam.com/docs/one/deployment/ports.html"
url_vspc = "https://helpcenter.veeam.com/docs/vac/deployment/ports.html"

url_explore_ad = "https://helpcenter.veeam.com/docs/backup/explorers/vead_ports.html"
url_explore_sql = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vesql_used_ports.html"
)
url_explore_orcle = (
    "https://helpcenter.veeam.com/docs/backup/explorers/veo_used_ports.html"
)
url_explore_postgres = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vep_used_ports.html"
)
url_explore_saphana = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vemdb_used_ports.html"
)
url_explore_mongo = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vemdb_used_ports.html"
)

url_explore_exchange = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vex_ports.html"
)
url_explore_sharepoint = (
    "https://helpcenter.veeam.com/docs/backup/explorers/vesp_ports.html"
)

url_list = [
    url_vbo,
    url_vbr,
    url_vbaws,
    url_vbaz,
    url_gcp,
    url_windows,
    url_linux,
    url_agent_man,
    url_vspc,
    url_explore_ad,
    url_explore_sql,
    url_explore_orcle,
    url_explore_postgres,
    url_explore_saphana,
    url_explore_mongo,
    url_explore_exchange,
    url_explore_sharepoint,
]

len(url_list)

17

Create a function to handle the HTML to DataFrame conversion.

In [3]:
import numpy as np


def html_to_dataframe2(url, product):
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    # Extract tables
    tables = pd.read_html(io.StringIO(html.text))

    # Extract headings in document order
    all_headings = []
    current_main_heading = None

    # Find all heading elements in order of appearance
    if product != "VCC":
        for element in soup.body.find_all(["span"]):
            if "class" in element.attrs:
                element_classes = element.get("class", [])

                if (
                    "Subheading" in element_classes
                    and "Subheading_L2" not in element_classes
                ):
                    # Main heading found
                    current_main_heading = element.text.strip()
                    all_headings.append(
                        {
                            "text": current_main_heading,
                            "type": "main",
                            "position": len(all_headings),
                            "combined": current_main_heading,
                        }
                    )
                elif "Subheading_L2" in element_classes and current_main_heading:
                    # Subheading found
                    subheading_text = element.text.strip()
                    combined_text = f"{current_main_heading} - {subheading_text}"
                    all_headings.append(
                        {
                            "text": subheading_text,
                            "type": "sub",
                            "position": len(all_headings),
                            "combined": combined_text,
                        }
                    )

    # Process each table from the HTML
    processed_tables = []
    current_heading_idx = 0

    for table in tables:
        if product == "VONE":
            # For VONE, we need to handle the table differently
            # Check if the table has a 'From' and 'To' column
            table["Section"] = np.where(
                table["From"] == table["To"], table["From"], np.nan
            )
            table["Section"] = table["Section"].ffill()
        else:
            # Only process tables with more than 1 column (to avoid irrelevant tables)
            if len(table.columns) > 1:
                # Check if we have headings to use
                if len(all_headings) > 0 and current_heading_idx < len(all_headings):
                    heading_info = all_headings[current_heading_idx]
                    table["Section"] = heading_info["combined"]
                    current_heading_idx += 1
                else:
                    # If we don't have headings, try to extract sections from the table itself
                    # If there's a 'From' column, check for common values that might indicate sections
                    if "From" in table.columns:
                        # Find groups of rows with the same 'From' value
                        section_groups = []
                        current_section = None
                        section_rows = []

                        for idx, row in table.iterrows():
                            if pd.notna(row["From"]):
                                if current_section != row["From"]:
                                    # New section found
                                    if current_section is not None and section_rows:
                                        section_groups.append(
                                            (current_section, section_rows)
                                        )
                                        section_rows = []
                                    current_section = row["From"]
                                section_rows.append(idx)

                        # Add the last section group if exists
                        if current_section is not None and section_rows:
                            section_groups.append((current_section, section_rows))

                        # Apply sections to the table
                        for section_name, row_indices in section_groups:
                            table.loc[row_indices, "Section"] = section_name

                        # If any rows didn't get a section, use the first non-empty From value
                        if table["Section"].isna().any():
                            from_values = table["From"].dropna().unique()
                            if len(from_values) > 0:
                                table["Section"] = table["Section"].fillna(
                                    from_values[0]
                                )
                            else:
                                table["Section"] = table["Section"].fillna(
                                    "Unknown Section"
                                )
                    else:
                        # No From column, use a default section
                        table["Section"] = "Unknown Section"

        # Add product information
        table["Product"] = product

        processed_tables.append(table)

    if not processed_tables:
        return pd.DataFrame()  # Return empty DataFrame if no tables processed

    # Combine all processed tables
    df = pd.concat(processed_tables, ignore_index=True)

    return df

In [4]:
def html_to_dataframe(url, product):
    html = requests.get(url)
    df = pd.read_html(io.StringIO(html.text))
    df = pd.concat(df)
    df["Product"] = product
    return df

In [5]:
# test_df = html_to_dataframe2(url_vone, "VONE")

Run the function on all the URLs.

In [6]:
df_vbo = html_to_dataframe2(url_vbo, "VB365")
df_vbr = html_to_dataframe2(url_vbr, "VBR")
df_vbr_hyperv = html_to_dataframe2(url_hyperv, "VBR Hyper-V")
df_ahv = html_to_dataframe2(url_ahv, "AHV")
df_url_olvm = html_to_dataframe2(url_olvm_rhel, "OLVM / RHV")
df_proxmox = html_to_dataframe2(url_proxmox, "Proxmox")
df_vro = html_to_dataframe2(url_vro, "VRO")

df_vbaws = html_to_dataframe2(url_vbaws, "VBAWS")
df_vbaz = html_to_dataframe2(url_vbaz, "VBAZ")
df_gcp = html_to_dataframe2(url_gcp, "VBGCP")

df_windows = html_to_dataframe2(url_windows, "Windows")
df_agent_man = html_to_dataframe2(url_agent_man, "Agent Management")
df_linux = html_to_dataframe2(url_linux, "Linux")

df_vone = html_to_dataframe2(url_vone, "VONE")

df_vspc = html_to_dataframe2(url_vspc, "VSPC")
df_vcc = html_to_dataframe2(url_vcc, "VCC")

df_explorer_ad = html_to_dataframe2(url_explore_ad, "Explorer AD")
df_explorer_sql = html_to_dataframe2(url_explore_sql, "Explorer SQL")
df_explorer_orcle = html_to_dataframe2(url_explore_orcle, "Explorer Oracle")
df_explorer_postgres = html_to_dataframe2(url_explore_postgres, "Explorer Postgres")
df_explorer_saphana = html_to_dataframe2(url_explore_saphana, "Explorer SAP HANA")
df_explorer_mongo = html_to_dataframe2(url_explore_mongo, "Explorer Mongo")
df_explorer_exchange = html_to_dataframe2(url_explore_exchange, "Explorer Exchange")
df_explorer_sharepoint = html_to_dataframe2(
    url_explore_sharepoint, "Explorer SharePoint"
)
# OneDrive needs the SharePoint ports
# Teams needs the SharePoint and Exchange ports

In [7]:
df_list = [
    df_vbo,
    df_vbr,
    df_vbr_hyperv,
    df_ahv,
    df_url_olvm,
    df_proxmox,
    df_vro,
    df_vbaws,
    df_vbaz,
    df_gcp,
    df_windows,
    df_agent_man,
    df_linux,
    df_vone,
    df_vspc,
    df_vcc,
    df_explorer_ad,
    df_explorer_sql,
    df_explorer_orcle,
    df_explorer_postgres,
    df_explorer_saphana,
    df_explorer_mongo,
    df_explorer_exchange,
    df_explorer_sharepoint,
]

len(df_list)

24

Concatinate all the DataFrames into a single DataFrame

In [8]:
combined_df = pd.concat(df_list, ignore_index=True)

Combine the Notes and Description columns into a single column.

In [9]:
combined_df["Description"] = np.where(
    combined_df["Notes"].notna(), combined_df["Notes"], combined_df["Description"]
)

In [10]:
combined_df["Description"] = combined_df["Description"].fillna("")

Remove the first and last column

In [11]:
combined_df.drop(columns=[0, combined_df.columns[-1]], inplace=True)

In [12]:
combined_df.drop(columns=["Notes"], inplace=True)

Drop all the NaN rows

In [13]:
combined_df.dropna(inplace=True)

Remove some unneeded rows

In [14]:
combined_df = combined_df[combined_df["To"] != "Other Communications"]
combined_df = combined_df[
    combined_df["To"] != "Communication with Virtualization Servers"
]
combined_df = combined_df[combined_df["To"] != "Communication with Backup Server"]
combined_df = combined_df[
    combined_df["To"] != "Communication with Backup Infrastructure Components"
]

Replace some hex values with their actual symbol.

In [15]:
combined_df.replace("â\x80\x94", "—", regex=True, inplace=True)

In [16]:
combined_df.replace("Â\xa0", " ", regex=True, inplace=True)

Check they have been removed

In [17]:
len(combined_df[combined_df["Description"].str.contains(r"Â\xa0")])

0

In [18]:
len(combined_df[combined_df["Description"].str.contains(r"Â\xa0BackupÂ\xa0&Â\xa0")])

0

In [19]:
len(combined_df[combined_df["To"].str.contains(r"Â\xa0BackupÂ\xa0&Â\xa0")])

0

Create the database connection

In [20]:
con = sqlite3.connect("allports.db")

In [21]:
cur = con.cursor()

Create the table

In [22]:
cur.execute(
    "CREATE TABLE IF NOT EXISTS all_ports(product TEXT, section TEXT, from_port TEXT, to_port TEXT, protocol TEXT, port TEXT, description TEXT)"
)

<sqlite3.Cursor at 0x1f6ac4b7bc0>

In [23]:
for row in combined_df.itertuples(index=False):
    cur.execute(
        "INSERT INTO all_ports VALUES(?, ?, ?, ?, ?, ?, ?)",
        (
            str(row[0]),
            str(row[6]),
            str(row[1]),
            str(row[2]),
            str(row[3]),
            str(row[4]),
            str(row[5]),
        ),
    )
con.commit()

Run some checks to ensure it is all working as expected.

In [24]:
cur.execute("SELECT DISTINCT product FROM all_ports")
res = cur.fetchall()
for r in res:
    print(r[0])

VB365
VBR
VBR Hyper-V
AHV
OLVM / RHV
Proxmox
VRO
VBAWS
VBAZ
VBGCP
Windows
Agent Management
Linux
VONE
VSPC
VCC
Explorer AD
Explorer SQL
Explorer Oracle
Explorer Postgres
Explorer SAP HANA
Explorer Mongo
Explorer Exchange
Explorer SharePoint


In [25]:
cur.execute(
    "SELECT DISTINCT port FROM all_ports WHERE product = 'VB365' AND from_port = 'Backup proxy server1' AND protocol = 'TCP'"
)
res = cur.fetchall()
for i in res:
    print(i[0])

9191
9193
443
80 or 443
25 or 465 or 587
22
5432 (used by default)
4222 (used by default)


In [26]:
cur.execute("SELECT COUNT(*)FROM all_ports WHERE Product = 'Linux'")
res = cur.fetchall()
res[0][0]

16

In [27]:
cur.execute(
    'SELECT * FROM all_ports WHERE Product = "VBR" AND Description LIKE "%Threat hunter%"'
)

<sqlite3.Cursor at 0x1f6ac4b7bc0>

In [28]:
res = cur.fetchall()

In [29]:
res[0]

('VBR',
 'Backup Server',
 'Backup server',
 'Veeam License Update Server',
 'TCP',
 '443',
 'Default port used to automatically update license from the Veeam License Update Server over HTTPS. Veeam Threat Hunter and Veeam Data Cloud Vault also require this communication to work properly.Veeam License Update Server endpoints:vbr.butler.veeam.comautolk.veeam.com')