## Updated Scrape ports

The updated scrape-ports notebook has some new efficiencies:

1. It reads the URLs from a config file instead of hardcoding them in the script.
2. It uses a loop to iterate through the URLs, making it easier to add or remove URLs in the future.
3. It iterates through the HTML looking for subheadings, which it tracks; when a table is found, it creates a DataFrame to which the current subheadings are added. This provides a more structured output that aligns with how the data is presented on the website.
4. It adds up to three levels of subheadings to the dataframe. This allows for a more detailed representation of the data.
5. Iterating through the HTML like this means that there are fewer unwanted artifacts in the data.
6. Doing all of this means that this code is less fragile and can be used to automate the updating of the data regularly.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import json
import io
import numpy as np

# Load the scrape configuration. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode a UTF-8 JSON file (e.g. on Windows).
with open('configuration.json', encoding='utf-8') as config_file:
    config = json.load(config_file)

The cell above loads all the required libraries and reads the configuration data from the configuration file.

Below is the code that does the scraping. 

1. It iterates through the URLs in the config file and gets the HTML.
2. It uses BeautifulSoup to parse the HTML and find all the subheadings and tables.
3. It tracks the subheadings Subheading, Subheading_L2, and Subheading_L3.
4. When it finds a table, it creates a DataFrame and adds the subheadings to the DataFrame.
5. It appends the DataFrame to a list of DataFrames for that URL.
6. When all the tables for a URL have been processed, it concatenates the DataFrames and saves them to a central list.
7. Finally, it concatenates all the DataFrames from all URLs for a final output.

In [None]:
# Seconds to wait on each HTTP request. Without a timeout, requests.get()
# can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# One combined DataFrame per successfully scraped product
all_dataframes = []

# Iterate over each item in the config array
for entry in config:
    product = entry['product']
    url = entry['url']

    # Best-effort per product: any failure is reported and the loop moves on
    # to the next URL instead of aborting the whole run.
    try:
        # Fetch the HTML content. raise_for_status() turns HTTP error
        # responses (4xx/5xx) into an exception so an error page is reported
        # below rather than being parsed as a page with no tables.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        soup = bs(response.content, "html.parser")

        # Most recently seen heading at each level; a table inherits
        # whatever headings precede it in document order.
        current_subheading = ""
        current_subheading_l2 = ""
        current_subheading_l3 = ""

        # List to store DataFrames for this product
        product_dataframes = []

        # Walk the heading spans and tables in document order
        for element in soup.body.find_all(['span', 'table']):
            if element.name == 'table':
                # Convert the table to a DataFrame
                df = pd.read_html(io.StringIO(str(element)))[0]

                # Tag every row with the headings in effect and the product
                df['Subheading'] = current_subheading
                df['Subheading_L2'] = current_subheading_l2
                df['Subheading_L3'] = current_subheading_l3
                df['Product'] = product

                product_dataframes.append(df)
                continue

            # Only spans reach this point; spans without a class attribute
            # yield an empty list and match none of the branches.
            element_classes = element.get('class', [])
            if 'Subheading' in element_classes:
                # A new top-level heading resets the deeper levels
                current_subheading = element.get_text(strip=True)
                current_subheading_l2 = ""
                current_subheading_l3 = ""
            elif 'Subheading_L2' in element_classes:
                current_subheading_l2 = element.get_text(strip=True)
                current_subheading_l3 = ""
            elif 'Subheading_L3' in element_classes:
                current_subheading_l3 = element.get_text(strip=True)

        # Combine all DataFrames for this product
        if product_dataframes:
            combined_product_df = pd.concat(product_dataframes, ignore_index=True)
            all_dataframes.append(combined_product_df)

    except Exception as e:
        print(f"Error processing {product} ({url}): {e}")

# pd.concat raises an opaque ValueError on an empty list; fail with a message
# that points at the real cause instead.
if not all_dataframes:
    raise ValueError("No tables were scraped from any configured URL")

# Combine all intermediate DataFrames into a single DataFrame
final_combined_df = pd.concat(all_dataframes, ignore_index=True)


The following code is designed to clean up the data and remove any unneeded information.

In [None]:
# Where a row has a 'Notes' value, use it as the description; otherwise keep
# the existing 'Description'.
notes = final_combined_df["Notes"]
final_combined_df["Description"] = np.where(
    notes.isna(),
    final_combined_df["Description"],
    notes,
)

In [None]:
# Replace missing descriptions with an empty string
final_combined_df['Description'] = final_combined_df['Description'].fillna(value='')

In [None]:
# Rows with an empty 'Subheading' take their 'From' value as the subheading
subheading_is_blank = final_combined_df['Subheading'] == ''
final_combined_df['Subheading'] = np.where(
    subheading_is_blank,
    final_combined_df['From'],
    final_combined_df['Subheading'],
)

In [None]:
# Fill a missing 'Port' from 'Port/Endpoint' when the latter has a value
port_missing = final_combined_df['Port'].isna()
endpoint_present = final_combined_df['Port/Endpoint'].notna()
final_combined_df['Port'] = np.where(
    port_missing & endpoint_present,
    final_combined_df['Port/Endpoint'],
    final_combined_df['Port'],
)

In [None]:
# Columns to remove from the combined frame.
# NOTE(review): the integer label 0 is presumably an unnamed column produced
# by a header-less table - confirm against the scraped pages.
columns_to_drop = [
    0,
    'Port/Endpoint',
    'Notes',
]

In [None]:
# Drop the listed columns; labels that are not present are ignored
final_combined_df = final_combined_df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Keep only the rows that have a 'Port' value
final_combined_df = final_combined_df.loc[final_combined_df['Port'].notna()]

In [None]:
# 'Port' values that mark rows to be filtered out of the final data
rows_to_drop = [
    "Other Communications",
    "Communication with Backup Server",
    "Communication with Backup Infrastructure Components",
    "Depends on device configuration",
    "Communication with Virtualization Servers",
]

In [None]:
# Cast 'Port' to strings so string operations can be applied to it later
final_combined_df = final_combined_df.astype({'Port': str})

In [None]:
# NOTE(review): this rebinds final_combined_df to the contents of
# 'final_combined_df.parquet', discarding the DataFrame built by the cells
# above when the notebook is run top to bottom. No visible cell writes that
# file - confirm this reload is intentional and that the file is current.
final_combined_df = pd.read_parquet('final_combined_df.parquet', engine='pyarrow')

In [None]:
# Show the DataFrame as the cell output for a visual check
final_combined_df

In [None]:
# To inspect the affected rows:
# final_combined_df[final_combined_df['Subheading_L2'] == 'IBM FlashSystem (formerly Spectrum Virtualize) Storage']

# For rows under the IBM FlashSystem level-2 heading that also have a
# level-3 heading, promote the level-3 text into 'Subheading_L2'.
is_flashsystem = (
    final_combined_df['Subheading_L2']
    == 'IBM FlashSystem (formerly Spectrum Virtualize) Storage'
)
has_level_3 = final_combined_df['Subheading_L3'] != ""
final_combined_df['Subheading_L2'] = np.where(
    is_flashsystem & has_level_3,
    final_combined_df['Subheading_L3'],
    final_combined_df['Subheading_L2'],
)

In [None]:
# Preview rows whose level-2 and level-3 headings are identical and non-empty
headings_match = final_combined_df['Subheading_L2'] == final_combined_df['Subheading_L3']
level_2_present = final_combined_df['Subheading_L2'] != ""
final_combined_df[headings_match & level_2_present]

In [None]:
# Blank out 'Subheading_L3' where it merely repeats a non-empty
# 'Subheading_L2'. When the two are equal and L2 is non-empty, L3 is
# necessarily non-empty too, so that check needs no separate mask.
repeats_level_2 = final_combined_df['Subheading_L2'] == final_combined_df['Subheading_L3']
level_2_nonempty = final_combined_df['Subheading_L2'] != ""
final_combined_df['Subheading_L3'] = np.where(
    repeats_level_2 & level_2_nonempty,
    "",
    final_combined_df['Subheading_L3'],
)

In [None]:
# Remove rows whose 'Port' value is listed in rows_to_drop.
# .copy() makes the filtered result an independent DataFrame, so later
# column assignments on it do not trigger pandas' SettingWithCopyWarning.
final_combined_df = final_combined_df[~final_combined_df['Port'].isin(rows_to_drop)].copy()

In [None]:
# Keep only the text before the first '(' in 'Port', then trim whitespace
port_before_paren = final_combined_df['Port'].str.split('(').str[0]
final_combined_df.loc[:, 'Port'] = port_before_paren.str.strip()

This section creates the sqlite database, the all_ports table and then inserts the data into the table.

This has been updated since the first version: it now uses subheading, subheading_l2 and subheading_l3 to create a more structured table.

In [None]:
# Write final_combined_df into the all_ports table of allports_updated.db
import sqlite3

# Build the rows before opening the database so that a missing column fails
# without leaving a connection open.
columns = ['Product', 'Subheading', 'Subheading_L2', 'Subheading_L3', 'From', 'To', 'Protocol', 'Port', 'Description']
data = final_combined_df[columns].values.tolist()

# Connect to the SQLite database
con = sqlite3.connect("allports_updated.db")
try:
    # `with con` commits the transaction on success and rolls it back if any
    # statement raises; it does not close the connection, hence the finally.
    with con:
        cur = con.cursor()

        # Create the table if it doesn't already exist
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS all_ports (
                product TEXT,
                subheading TEXT,
                subheading_l2 TEXT,
                subheading_l3 TEXT,
                from_port TEXT,
                to_port TEXT,
                protocol TEXT,
                port TEXT,
                description TEXT
            )
            """
        )

        # NOTE(review): rows are appended to whatever the table already
        # holds, so re-running this cell against an existing database
        # duplicates the data - confirm whether the table should be cleared
        # first.
        cur.executemany(
            """
            INSERT INTO all_ports (product, subheading, subheading_l2, subheading_l3, from_port, to_port, protocol, port, description)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            data
        )
finally:
    con.close()