# SpecPDF Requirements Extraction

In [None]:
# Third-party `regex` package, a regular-expression library for pattern matching and text manipulation.
# NOTE(review): the import cell of this notebook imports the standard-library `re` module, not `regex`,
# so this install looks unnecessary - confirm before removing.
%pip install regex

# PyMuPDF library (imported as `fitz`), used for working with PDF files.
%pip install PyMuPDF

# Pandas library, used for data manipulation and analysis with data structures such as DataFrames.
%pip install pandas

# sentence-transformers library, used for encoding sentences into numerical vectors for NLP tasks.
%pip install sentence-transformers

# HDBSCAN library, used for hierarchical density-based clustering of high-dimensional data
# without encountering the curse of dimensionality.
# NOTE(review): scikit-learn (imported later as `sklearn`) is not installed explicitly in this cell;
# presumably it arrives as a dependency of the packages above - confirm.
%pip install hdbscan

In [None]:
# Import Dependencies

# Regular expression library, used for pattern matching and text manipulation.
import re

# PyMuPDF library, used for working with PDF files.
import fitz

# Provides a way to interact with the operating system, enabling file and directory operations, among other things.
import os

# Pandas library, used for data manipulation and analysis with powerful data structures like DataFrames.
import pandas

# SentenceTransformer library, used for encoding sentences into numerical vectors for NLP tasks.
from sentence_transformers import SentenceTransformer

# Import the DBSCAN clustering algorithm from scikit-learn (a machine learning library)
# This will be used as an alternative clustering method for the system requirements.
from sklearn.cluster import DBSCAN

# Import the HDBSCAN library for Hierarchical Density-Based Spatial Clustering of Applications with Noise
# This library provides an implementation of the HDBSCAN clustering algorithm, which can be used for clustering the system requirements based on sentence embeddings.
import hdbscan

In [None]:
# Extract requirements from a PDF document and print them.
#
# A requirement is expected to start with a number of the form "N.N.NN"
# followed by whitespace, and to run until the next such number (or the end
# of the extracted text).

# Path to the PDF document, resolved against the current working directory
DATA_PATH = os.path.join(os.getcwd(), "SpecPage.pdf")

# Open the PDF document, collect the text of every page, and close the file
# again as soon as the text has been read (the context manager guarantees the
# document is released even if text extraction fails).
with fitz.open(DATA_PATH) as doc:
    text = "".join(page.get_text() for page in doc)

# Use a regular expression to extract the requirements.
# Group 1 is the requirement number, group 2 the requirement body. Capturing
# the number here (instead of running a second, looser search over the text)
# guarantees that the number prepended below is the one that actually started
# the first match.
matches = re.findall(
    r"(\d+\.\d+\.\d{2})\s(.*?)(?=\s\d+\.\d+\.\d{2}\s|$)", text, re.DOTALL
)

# Remove leading/trailing whitespace and line breaks from each requirement body
requirements = [body.strip() for _number, body in matches]

if requirements:
    # Prepend the number to the first requirement only; the remaining entries
    # keep just their body text.
    # NOTE(review): only the first entry gets its number back - confirm this
    # asymmetry is intended.
    requirements[0] = f"{matches[0][0]} {requirements[0]}"
else:
    # Nothing matched: report it instead of failing on requirements[0].
    print(f"No requirements found in {DATA_PATH}")

# Print the extracted requirements
for requirement in requirements:
    print(requirement)

# MCH System Performance Specification Requirements Extraction

In [None]:
# Extract numbered requirements ("N.N.N.N <text>") from the system
# specification PDF, strip the classification banner that the page
# headers/footers inject into the text, and print each requirement.

# Path to the PDF document, resolved against the current working directory
DATA_PATH = os.path.join(os.getcwd(), "SystemSpecPage.pdf")

# Open the PDF document, collect the text of every page, and close the file
# again as soon as the text has been read.
with fitz.open(DATA_PATH) as doc:
    text = "".join(page.get_text() for page in doc)

# Use regular expressions to extract the requirements based on a specific pattern.
# Group 1 is the four-part requirement number, group 2 the requirement text up
# to the next numbered line (or the end of the text). The look-arounds refuse
# matches that sit directly after, or end directly before, a banner line.
pattern = r"(?<!UNCLASSIFIED//FOR OFFICIAL USE ONLY\n)(\d+\.\d+\.\d+\.\d+)\s(.*?)(?=\n\d+\.\d+\.\d+\.\d+\s|$)(?!\nUNCLASSIFIED//FOR OFFICIAL USE ONLY)"
matches = re.findall(pattern, text, re.DOTALL)

# Banner followed by a number and surrounding whitespace (banner + number),
# and the bare banner on its own. Compiled once, outside the loop.
banner_with_number = re.compile(r"(UNCLASSIFIED//FOR OFFICIAL USE ONLY\s+\d+\s+)")
bare_banner = re.compile(r"UNCLASSIFIED//FOR OFFICIAL USE ONLY")

# Initialize an empty list to store the extracted requirements
requirements = []

# Iterate over each (number, text) pair found in the document
for number, body in matches:
    # Remove "banner + number" runs first, then any banner that is left over.
    # The order matters: removing the bare banner first would leave the
    # trailing numbers behind in the sentence.
    sentence = banner_with_number.sub("", body.strip())
    sentence = bare_banner.sub("", sentence)

    # Stop at the first requirement whose text contains "Acronyms"; that entry
    # and everything after it is neither stored nor printed.
    if "Acronyms" in sentence:
        break

    # Append the cleaned requirement sentence to the requirements list
    requirements.append(sentence)

    # Print the number and sentence of the extracted requirement
    print(f"Number: {number}")
    print(f"Sentence: {sentence}")

# Tagging Similar Requirements

In [None]:
# Load a pretrained sentence-embedding model by name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Maximum number of requirements to embed and cluster.
N_EXAMPLES = 20

# Take the first N_EXAMPLES requirements (fewer if the list is shorter).
# `requirements` is whatever the most recently executed extraction cell produced.
ex_reqs = requirements[:N_EXAMPLES]

# Encode the selected requirements as numerical vectors (one embedding per requirement).
sentence_embeddings = model.encode(ex_reqs)

# Using DBSCAN for clustering
# Create a DBSCAN clustering object with specified hyperparameters.
# 'eps' sets the maximum distance between two samples to be considered in the same neighborhood.
# 'min_samples' specifies the minimum number of samples in a neighborhood for a point to be considered as a core point.
# 'metric' determines the distance metric used for clustering.
# NOTE(review): eps=3 and min_samples=5 are hard-coded; with at most N_EXAMPLES (20) inputs
# they may need tuning for a given document - confirm.
dbscan_cluster = DBSCAN(eps=3, min_samples=5, metric='euclidean')

# Perform clustering on the sentence embeddings using DBSCAN.
# The 'fit_predict' method finds the clusters and returns an array of cluster labels, one per input sentence.
dbscan_labels = dbscan_cluster.fit_predict(sentence_embeddings)

# Using HDBSCAN for clustering
# Create an HDBSCAN clustering object with specified hyperparameters.
# 'min_cluster_size' sets the minimum number of points required to form a cluster in HDBSCAN.
# 'metric' determines the distance metric used for clustering.
hdbscan_cluster = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')

# Perform clustering on the same sentence embeddings using HDBSCAN.
# The 'fit_predict' method finds the clusters and returns an array of cluster labels, one per input sentence.
hdbscan_labels = hdbscan_cluster.fit_predict(sentence_embeddings)

def print_clusters(sentences, cluster_labels):
    """Print the sentences of each cluster, clusters in ascending label order.

    Args:
        sentences: Indexable collection of sentences; ``sentences[i]`` belongs
            to the cluster named by the i-th entry of ``cluster_labels``.
        cluster_labels: Iterable of cluster labels. Entries equal to -1
            (used for outlier/noise points) are left out of the output.

    For every remaining label a ``Cluster <label>:`` header is printed,
    followed by that cluster's sentences (one per line, in input order) and
    an empty line.
    """
    members_by_label = {}
    for position, label in enumerate(cluster_labels):
        if label != -1:
            members_by_label.setdefault(label, []).append(sentences[position])

    # Walk the labels in sorted order so the output is stable.
    for cluster_id in sorted(members_by_label):
        print(f"Cluster {cluster_id}:")
        for member in members_by_label[cluster_id]:
            print(member)
        print()

# Print the DBSCAN clustering of the selected requirements.
# print_clusters groups 'ex_reqs' by the labels in 'dbscan_labels' and leaves out entries labelled -1.
print("Using DBSCAN:")
print_clusters(ex_reqs, dbscan_labels)

In [None]:
# Print the HDBSCAN clustering of the selected requirements.
print("Using HDBSCAN:")

# Call the 'print_clusters' function to print the requirements that belong to each HDBSCAN cluster.
# It takes the list of sentences ('ex_reqs') and the array of cluster labels ('hdbscan_labels') as input;
# entries labelled -1 are left out of the output.
print_clusters(ex_reqs, hdbscan_labels)