In [153]:
import re

def identify_structured_sections(content):
    """
    Identifies potential structured sections (like lists or tables) in the PDF text content by
    detecting patterns such as bullet points, numbered lists, or tabular-like text structures.

    Parameters
    ----------
    content : str
        The text content of the PDF to analyze.

    Returns
    -------
    dict
        A dictionary with plain text and structured sections.
    """
    lines = content.splitlines()

    structured_content = []
    plain_text_content = []
    
    in_structured_section = False
    structured_section = []

    # Regex pattern to capture technique IDs like T1190, T1059.001, etc.
    ttp_pattern = r'\b(T[0-9]{4}(?:\.[0-9]{3})?)\b'

    for line in lines:
        stripped_line = line.strip()

        # Detect section headers (like "Initial Access", "Execution")
        if re.match(r'^[A-Za-z\s]+$', stripped_line) and len(stripped_line.split()) > 1:
            if in_structured_section:
                structured_content.append('\n'.join(structured_section))
                structured_section = []
            plain_text_content.append(stripped_line)  # This is a section header (e.g., "Execution")
            continue

        # Detect technique entries by matching lines with "TXXXX" or "TXXXX.XXX"
        if re.match(r'.*\sT[0-9]{4}(\.[0-9]{3})?\s+.*', stripped_line):
            if not in_structured_section:
                in_structured_section = True
            structured_section.append(stripped_line)

        # Detect general indented lines (possible rows of a table or continuation of a list)
        elif re.match(r'^\s{2,}', stripped_line):  # Indentation indicates a row in a table or list
            if not in_structured_section:
                in_structured_section = True
            structured_section.append(stripped_line)

        # If this is plain text and not part of structured content, capture it
        else:
            if in_structured_section:
                structured_content.append('\n'.join(structured_section))
                structured_section = []
                in_structured_section = False
            plain_text_content.append(stripped_line)

    # If there's any unprocessed structured content at the end, add it
    if structured_section:
        structured_content.append('\n'.join(structured_section))

    plain_text = '\n'.join(plain_text_content)

    return {
        "structured_content": '\n'.join(structured_content),
        "plain_text_content": plain_text
    }


def extract_ttps(content, pattern):
    """
    Extracts TTPs from the given content using the specified regex pattern.

    Parameters
    ----------
    content : str
        The text content to search for TTPs.
    pattern : str
        The regex pattern to match TTPs.

    Returns
    -------
    list
        A list of matched TTPs.
    """
    return re.findall(pattern, content)


def parse_pdf_cti_report(pdf_content):
    """
    Separates plain text content from structured (list/table-like) content in a PDF, 
    then extracts TTPs from each section.

    Parameters
    ----------
    pdf_content : str
        The complete content of a CTI report extracted from a PDF.

    Returns
    -------
    dict
        A dictionary with TTPs found in text, list, and table-like content.
    """
    ttp_pattern = r'\b(T[0-9]{4}(?:\.[0-9]{3})?)\b'

    # Identify structured sections like lists or table-like data
    sections = identify_structured_sections(pdf_content)
    
    # Extract TTPs from structured content (detected as lists or table-like)
    ttps_in_structured_content = extract_ttps(sections['structured_content'], ttp_pattern)
    ttps_in_structured_content = list(set(ttps_in_structured_content))

    # Extract TTPs from plain text content (excluding structured sections)
    ttps_in_text = extract_ttps(sections['plain_text_content'], ttp_pattern)
    ttps_in_text = list(set(ttps_in_text))

    return {
        "ttps_in_text": ttps_in_text,
        "ttps_in_table_or_list": ttps_in_structured_content
    }



In [157]:
import os
import PyPDF2
from PyPDF2.errors import WrongPasswordError

def read_cti_reports(folder_path, file_hashes, password=None):
    """
    Reads the CTI report files with .download extension for each hash in file_hashes. Handles text files and encrypted PDFs.

    Parameters
    ----------
    folder_path : str
        The path to the folder containing the CTI reports.
    file_hashes : list
        A list of file hashes to look for and read.
    password : str, optional
        Password to decrypt encrypted PDF files, by default None.

    Returns
    -------
    dict
        A dictionary where keys are file hashes and values are the file content or error message.
    """
    report_results = {}

    for file_hash in file_hashes:
        file_path = os.path.join(folder_path, f"{file_hash}.download")

        # Check if the file exists and is readable
        if os.path.exists(file_path) and os.path.isfile(file_path):
            try:
                # Attempt to read the file as UTF-8 text
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                parsed_data = parse_cti_report(content)  # This function parses the report
                report_results[file_hash] = parsed_data

            except UnicodeDecodeError:
                # If a UnicodeDecodeError occurs, the file might be a binary PDF
                #print(f"Error reading file as text. Attempting to read as PDF: {file_path}")
                try:
                    with open(file_path, 'rb') as file:
                        pdf_reader = PyPDF2.PdfReader(file)

                        # Handle encrypted PDF if a password is required
                        if pdf_reader.is_encrypted:
                            try:
                                pdf_reader.decrypt(password or "")
                            except WrongPasswordError:
                                report_results[file_hash] = {"error": "Failed to decrypt PDF, wrong password."}
                                continue  # Skip to next file hash

                        text = ""
                        # Extract text from each page of the PDF
                        for page_num in range(len(pdf_reader.pages)):
                            text += pdf_reader.pages[page_num].extract_text()

                        #print(text)

                        parsed_pdfdata = parse_pdf_cti_report(text)
                        report_results[file_hash] = parsed_pdfdata

                except Exception as e:
                    report_results[file_hash] = {"error": f"Failed to read file as PDF. Error: {e}"}

            except Exception as e:
                # Handle any other errors during the file read process
                report_results[file_hash] = {"error": f"Error reading file: {e}"}

        else:
            report_results[file_hash] = {"error": "File does not exist or is not readable."}

    return report_results

In [159]:
file_hashes = ["64fc1708824717655847a396d4b9eb3a0455a247fb13c940404d7a198d000520"]
folder_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads"
password = "infected"
cti_reports = read_cti_reports(folder_path, file_hashes, password=password)
total_reports = len(cti_reports)

for file_hash, result in cti_reports.items():
    if "error" in result:
        print(f"Error processing {file_hash}: {result['error']}")
    else:
        # Get the lengths of TTPs in text and in structured content (tables/lists)
        ttps_in_text_length = len(result['ttps_in_text'])
        ttps_in_table_or_list_length = len(result['ttps_in_table_or_list'])
        
        print(f"File Hash: {file_hash}")
        print(f"TTPs in text: {ttps_in_text_length} entries")
        print(f"TTPs in text: {result['ttps_in_text']}")
        print(f"TTPs in table or list: {ttps_in_table_or_list_length} entries")
        print(f"TTPs in table or list: {result['ttps_in_table_or_list']}")
        print("-" * 50)

File Hash: 64fc1708824717655847a396d4b9eb3a0455a247fb13c940404d7a198d000520
TTPs in text: 45 entries
TTPs in text: ['T1204.002', 'T1074.001', 'T1074', 'T1120', 'T1007', 'T1025', 'T1591.001', 'T1020', 'T1589.002', 'T1005', 'T1041', 'T1082', 'T1574.002', 'T1059.005', 'T1602.002', 'T1124', 'T1566.002', 'T1591.003', 'T1059', 'T1078', 'T1583.004', 'T1039', 'T1119', 'T1057', 'T1203', 'T1583', 'T1087.001', 'T1071', 'T1069', 'T1518', 'T1204.001', 'T1204', 'T1566.001', 'T1574', 'T1591.004', 'T1589.003', 'T1059.007', 'T1602', 'T1591.002', 'T1591', 'T1083', 'T1589', 'T1087', 'T1059.001', 'T1583.001']
TTPs in table or list: 0 entries
TTPs in table or list: []
--------------------------------------------------
