In [25]:
import os
def get_file_hashes(folder_path):
    """
    Extracts file hashes (part before .download.iocs) from filenames in a folder.

    Only regular files whose name ends with ``.download.iocs`` are considered.
    Sub-directories and files with any other name are skipped, so stray files in
    the folder are not reported as hashes.

    Parameters
    ----------
    folder_path : str
        The path to the folder.

    Returns
    -------
    list
        The file hashes extracted from the filenames, sorted alphabetically so
        the result does not depend on the directory listing order. An empty
        list is returned (after printing the error) if the folder cannot be
        listed.
    """
    suffix = '.download.iocs'

    try:
        entries = os.listdir(folder_path)
    except OSError as e:
        # Missing folder, permission problem, path is not a directory, ...
        print(f"An error occurred: {e}")
        return []

    return sorted(
        name[:-len(suffix)]
        for name in entries
        # len check: a file named exactly ".download.iocs" carries no hash
        if name.endswith(suffix)
        and len(name) > len(suffix)
        and os.path.isfile(os.path.join(folder_path, name))
    )


In [26]:
# Folder whose file names carry the hashes (hard-coded absolute local path; adjust per machine)
folder_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads\iocs2"
# Collect the hashes from the file names; `file_hashes` is reused by the report-reading cell
file_hashes = get_file_hashes(folder_path)
#print("File hashes:", file_hashes)
# Last expression of the cell: displays how many hashes were found
len(file_hashes)

120

In [27]:
import os
import PyPDF2
from PyPDF2.errors import WrongPasswordError

def read_cti_reports(folder_path, file_hashes, password=None):
    """
    Reads the CTI report files with .download extension for each hash in file_hashes. Handles text files and encrypted PDFs.

    Each ``<hash>.download`` file is first read as UTF-8 text and handed to
    ``parse_cti_report``. If decoding fails, the file is re-opened as a PDF, its
    page texts are joined with newlines and handed to ``parse_pdf_cti_report``.

    Parameters
    ----------
    folder_path : str
        The path to the folder containing the CTI reports.
    file_hashes : list
        A list of file hashes to look for and read.
    password : str, optional
        Password to decrypt encrypted PDF files, by default None (an empty
        password is tried in that case).

    Returns
    -------
    dict
        A dictionary where keys are file hashes and values are either the parsed
        report or a ``{"error": message}`` dictionary.
    """
    report_results = {}

    for file_hash in file_hashes:
        file_path = os.path.join(folder_path, f"{file_hash}.download")

        # isfile() is False for missing paths as well as for directories
        if not os.path.isfile(file_path):
            report_results[file_hash] = {"error": "File does not exist or is not readable."}
            continue

        try:
            # Attempt to read the file as UTF-8 text
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            report_results[file_hash] = parse_cti_report(content)

        except UnicodeDecodeError:
            # Not valid UTF-8: the file might be a binary PDF
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)

                    if pdf_reader.is_encrypted:
                        # PdfReader.decrypt() reports a wrong password through a falsy
                        # return value (0 / PasswordType.NOT_DECRYPTED) rather than by
                        # raising; the exception is still handled in case a PyPDF2
                        # version raises it.
                        try:
                            decrypted = pdf_reader.decrypt(password or "")
                        except WrongPasswordError:
                            decrypted = 0

                        if not decrypted:
                            report_results[file_hash] = {"error": "Failed to decrypt PDF, wrong password."}
                            continue  # Skip to next file hash

                    # Join pages with a newline so a token ending one page cannot fuse
                    # with the token starting the next; `or ""` guards against pages
                    # for which no text is returned.
                    text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

                report_results[file_hash] = parse_pdf_cti_report(text)

            except Exception as e:
                report_results[file_hash] = {"error": f"Failed to read file as PDF. Error: {e}"}

        except Exception as e:
            # Handle any other errors during the file read / parse process
            report_results[file_hash] = {"error": f"Error reading file: {e}"}

    return report_results

In [29]:
# Read every report whose hash was discovered earlier (`file_hashes`).
folder_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads"
password = "infected"
cti_reports = read_cti_reports(folder_path, file_hashes, password=password)

total_reports = len(cti_reports)
print(f"Total number of reports processed: {total_reports}\n")

# Tally reports whose TTPs appear in exactly one of the two places.
file_hashes_count_table = 0  # TTPs only in tables/lists
file_hashes_count_text = 0   # TTPs only in running text

for file_hash, result in cti_reports.items():
    # Failed reports are reported and left out of both tallies
    if "error" in result:
        print(f"Error processing {file_hash}: {result['error']}")
        continue

    ttps_in_text_length = len(result['ttps_in_text'])
    ttps_in_table_or_list_length = len(result['ttps_in_table_or_list'])

    # The two conditions are mutually exclusive; only table-only hashes are printed
    if ttps_in_table_or_list_length > 0 and ttps_in_text_length == 0:
        file_hashes_count_table += 1
        print(f"File Hash: {file_hash}")
    elif ttps_in_text_length > 0 and ttps_in_table_or_list_length == 0:
        file_hashes_count_text += 1

# Output the total count of file hashes that meet each criterion
print(f"Total file hashes where TTPs in text = 0 and TTPs in table or list > 0: {file_hashes_count_table}")
print(f"Total file hashes where TTPs in table or list = 0 and TTPs in text > 0: {file_hashes_count_text}")

Total number of reports processed: 120

File Hash: 02f2fc63bd20b7487591a4c3e833971ae4595c873a10bbb6b9bc683cdf90f4fb
File Hash: 0382a53442f0a3f298a88663c233b9dd8b0673be66032978e6d70d62e5ce7de7
File Hash: 06620af18d054b4fc4d325ab3b704b3f2d0a1edc253180222296cd3da6570fc3
File Hash: 0b539f609729a6955e96fb233af7f739264d7be59e39370d5c88a8c49bc7de88
File Hash: 0cb04099bd009700bddc94a322c531e0f3dabd6b8942a7759a213203fff35d1a
File Hash: 12d5737b92ff5b884e1d9b810778a4f3a26d021917b4500a7804b68eebf8fef4
File Hash: 169b2dec8cbbefbecfaa2d364350ca27f61934d1532142fae21a85013fe10e1b
File Hash: 21424dec735d0c5d53c61f7a811a5f971a3c84ea1391dd1e4b07002d243401b0
File Hash: 26c4b9714fd65fa07ac6c1dd514e7a8507a31a161ac66b9d1a1095a986ecbeef
File Hash: 2a35b2945366c76a5f14636df5bd1720051d66106a86e49ac3065122ba11ca8a
File Hash: 2e65ebf1966a9ea0fd8c15056ec082ed4eeed63843536fc699519a7262ec6231
File Hash: 305233a3c56b1983ae4a42cd850bc68b92ecbe4af2e2f3aaeb17927d75cb4dfd
File Hash: 3063db67750089075a77b4367ced61d82f307

In [34]:
#Analysis hashes: 

#0c4e350517502f10c3986a7e20d7ce2b78fbc417a36aa22a370a56b124026e9b - https://www.cisa.gov/news-events/cybersecurity-advisories/aa22-055a
#2ef3aa28b21deee943bed752a1ac7950382c951a54fd50323bf67b39ee3f5476 - https://www.lacework.com/blog/taking-teamtnt-docker-images-offline/
#5d39c90de10384fa86bafb4016761bf84f6d83964a06a38ef4c0db7f0d3b4532 - https://us-cert.cisa.gov/ncas/alerts/aa21-048a
#a6c1cbd286cdc07366367c3a9313719dac1c472eb8cd65361f211781eeaf809c - https://us-cert.cisa.gov/ncas/alerts/aa20-301a
#e4c8a7ace1cc91c65aadd49419614b71da517763443105b77ee7379d2421528e - https://www.cisa.gov/news-events/cybersecurity-advisories/aa23-320a


## Hashes to remove
###bff733b5ddd507076bcef720c7d068d3c970c7bb934bde080a29a091432973e8 - https://blog.trendmicro.com/trendlabs-security-intelligence/gamaredon-apt-group-use-covid-19-lure-in-campaigns/
##15e4f4ac4a0a5c1b6876c4b4907c66110d319ba03917c991214e8a6edffdb817 - https://blog.trendmicro.com/trendlabs-security-intelligence/gamaredon-apt-group-use-covid-19-lure-in-campaigns/
#e9240c31bbb736b7a49df9da3d79f0b65e1173743b2de749da4c55c44fedddf4 - https://unit42.paloaltonetworks.com/molerats-delivers-spark-backdoor/
##False list
#424f242d75014b9a8f8eda8dbed34a324c32a4106cafe832a5fb5a54f3512230 - https://www.lacework.com/blog/taking-teamtnt-docker-images-offline/
##--> This is a duplicate of 2ef3aa28... above (same Lacework report URL)
##9acadc6a13214395837884616121f6254b86dcab0951564b4d5dbc58c64c5166 - https://www.mandiant.com/resources/blog/sandworm-disrupts-power-ukraine-operational-technology
##--> The TTP found in the text is not a genuine match: it is part of the YARA signature content

In [42]:
# Hashes removed from the overlap analysis
excluded_hashes = {
    "bff733b5ddd507076bcef720c7d068d3c970c7bb934bde080a29a091432973e8",
    "15e4f4ac4a0a5c1b6876c4b4907c66110d319ba03917c991214e8a6edffdb817",
    "e9240c31bbb736b7a49df9da3d79f0b65e1173743b2de749da4c55c44fedddf4",
    "424f242d75014b9a8f8eda8dbed34a324c32a4106cafe832a5fb5a54f3512230",
    "9acadc6a13214395837884616121f6254b86dcab0951564b4d5dbc58c64c5166"
}


def _ttp_ids(matches, strip_subtechnique=False):
    """
    Normalises regex matches into a set of TTP ID strings.

    parse_cti_report uses a pattern with two capturing groups, so its matches are
    tuples ``(full_id, sub_technique_suffix)``; parse_pdf_cti_report uses a single
    capturing group, so its matches are plain strings. Indexing a plain string with
    ``[0]`` would yield only the letter 'T', hence the explicit type check.

    Parameters
    ----------
    matches : iterable
        Match entries, each either a string or a tuple whose first item is the ID.
    strip_subtechnique : bool, optional
        When True, drop the ``.NNN`` sub-technique part (``T1059.001`` -> ``T1059``).

    Returns
    -------
    set
        The TTP IDs as strings.
    """
    ids = set()
    for match in matches:
        ttp_id = match if isinstance(match, str) else match[0]
        if strip_subtechnique:
            ttp_id = ttp_id.split('.')[0]
        ids.add(ttp_id)
    return ids


def _print_overlap(common, text_only, table_only):
    """Prints the intersection and both set differences (size, then members)."""
    print(f"Number of common TTPs (Intersection): {len(common)}")
    print(f"Common TTPs: {common}")
    print(f"TTPs in text but not in table/list (A - B): {len(text_only)}")
    print(f"TTPs in text but not in table/list (A - B): {text_only}")
    print(f"TTPs in table/list but not in text (B - A): {len(table_only)}")
    print(f"TTPs in table/list but not in text (B - A): {table_only}")


# Creating reports_with_both_ttps without the excluded file hashes
reports_with_both_ttps = {
    hash_key: result for hash_key, result in cti_reports.items()
    if hash_key not in excluded_hashes and "error" not in result and result["ttps_in_text"] and result["ttps_in_table_or_list"]
}

# Printing only those results
if reports_with_both_ttps:
    for hash_key, result in reports_with_both_ttps.items():
        ttps_in_text_length = len(result['ttps_in_text'])
        ttps_in_table_or_list_length = len(result['ttps_in_table_or_list'])

        print(f"File Hash: {hash_key}")
        print(f"TTPs in text: {ttps_in_text_length} entries")
        print(f"TTPs in table or list: {ttps_in_table_or_list_length} entries")

        # Full TTP IDs, including the sub-technique part when present
        base_ttp_text = _ttp_ids(result['ttps_in_text'])
        base_ttp_table = _ttp_ids(result['ttps_in_table_or_list'])

        common_ttps = base_ttp_text.intersection(base_ttp_table)          # A & B
        ttps_in_text_only = base_ttp_text.difference(base_ttp_table)      # A - B
        ttps_in_table_only = base_ttp_table.difference(base_ttp_text)     # B - A
        _print_overlap(common_ttps, ttps_in_text_only, ttps_in_table_only)

        print("***After excluding the subtecniques***")

        # Same comparison on the main technique IDs only (sub-technique part removed)
        base_ttp_text_1 = _ttp_ids(result['ttps_in_text'], strip_subtechnique=True)
        base_ttp_table_1 = _ttp_ids(result['ttps_in_table_or_list'], strip_subtechnique=True)

        common_ttps_1 = base_ttp_text_1.intersection(base_ttp_table_1)
        ttps_in_text_only_1 = base_ttp_text_1.difference(base_ttp_table_1)
        ttps_in_table_only_1 = base_ttp_table_1.difference(base_ttp_text_1)
        _print_overlap(common_ttps_1, ttps_in_text_only_1, ttps_in_table_only_1)

        print("-" * 50)

else:
    print("No reports found with both TTPs in text and table/list.")



File Hash: 0c4e350517502f10c3986a7e20d7ce2b78fbc417a36aa22a370a56b124026e9b
TTPs in text: 15 entries
TTPs in table or list: 56 entries
Number of common TTPs (Intersection): 12
Common TTPs: {'T1041', 'T1071.001', 'T1036.005', 'T1059.006', 'T1027', 'T1204.002', 'T1480', 'T1059.001', 'T1566.001', 'T1132.002', 'T1574.002', 'T1547.001'}
TTPs in text but not in table/list (A - B): 3
TTPs in text but not in table/list (A - B): {'T1001.001', 'T1572', 'T1005'}
TTPs in table/list but not in text (B - A): 44
TTPs in table/list but not in text (B - A): {'T1049', 'T1559.002', 'T1560.001', 'T1566.002', 'T1059.007', 'T1102.002', 'T1003.001', 'T1105', 'T1059.005', 'T1548.002', 'T1033', 'T1016', 'T1082', 'T1027.004', 'T1218.005', 'T1113', 'T1053.005', 'T1588.002', 'T1589.002', 'T1087.002', 'T1559.001', 'T1090.002', 'T1047', 'T1518', 'T1219', 'T1132.001', 'T1204.001', 'T1518.001', 'T1218.003', 'T1137.001', 'T1027.003', 'T1083', 'T1218.011', 'T1057', 'T1555', 'T1583.006', 'T1104', 'T1003.004', 'T1555.003

In [32]:
import re
def identify_structured_sections(content):
    """
    Splits text content into structured (list/table-like) lines and plain text.

    A line is treated as structured when, after stripping, it starts with a
    numeric list marker (``1.``, ``1.1`` ...) or when the raw line is indented by
    at least two whitespace characters and is not blank. Bullet characters are not
    detected. Consecutive structured lines form one section; any other line ends
    the current section and is collected as plain text.

    Parameters
    ----------
    content : str
        The text content of the PDF to analyze.

    Returns
    -------
    dict
        ``"structured_content"``: all structured sections joined by spaces.
        ``"plain_text_content"``: all remaining lines (stripped) joined by spaces.
    """
    numbered_item = re.compile(r'^(?:\d+\.\d+|\d+\.)')  # numeric lists (1., 1.1, etc.)
    # Indentation must be tested on the raw line: the stripped line never has
    # leading whitespace. The trailing \S keeps whitespace-only lines out.
    indented_row = re.compile(r'^\s{2,}\S')

    structured_content = []
    plain_text_content = []
    structured_section = []  # lines of the section currently being collected

    for line in content.splitlines():
        stripped_line = line.strip()

        if numbered_item.match(stripped_line) or indented_row.match(line):
            structured_section.append(stripped_line)
            continue

        # A non-structured line closes the section that was open, if any
        if structured_section:
            structured_content.append(' '.join(structured_section))
            structured_section = []
        plain_text_content.append(stripped_line)

    # In case the content ends inside a structured section
    if structured_section:
        structured_content.append(' '.join(structured_section))

    return {
        "structured_content": ' '.join(structured_content),
        "plain_text_content": ' '.join(plain_text_content)
    }


def parse_pdf_cti_report(pdf_content):
    """
    Extracts TTP IDs from PDF text, split by where they occur.

    The text is partitioned by ``identify_structured_sections`` into structured
    (list/table-like) content and plain text; TTP IDs are then collected from each
    part separately and de-duplicated. The two lists are not cross-filtered: an ID
    present in both parts is reported in both.

    Parameters
    ----------
    pdf_content : str
        The complete content of a CTI report extracted from a PDF.

    Returns
    -------
    dict
        ``"ttps_in_text"`` and ``"ttps_in_table_or_list"``, each a list of unique
        matches in no particular order.
    """
    ttp_pattern = r'\b(T[0-9]{4}(?:\.[0-9]{3})?)\b'

    def unique_ttps(section_text):
        # De-duplicate the matches found in one part of the report
        return list(set(extract_ttps(section_text, ttp_pattern)))

    sections = identify_structured_sections(pdf_content)

    structured_ttps = unique_ttps(sections['structured_content'])
    text_ttps = unique_ttps(sections['plain_text_content'])

    return {
        "ttps_in_text": text_ttps,
        "ttps_in_table_or_list": structured_ttps
    }

def extract_ttps(content, pattern):
    """
    Finds every occurrence of a TTP pattern in a piece of text.

    Parameters
    ----------
    content : str
        The text content to search for TTPs.
    pattern : str
        The regex pattern to match TTPs.

    Returns
    -------
    list
        All matches as produced by ``findall``: strings when the pattern has at
        most one capturing group, tuples of groups otherwise.
    """
    matcher = re.compile(pattern)
    return matcher.findall(content)

In [33]:
import os
import re
from bs4 import BeautifulSoup

# NOTE: the PDF-parsing cell defines a function with this same name and body;
# whichever cell ran last provides the active definition.
def extract_ttps(content, pattern):
    """
    Returns all regex matches of ``pattern`` found in ``content``.

    Parameters
    ----------
    content : str
        The text content to search for TTPs.
    pattern : str
        The regex pattern to match TTPs.

    Returns
    -------
    list
        The ``findall`` result: one entry per match, given as a string, or as a
        tuple of groups when the pattern has several capturing groups.
    """
    ttp_matches = re.compile(pattern).findall(content)
    return ttp_matches

def parse_cti_report(content):
    """
    Separates plain text content from table or list content, then extracts TTPs from each section.

    Structured content is the text of table cells (``<td>`` and ``<th>`` inside
    ``<tr>``) and of ``<li>`` items inside ``<ul>``/``<ol>``. Afterwards every
    ``<table>``, ``<ul>`` and ``<ol>`` element is removed from the parsed document
    and the remaining text is scanned as plain text. Header cells are included
    because the whole table is removed before the plain-text scan, so a TTP in a
    ``<th>`` would otherwise be reported nowhere.

    Parameters
    ----------
    content : str
        The complete content of a CTI report.

    Returns
    -------
    dict
        ``"ttps_in_text"`` and ``"ttps_in_table_or_list"``. Each is a list of
        unique matches in no particular order. Because the pattern has two
        capturing groups, every match is a tuple
        ``(full_id, sub_technique_suffix)``, e.g. ``("T1059.001", ".001")`` or
        ``("T1027", "")``.
    """
    ttp_pattern = r'\b(T[0-9]{4}([.][0-9]{3})?)\b'

    # Convert the content to a BeautifulSoup object for easier parsing
    soup = BeautifulSoup(content, "html.parser")

    # 1. Collect matches from structured content; a set removes duplicates
    #    (nested lists/tables are visited more than once).
    structured_matches = set()

    # Table cells, including header cells
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            for cell in row.find_all(["td", "th"]):
                structured_matches.update(extract_ttps(cell.get_text(), ttp_pattern))

    # Items of unordered and ordered lists
    for list_element in soup.find_all(["ul", "ol"]):
        for item in list_element.find_all("li"):
            structured_matches.update(extract_ttps(item.get_text(), ttp_pattern))

    ttps_in_table_or_list = list(structured_matches)

    # 2. Remove the structured elements so that only plain text remains
    for element in soup(["table", "ul", "ol"]):
        element.extract()

    plain_text_content = soup.get_text()
    ttps_in_text = list(set(extract_ttps(plain_text_content, ttp_pattern)))

    return {
        "ttps_in_text": ttps_in_text,
        "ttps_in_table_or_list": ttps_in_table_or_list
    }