In [8]:
import os
def get_file_hashes(folder_path):
    """
    Extracts file hashes (part before .download.iocs) from filenames in a folder.

    Only regular files whose name contains ``.download.iocs`` are considered.
    Other files (and sub-directories) are skipped, so unrelated names are never
    returned as if they were hashes.

    Parameters
    ----------
    folder_path : str
        The path to the folder.

    Returns
    -------
    list
        A sorted list of unique file hashes extracted from the filenames.
        Empty when the folder cannot be listed (a message is printed).
    """
    marker = '.download.iocs'

    try:
        entry_names = os.listdir(folder_path)
    except OSError as e:
        # Missing folder, permission problem, not a directory, ...
        print(f"An error occurred: {e}")
        return []

    file_hashes = set()
    for entry_name in entry_names:
        # Without this check, split() would return the whole filename and
        # it would be reported as a (bogus) hash.
        if marker not in entry_name:
            continue
        if not os.path.isfile(os.path.join(folder_path, entry_name)):
            continue
        file_hash = entry_name.split(marker)[0]
        if file_hash:
            file_hashes.add(file_hash)

    # os.listdir() order is arbitrary; sort so repeated runs agree.
    return sorted(file_hashes)


In [28]:
# Folder holding the "<hash>.download.iocs" files.
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is on
# a machine where this directory exists.
folder_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads\iocs2"
# Collect the hashes encoded in the filenames of that folder.
file_hashes = get_file_hashes(folder_path)
#print("File hashes:", file_hashes)
# Last expression of the cell: displays how many hashes were found.
len(file_hashes)

120

In [33]:
import os
def read_cti_reports(folder_path, file_hashes, parser=None):
    """
    Reads the CTI report files with .download extension for each hash in file_hashes.

    Each report is read as UTF-8 text and handed to ``parser``. Failures are
    recorded per hash instead of aborting the whole run, and read failures are
    reported separately from parse failures.

    Parameters
    ----------
    folder_path : str
        The path to the folder containing the CTI reports.
    file_hashes : list
        A list of file hashes to look for and read.
    parser : callable, optional
        Function called with the report text; its return value is stored as the
        result for that hash. Defaults to ``parse_cti_report``.

    Returns
    -------
    dict
        A dictionary where keys are file hashes and values are the parsed data
        or a ``{"error": message}`` dictionary.

    Raises
    ------
    NameError
        If ``parser`` is omitted and ``parse_cti_report`` has not been defined yet.
    """
    if parser is None:
        # Resolve the default once, outside any try block, so a missing
        # definition (e.g. cells run out of order) fails loudly instead of
        # being recorded as a per-file "error" entry.
        parser = parse_cti_report

    report_results = {}

    for file_hash in file_hashes:
        file_path = os.path.join(folder_path, f"{file_hash}.download")

        # isfile() is already False for paths that do not exist.
        if not os.path.isfile(file_path):
            report_results[file_hash] = {"error": "File does not exist or is not readable."}
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            # I/O failure or content that is not valid UTF-8.
            report_results[file_hash] = {"error": f"Error reading file: {e}"}
            continue

        try:
            report_results[file_hash] = parser(content)
        except Exception as e:
            # Best effort: one malformed report must not stop the batch,
            # but it is labelled as a parsing problem, not a read problem.
            report_results[file_hash] = {"error": f"Error parsing file: {e}"}

    return report_results

In [34]:
# Folder holding the "<hash>.download" report files.
# NOTE(review): this rebinds the global `folder_path` to a different directory
# than the one used to collect `file_hashes`; it is also an absolute,
# machine-specific path.
folder_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads"
# Read and parse one report per hash; `file_hashes` must already have been
# computed by an earlier cell.
cti_reports = read_cti_reports(folder_path, file_hashes)
# Optional per-report dump, kept disabled:
# for hash_key, result in cti_reports.items():
#     if "error" not in result:
#         print(f"Hash: {hash_key}")
#         print("TTPs in Text Content:", result["ttps_in_text"])
#         print("TTPs in Table or List Content:", result["ttps_in_table_or_list"])
#     else:
#         print(f"Hash: {hash_key} - {result['error']}")
#     print("-" * 80)

In [37]:
# Keep only the reports that were parsed without error and that contain TTPs
# both in the free text and in table/list content.
reports_with_both_ttps = {
    hash_key: result
    for hash_key, result in cti_reports.items()
    if "error" not in result
    if result["ttps_in_text"]
    if result["ttps_in_table_or_list"]
}

# For every retained report, compare the two TTP collections. The table/list
# TTPs are scored against the text TTPs:
#   TP = present in both, FP = only in table/list, FN = only in text.
for hash_key, data in reports_with_both_ttps.items():
    # Sets make the comparison plain set algebra.
    ttps_in_text = set(data["ttps_in_text"])
    ttps_in_table_or_list = set(data["ttps_in_table_or_list"])

    true_positives = ttps_in_text.intersection(ttps_in_table_or_list)
    false_positives = ttps_in_table_or_list.difference(ttps_in_text)
    false_negatives = ttps_in_text.difference(ttps_in_table_or_list)

    # Precision = TP / (TP + FP); falls back to 0 when there is nothing to divide by.
    if true_positives or false_positives:
        precision = len(true_positives) / (len(true_positives) + len(false_positives))
    else:
        precision = 0

    # Recall = TP / (TP + FN); same fallback.
    if true_positives or false_negatives:
        recall = len(true_positives) / (len(true_positives) + len(false_negatives))
    else:
        recall = 0

    # One labelled line per item, followed by a separator rule.
    report_lines = (
        ("TTPs in Text Content:", sorted(ttps_in_text)),
        ("TTPs in Table or List Content:", sorted(ttps_in_table_or_list)),
        ("True Positives (TP):", sorted(true_positives)),
        ("False Positives (FP):", sorted(false_positives)),
        ("False Negatives (FN):", sorted(false_negatives)),
        ("Precision:", precision),
        ("Recall:", recall),
    )
    print(f"\nHash: {hash_key}")
    for label, value in report_lines:
        print(label, value)
    print("-" * 80)



Hash: 0c4e350517502f10c3986a7e20d7ce2b78fbc417a36aa22a370a56b124026e9b
TTPs in Text Content: [('T1001.001', '.001'), ('T1005', ''), ('T1027', ''), ('T1036.005', '.005'), ('T1041', ''), ('T1059.001', '.001'), ('T1059.006', '.006'), ('T1071.001', '.001'), ('T1132.002', '.002'), ('T1204.002', '.002'), ('T1480', ''), ('T1547.001', '.001'), ('T1566.001', '.001'), ('T1572', ''), ('T1574.002', '.002')]
TTPs in Table or List Content: [('T1003.001', '.001'), ('T1003.004', '.004'), ('T1003.005', '.005'), ('T1016', ''), ('T1027', ''), ('T1027.003', '.003'), ('T1027.004', '.004'), ('T1033', ''), ('T1036.005', '.005'), ('T1041', ''), ('T1047', ''), ('T1049', ''), ('T1053.005', '.005'), ('T1057', ''), ('T1059.001', '.001'), ('T1059.005', '.005'), ('T1059.006', '.006'), ('T1059.007', '.007'), ('T1071.001', '.001'), ('T1082', ''), ('T1083', ''), ('T1087.002', '.002'), ('T1090.002', '.002'), ('T1102.002', '.002'), ('T1104', ''), ('T1105', ''), ('T1113', ''), ('T1132.001', '.001'), ('T1132.002', '.002'

In [36]:
import os
import re
from bs4 import BeautifulSoup

def extract_ttps(content, pattern):
    """
    Returns every non-overlapping match of a regex pattern in the given content.

    Parameters
    ----------
    content : str
        The text content to search for TTPs.
    pattern : str
        The regex pattern to match TTPs.

    Returns
    -------
    list
        The matches in the order they occur. As with ``re.findall``, a pattern
        with several capture groups yields one tuple of groups per match
        rather than a plain string.
    """
    matcher = re.compile(pattern)
    return matcher.findall(content)

def parse_cti_report(content):
    """
    Splits a CTI report into structured content (tables, lists) and the remaining
    text, and collects the TTP identifiers found in each part.

    Parameters
    ----------
    content : str
        The complete content of a CTI report (parsed as HTML).

    Returns
    -------
    dict
        ``{"ttps_in_text": [...], "ttps_in_table_or_list": [...]}``. Both lists
        are de-duplicated and in no particular order. Because the TTP pattern
        has two capture groups, every entry is a tuple of the full identifier
        and its sub-technique suffix, e.g. ``('T1059.001', '.001')`` or
        ``('T1027', '')``. ``ttps_in_text`` is emptied when all of its entries
        also occur in the structured content.
    """
    ttp_pattern = r'\b(T[0-9]{4}([.][0-9]{3})?)\b'

    soup = BeautifulSoup(content, "html.parser")

    # 1. Structured content: whole tables first, then the items of unordered
    #    lists, then the items of ordered lists.
    structured_matches = []
    for table in soup.find_all("table"):
        structured_matches += extract_ttps(table.get_text(), ttp_pattern)

    for list_tag in ("ul", "ol"):
        for list_node in soup.find_all(list_tag):
            for item in list_node.find_all("li"):
                structured_matches += extract_ttps(item.get_text(), ttp_pattern)

    # Collapse repeated hits.
    ttps_in_table_or_list = list(set(structured_matches))

    # 2. Detach the structured elements so only the remaining text is scanned.
    for node in soup.find_all(["table", "ul", "ol"]):
        node.extract()

    remaining_text = soup.get_text()
    ttps_in_text = list(set(extract_ttps(remaining_text, ttp_pattern)))

    # 3. If the text adds nothing beyond what the tables/lists already contain,
    #    report no text TTPs at all.
    if set(ttps_in_text) <= set(ttps_in_table_or_list):
        ttps_in_text = []

    return {
        "ttps_in_text": ttps_in_text,
        "ttps_in_table_or_list": ttps_in_table_or_list
    }

In [None]:
Analysis hashes: 
0c4e350517502f10c3986a7e20d7ce2b78fbc417a36aa22a370a56b124026e9b
bff733b5ddd507076bcef720c7d068d3c970c7bb934bde080a29a091432973e8
5d39c90de10384fa86bafb4016761bf84f6d83964a06a38ef4c0db7f0d3b4532
a6c1cbd286cdc07366367c3a9313719dac1c472eb8cd65361f211781eeaf809c
