In [4]:
## 17 hashes are duplicates, i.e., they share the same URL and were downloaded as the same report with the same TTPs under two different hashes ->
## 120 - 17 = 103 is the true ground truth
# Function to find unique hashes with URLs and track removed hashes
import json
import os
import re
from collections import defaultdict


In [5]:
# Step 1: config.json is looked up in the parent of the current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
config_path = os.path.normpath(os.path.join(parent_dir, 'config.json'))

with open(config_path, mode='r', encoding='utf-8') as f:
    config = json.loads(f.read())

# Step 2: Relative paths stored in the config are resolved against this directory
config_dir = os.path.dirname(config_path)

In [10]:
def read_jsonl_to_dict(file_path):
    """
    Reads a JSONL file and converts it into a list of parsed JSON values.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being passed to the JSON parser.

    Parameters
    ----------
    file_path : str
        The path to the JSONL file (read as UTF-8).

    Returns
    -------
    list
        One parsed value per non-blank line, in file order. Each value is
        whatever ``json.loads`` yields for that line (a dict when the line
        holds a JSON object).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads("") raises, so ignore lines that carry no content
            if not line.strip():
                continue
            data.append(json.loads(line))

    return data

In [11]:
# Resolve the MITRE JSONL locations named in config.json, relative to the config directory
mitre_enterprise_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["MITRE_enterprise"])
)
mitre_mobileics_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["MITRE_mobileics"])
)
combined_json_files = [mitre_enterprise_jsonl, mitre_mobileics_jsonl]

# Report the entry count of each file.
# Note: `data` is rebound, so after this cell it holds the mobile/ICS entries.
data = read_jsonl_to_dict(mitre_enterprise_jsonl)
print(f"Total MITRE JSONL entries in enterprise: {len(data)}")
data = read_jsonl_to_dict(mitre_mobileics_jsonl)
print(f"Total MITRE JSONL entries in mobile/ics:: {len(data)}")

Total MITRE JSONL entries in enterprise: 1417
Total MITRE JSONL entries in mobile/ics:: 25


In [14]:
def count_files_in_folders(folder_path):
    """
    Count the files located directly inside a folder and print the total.

    Parameters
    ----------
    folder_path : str
        Folder whose immediate entries are inspected; sub-folders are not
        counted and are not descended into.

    Returns
    -------
    int
        Number of entries in ``folder_path`` that are files.
    """
    # Tally every directory entry that resolves to a file
    total_files_count = sum(
        1
        for entry_name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry_name))
    )

    print(f"Total number of files: {total_files_count}")

    return total_files_count

In [15]:
# Resolve the MITRE TTP and threat-report folders named in config.json
mitre_ttp_path = os.path.normpath(
    os.path.join(config_dir, config["directory_paths_ioc"]["TTP_MITRE"])
)
mitre_threatreport_path = os.path.normpath(
    os.path.join(config_dir, config["threat_report_directories"]["MITRE"])
)

# File totals: threat reports first, then the TTP files
total_reports = count_files_in_folders(mitre_threatreport_path)
total_reports_ttps = count_files_in_folders(mitre_ttp_path)

Total number of files: 920
Total number of files: 122


In [17]:
def count_cumulative_unique_ttps(folder_path):
    """
    Collect the unique TTP IDs mentioned across all files in a folder.

    Every file located directly inside ``folder_path`` is scanned line by
    line for IDs of the form ``T1003`` or ``T1003.001``. Sub-folders are not
    descended into. Entries whose name starts with ``_`` or ``.`` are treated
    as backup/hidden files and skipped (a message is printed for each).

    Parameters
    ----------
    folder_path : str
        Path to the folder whose files are scanned.

    Returns
    -------
    set of str
        The distinct TTP IDs found across all scanned files.
    """
    unique_ttps = set()

    # Regex pattern to capture full TTP IDs (e.g., T1003, T1003.001)
    ttp_pattern = re.compile(r'\bT\d{4}(?:\.\d+)?\b')

    for file_name in os.listdir(folder_path):
        # Skip hidden ('.'-prefixed) or backup ('_'-prefixed) files
        if file_name.startswith(('_', '.')):
            print(f"Skipping hidden or backup file: {file_name}")
            continue

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            # Explicit UTF-8 keeps decoding identical on every platform;
            # undecodable bytes are dropped rather than aborting the scan.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                for line in file:
                    unique_ttps.update(ttp_pattern.findall(line))
        except OSError as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error reading file {file_name}: {e}")

    return unique_ttps

In [18]:
unique_ttps = count_cumulative_unique_ttps(mitre_ttp_path)

# Display the unique TTPs
print(f"Cumulative Unique TTPs found: {len(unique_ttps)}")
#for ttp in unique_ttps:
#   print(ttp)


Cumulative Unique TTPs found: 470
