In [1]:
import os
import json
import difflib
import re

from collections import defaultdict

In [2]:
# Step 1: Locate and load config.json from the directory one level above the
# kernel's current working directory.
# NOTE(review): this depends on where the kernel was started — assumes the
# notebook runs from a direct subfolder of the project root; confirm.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
config_path = os.path.normpath(os.path.join(parent_dir, 'config.json'))

# open() raises if the file is absent; json.load() raises on malformed JSON.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Step 2: Directory that holds config.json. Later cells join the paths read
# from the config onto this directory.
config_dir = os.path.dirname(config_path)

In [3]:
# Sanity check: display the resolved config file path and its directory.
# NOTE: the rendered output contains absolute local paths; clear it before sharing.
config_path, config_dir

('C:\\Users\\ricewater\\Documents\\CTITTP\\config.json',
 'C:\\Users\\ricewater\\Documents\\CTITTP')

In [4]:
def read_jsonl_file(file_path):
    """
    Lazily reads a JSONL file and yields each non-blank line as a parsed JSON value.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being passed to the JSON parser,
    where they would raise ``json.JSONDecodeError``.

    Parameters
    ----------
    file_path : str
        The path of the JSONL file to read. The file is decoded as UTF-8.

    Yields
    ------
    dict
        The parsed JSON value of each non-blank line (a dictionary when the
        line holds a JSON object).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A line holding only whitespace carries no record; skip it.
            if not line.strip():
                continue
            yield json.loads(line)

def merge_jsonl_data(file_path1, file_path2):
    """
    Concatenates the records of two JSONL files into one list.

    Records keep their file order: everything from the first file comes
    first, followed by everything from the second file. No de-duplication
    is performed.

    Parameters
    ----------
    file_path1 : str
        The path to the first JSONL file.
    file_path2 : str
        The path to the second JSONL file.

    Returns
    -------
    list
        All records read from the first file followed by all records read
        from the second file.
    """
    # Materialize the first file, then append the second file's records.
    merged_records = list(read_jsonl_file(file_path1))
    merged_records.extend(read_jsonl_file(file_path2))
    return merged_records



In [5]:
# Build fully resolved and normalized paths
file_path1 = os.path.normpath(os.path.join(config_dir, config['jsonl_files']['Malpedia_2022']))
file_path2 = os.path.normpath(os.path.join(config_dir, config['jsonl_files']['Malpedia_2024']))

# Use the paths
merged_data = merge_jsonl_data(file_path1, file_path2)

In [6]:
# Report how many records the merged list holds.
print(f"Total merged entries: {len(merged_data)}")

Total merged entries: 15768


In [7]:
def count_jsonl_entries(file_path):
    """
    Counts the number of entries in a JSONL file.

    One entry is one non-blank line. Blank or whitespace-only lines (such as
    a trailing empty line) are not counted, because they hold no record. The
    lines are not parsed, so malformed JSON is still counted.

    Parameters
    ----------
    file_path : str
        The path of the JSONL file to count entries. The file is decoded as UTF-8.

    Returns
    -------
    int
        The count of non-blank lines in the JSONL file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file so that large inputs are never held in memory.
        return sum(1 for line in file if line.strip())


In [8]:
# Show the count of entries in each file
count1 = count_jsonl_entries(file_path1)
count2 = count_jsonl_entries(file_path2)

print(f"File 1 has {count1} entries.")
print(f"File 2 has {count2} entries.")

File 1 has 12050 entries.
File 2 has 3718 entries.


In [9]:
def find_duplicate_urls(merged_data):
    """
    Groups download_sha256 values by URL and reports the URLs seen more than once.

    Entries that lack a truthy 'url' or a truthy 'download_sha256' are ignored.

    Parameters
    ----------
    merged_data : list of dict
        The merged JSONL data.

    Returns
    -------
    dict
        Maps each URL that appears in at least two usable entries to the list
        of download_sha256 values of those entries, in input order.
    """
    hashes_by_url = {}

    for record in merged_data:
        link = record.get('url')
        digest = record.get('download_sha256')

        # Both fields must be present and non-empty for the entry to count.
        if not (link and digest):
            continue

        hashes_by_url.setdefault(link, []).append(digest)

    # A URL is a duplicate once two or more hashes were collected for it.
    return {
        link: digests
        for link, digests in hashes_by_url.items()
        if len(digests) >= 2
    }


# Find and print duplicate URLs and their download_sha256 values
duplicate_urls = find_duplicate_urls(merged_data)
#print(duplicate_urls)


if duplicate_urls:
    print(f"Total entries with duplicate URLs: {len(duplicate_urls)}")
    #print("Entries with duplicate URLs:")
    
    #for url, sha256s in duplicate_urls.items():
    #   print(f"URL: {url}")
    #   print(f"Download SHA256 values: {sha256s}\n")
else:
    print("No duplicate URLs found.")

Total entries with duplicate URLs: 67


In [10]:
def count_files_in_folders(folder_path, secondary_folder_path):
    """
    Count the regular files directly inside two folders and print a summary.

    Only direct children are considered; subdirectories are neither counted
    nor descended into.

    Parameters
    ----------
    folder_path : str
        The first folder whose files are counted.
    secondary_folder_path : str
        The second folder whose files are counted.

    Returns
    -------
    int
        The total count of files in both folders.
    """
    def _regular_file_count(directory):
        # Number of directory entries that are regular files.
        return sum(
            1
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    primary_count = _regular_file_count(folder_path)
    secondary_count = _regular_file_count(secondary_folder_path)
    combined_count = primary_count + secondary_count

    # Per-folder figures first, then the grand total.
    print(f"Total number of files in {folder_path}: {primary_count}")
    print(f"Total number of files in {secondary_folder_path}: {secondary_count}")
    print(f"Total number of files in both folders: {combined_count}")

    return combined_count


In [11]:
# Step 4: Build full normalized paths dynamically
malpedia_2022_reports = os.path.normpath(os.path.join(config_dir, config["threat_report_directories"]["Malpedia_2022"]))
malpedia_2024_reports = os.path.normpath(os.path.join(config_dir, config["threat_report_directories"]["Malpedia_2024"]))


In [12]:
# Get total file count
total_files = count_files_in_folders(malpedia_2022_reports, malpedia_2024_reports)

Total number of files in C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\documents: 11363
Total number of files in C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_malpedia_downloads: 3620
Total number of files in both folders: 14983


In [13]:
# Resolve the two directories listed under config["directory_paths_ioc"]
# (keys 'TTP_Malpedia_2022' and 'TTP_Malpedia_2024') against the config directory.
malpedia_2022_ttp = os.path.normpath(os.path.join(config_dir, config["directory_paths_ioc"]["TTP_Malpedia_2022"]))
malpedia_2024_ttp = os.path.normpath(os.path.join(config_dir, config["directory_paths_ioc"]["TTP_Malpedia_2024"]))

# Count the files in both directories.
# NOTE: this rebinds total_files, replacing the report-file total computed in
# the previous cell.
total_files = count_files_in_folders(malpedia_2022_ttp, malpedia_2024_ttp)

Total number of files in C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\iocs: 1060
Total number of files in C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\iocs: 590
Total number of files in both folders: 1650


In [14]:
def count_cumulative_unique_ttps_malpedia(folder_path, secondary_folder_path):
    """
    Collect the unique TTP IDs mentioned in the files of two folders.

    Every regular file directly inside each folder is scanned line by line;
    subdirectories are not descended into. Entries whose name starts with an
    underscore are skipped with a printed notice. Files are decoded as UTF-8
    with undecodable bytes dropped, so the result does not depend on the
    platform's default text encoding. A file that cannot be read is reported
    on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Path to the first folder.
    secondary_folder_path : str
        Path to the second folder.

    Returns
    -------
    set
        A set of unique TTP IDs found in both folders.

    Raises
    ------
    OSError
        If one of the folders cannot be listed.
    """
    unique_ttps = set()

    # Regex pattern to capture full TTP IDs (e.g., T1003, T1003.001)
    ttp_pattern = re.compile(r'\bT\d{4}(?:\.\d+)?\b')

    for folder in (folder_path, secondary_folder_path):
        for file_name in os.listdir(folder):
            # Skip entries whose name starts with an underscore.
            if file_name.startswith('_'):
                print(f"Skipping hidden or backup file: {file_name}")
                continue

            file_path = os.path.join(folder, file_name)
            if not os.path.isfile(file_path):
                continue

            try:
                # An explicit encoding keeps the decoding identical on every
                # platform. A legacy code page could turn multi-byte UTF-8
                # sequences into letters that defeat the \b word boundaries.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                    for line in file:
                        unique_ttps.update(ttp_pattern.findall(line))
            except OSError as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error reading file {file_name}: {e}")

    return unique_ttps


In [15]:
# Collect the unique TTP IDs mentioned across both Malpedia directories.
unique_ttps_malpedia= count_cumulative_unique_ttps_malpedia(malpedia_2022_ttp, malpedia_2024_ttp)

# Report how many distinct TTP IDs were found (the IDs themselves are not printed).
print(f"Cumulative Unique TTPs found: {len(unique_ttps_malpedia)}")

Cumulative Unique TTPs found: 879


In [16]:
def count_cumulative_unique_ttps_mitre(folder_path):
    """
    Collect the unique TTP IDs mentioned in the files of one folder.

    Every regular file directly inside the folder is scanned line by line;
    subdirectories are not descended into. Entries whose name starts with an
    underscore are skipped with a printed notice. Files are decoded as UTF-8
    with undecodable bytes dropped, so the result does not depend on the
    platform's default text encoding. A file that cannot be read is reported
    on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Path to the folder to scan.

    Returns
    -------
    set
        A set of unique TTP IDs found in the folder.

    Raises
    ------
    OSError
        If the folder cannot be listed.
    """
    unique_ttps = set()

    # Regex pattern to capture full TTP IDs (e.g., T1003, T1003.001)
    ttp_pattern = re.compile(r'\bT\d{4}(?:\.\d+)?\b')

    for file_name in os.listdir(folder_path):
        # Skip entries whose name starts with an underscore.
        if file_name.startswith('_'):
            print(f"Skipping hidden or backup file: {file_name}")
            continue

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            # An explicit encoding keeps the decoding identical on every
            # platform. A legacy code page could turn multi-byte UTF-8
            # sequences into letters that defeat the \b word boundaries.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                for line in file:
                    unique_ttps.update(ttp_pattern.findall(line))
        except OSError as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Error reading file {file_name}: {e}")

    return unique_ttps

In [17]:
# Resolve the directory listed under config["directory_paths_ioc"]["TTP_MITRE"]
# against the config directory and collect the unique TTP IDs mentioned in its files.
mitre_ttp_path = os.path.normpath(os.path.join(config_dir, config["directory_paths_ioc"]["TTP_MITRE"]))
unique_ttps_mitre = count_cumulative_unique_ttps_mitre(mitre_ttp_path)

# Report how many distinct TTP IDs were found (the IDs themselves are not printed).
print(f"Cumulative Unique TTPs found: {len(unique_ttps_mitre)}")


Cumulative Unique TTPs found: 470


In [18]:
# Every TTP ID seen in either source (MITRE or Malpedia).
union_ttps = set(unique_ttps_mitre) | set(unique_ttps_malpedia)

In [19]:
# Number of distinct TTP IDs across the MITRE and Malpedia sets combined.
len(union_ttps)

883