In [1]:
import json

def read_jsonl_file(file_path):
    """
    Reads a specified JSONL file and yields each non-blank line as a JSON object.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to ``json.loads``,
    which would raise ``json.JSONDecodeError`` on them.

    Parameters
    ----------
    file_path : str
        The path of the JSONL file to read. The file is decoded as UTF-8.

    Yields
    ------
    dict
        The decoded JSON value of each non-blank line (a dictionary when the
        line holds a JSON object).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # An empty line carries no record; skipping it keeps a stray
            # trailing newline from aborting the whole read.
            if not line.strip():
                continue
            yield json.loads(line)

def merge_jsonl_data(file_path1, file_path2):
    """
    Concatenates the records of two JSONL files into one list.

    Records keep their original order: every record of the first file comes
    first, followed by every record of the second file. No de-duplication or
    field-level merging is performed.

    Parameters
    ----------
    file_path1 : str
        The path to the first JSONL file.
    file_path2 : str
        The path to the second JSONL file.

    Returns
    -------
    list
        All records of the first file followed by all records of the second.
    """
    # Materialise the first file completely before the second one is opened.
    merged_records = list(read_jsonl_file(file_path1))
    merged_records.extend(read_jsonl_file(file_path2))
    return merged_records



In [2]:
# Example usage: merge two JSONL files into one in-memory list.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs as-is on a machine with the same directory layout. The file names
# suggest snapshots from 2022-07-18 and 2024-12-04 -- confirm.
file_path1 = r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\malpedia-db_2022-07-18_downloader.jsonl"
file_path2 = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_downloads.jsonl"

# `merged_data`, `file_path1` and `file_path2` are read again by later cells.
merged_data = merge_jsonl_data(file_path1, file_path2)

In [3]:
# Report how many records the merged list holds (both files combined).
print(f"Total merged entries: {len(merged_data)}")

Total merged entries: 15765


In [4]:
# Print the last merged entry, then the first one, as a quick look at both
# ends of the list (one record from each slice, not ten).
for entry in merged_data[-1:]:
    print(entry)
for entry in merged_data[:1]:
    print(entry)

{'download_mime': 'text/html', 'download_sha256': '4aa0f97c38e6e8cd1a25f74e477c5e3192da4cad09d3d31d1af2528eeb11657d', 'download_size': 312435, 'download_status': 200, 'download_ts': '2024-12-05 10:52:00.830787+00:00', 'url': 'https://www.trendmicro.com/en_us/research/22/k/deimosc2-what-soc-analysts-and-incident-responders-need-to-know.html'}
{'author': 'CERT Division', 'date': '2000', 'download_mime': 'text/html', 'download_redirects': ['https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'], 'download_sha256': 'f2c405b383ebaf4d0793f8d5162841b953d06947a711f7d34242faa20e285a04', 'download_size': 41745, 'download_status': 200, 'download_ts': '2022-07-19 12:43:41.400938+00:00', 'language': 'English', 'organization': 'Carnegie Mellon University', 'origin': ['malpedia:CarnegieMellonUniversity'], 'title': '2000 CERT Advisories', 'url': 'https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'}


In [5]:
def count_jsonl_entries(file_path):
    """
    Counts the number of entries in a JSONL file.

    An entry is a non-blank line. Empty or whitespace-only lines (such as a
    trailing blank line) hold no record and are not counted. Lines are not
    parsed, so a malformed JSON line still counts as one entry.

    Parameters
    ----------
    file_path : str
        The path of the JSONL file to count entries. The file is decoded as
        UTF-8.

    Returns
    -------
    int
        The count of non-blank lines in the JSONL file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file line by line so large files are never held in memory.
        return sum(1 for line in file if line.strip())


In [6]:
# Count the entries of each input file separately so the two numbers can be
# checked against the size of the merged list printed earlier.
count1 = count_jsonl_entries(file_path1)
count2 = count_jsonl_entries(file_path2)

print(f"File 1 has {count1} entries.")
print(f"File 2 has {count2} entries.")

File 1 has 12047 entries.
File 2 has 3718 entries.


In [8]:
from collections import defaultdict

def find_duplicate_urls(merged_data):
    """
    Groups download hashes by URL and keeps the URLs that occur more than once.

    Entries whose 'url' or 'download_sha256' is missing or empty are ignored.
    Hashes are kept in input order and are not de-duplicated, so a URL that
    was downloaded twice with the same content lists the same hash twice.

    Parameters
    ----------
    merged_data : list of dict
        The merged JSONL data.

    Returns
    -------
    dict
        Maps each URL seen in at least two usable entries to the list of
        download_sha256 values of those entries. URLs appear in order of first
        occurrence.
    """
    hashes_by_url = {}

    for record in merged_data:
        record_url = record.get('url')
        record_hash = record.get('download_sha256')

        # Both fields are required to attribute a hash to a URL.
        if not (record_url and record_hash):
            continue

        hashes_by_url.setdefault(record_url, []).append(record_hash)

    # Keep only the URLs that collected two or more hashes.
    repeated = {}
    for record_url, hashes in hashes_by_url.items():
        if len(hashes) >= 2:
            repeated[record_url] = hashes

    return repeated


# Find and print duplicate URLs and their download_sha256 values
duplicate_urls = find_duplicate_urls(merged_data)


if duplicate_urls:
    # len(duplicate_urls) is the number of distinct URLs that occur more than
    # once, not the number of individual entries sharing a URL.
    print(f"Total entries with duplicate URLs: {len(duplicate_urls)}")
    print("Entries with duplicate URLs:")
    
    # One URL per paragraph, followed by every hash recorded for it.
    for url, sha256s in duplicate_urls.items():
        print(f"URL: {url}")
        print(f"Download SHA256 values: {sha256s}\n")
else:
    print("No duplicate URLs found.")


Total entries with duplicate URLs: 65
Entries with duplicate URLs:
URL: https://www.fireeye.com/content/dam/fireeye-www/global/en/current-threats/pdfs/wp-operation-ke3chang.pdf
Download SHA256 values: ['f66df25e85e87a89716affa3390147e946239d579911c6f855c33f6c51c3e1fb', 'e94bcd0505a3658455ccc516ba084c19042c550bfee458ef71b3bbc124d2e1df']

URL: https://www.zdnet.com/article/source-code-of-iranian-cyber-espionage-tools-leaked-on-telegram/
Download SHA256 values: ['8a098de5ed486c2c4c4b0eac01a97c5b765c4b9278927973ddd904bc860bda04', '2ca83af9f155da0378010e7f3fe580b7bdbe9e08533e3e6b3c3b6adfb31332ae']

URL: https://raw.githubusercontent.com/eric-erki/APT_CyberCriminal_Campagin_Collections/master/2017/2017.05.30.Lazarus_Arisen/Group-IB_Lazarus.pdf
Download SHA256 values: ['acd626acf50af8e30a681ccf88662b2bcecd5ec6053c18d6b460a42d9d726764', 'acd626acf50af8e30a681ccf88662b2bcecd5ec6053c18d6b460a42d9d726764']

URL: https://blog.talosintelligence.com/2017/03/dnsmessenger.html
Download SHA256 values: 

In [None]:
import os
import difflib

def compare_files_with_similarity(folder_paths: list, file1_name: str, file2_name: str, threshold: float = 0.9):
    """
    Decides whether two files have similar text content, using fuzzy matching.

    Each file name is looked up in every directory of ``folder_paths``; if a
    name exists in several directories, the match in the last one is used.
    Both files are read as UTF-8 text (undecodable bytes are replaced),
    surrounding whitespace is stripped and line endings are normalised to LF
    before the contents are compared with ``difflib.SequenceMatcher``.

    ``SequenceMatcher.ratio()`` is quadratic in the worst case, so it is only
    computed when needed:

    * identical contents are reported as ratio 1.00 without any matching;
    * ``real_quick_ratio()`` and ``quick_ratio()`` are cheap upper bounds on
      ``ratio()``; when one of them is already below ``threshold`` the files
      cannot be similar, and that bound is printed as "at most X".

    The outcome is printed as well as returned.

    Args:
        folder_paths (list): A list of possible directories to search for the files.
        file1_name (str): The name of the first file.
        file2_name (str): The name of the second file.
        threshold (float): The similarity threshold (between 0 and 1) to consider the files as similar.

    Returns:
        bool: True if the similarity ratio is at least ``threshold``. False if
        it is lower, if either file is not found in any directory, or if a
        file cannot be read.
    """
    file1_path = None
    file2_path = None

    # Check every directory for both files; a later directory overrides an
    # earlier match.
    for folder_path in folder_paths:
        candidate1 = os.path.join(folder_path, file1_name)
        if os.path.exists(candidate1):
            file1_path = candidate1
        candidate2 = os.path.join(folder_path, file2_name)
        if os.path.exists(candidate2):
            file2_path = candidate2

    # If the files are not found in any of the directories, return False
    if not file1_path or not file2_path:
        print(f"Could not find {file1_name} or {file2_name} in the provided directories.")
        return False

    # Only the reads are guarded: errors='replace' rules out decode errors, so
    # OSError is the failure this message actually describes.
    try:
        with open(file1_path, 'r', encoding='utf-8', errors='replace') as file1, \
                open(file2_path, 'r', encoding='utf-8', errors='replace') as file2:
            content1 = file1.read()
            content2 = file2.read()
    except OSError as e:
        print(f"Error reading files: {e}")
        return False

    # Strip leading/trailing whitespace and normalize line breaks
    content1 = content1.strip().replace('\r\n', '\n').replace('\r', '\n')
    content2 = content2.strip().replace('\r\n', '\n').replace('\r', '\n')

    if content1 == content2:
        # Equal strings always have ratio 1.0; skip the expensive matcher.
        similarity_ratio = 1.0
    else:
        seq_matcher = difflib.SequenceMatcher(None, content1, content2)

        # Cheapest bound first (lengths only), then the character-count bound.
        upper_bound = seq_matcher.real_quick_ratio()
        if upper_bound >= threshold:
            upper_bound = seq_matcher.quick_ratio()
        if upper_bound < threshold:
            print(f"Similarity ratio between {file1_name} and {file2_name}: at most {upper_bound:.2f}")
            print("The content of the files is not similar enough.")
            return False

        similarity_ratio = seq_matcher.ratio()

    print(f"Similarity ratio between {file1_name} and {file2_name}: {similarity_ratio:.2f}")

    # If similarity ratio is above the threshold, consider the files similar
    if similarity_ratio >= threshold:
        print("The content of the files is similar.")
        return True

    print("The content of the files is not similar enough.")
    return False

def compare_files_for_urls(duplicate_urls, folder_paths, threshold=0.9):
    """
    Compares files for each URL in the dictionary using fuzzy matching based on a threshold.

    For every URL, each unordered pair of its hashes is compared once. A hash
    ``h`` is expected to be stored as the file ``h.download`` in one of
    ``folder_paths``. When both hashes of a pair are equal, both names refer
    to the same file, so the pair is recorded as similar without reading it.
    Progress and results are printed as the comparison runs.

    Args:
        duplicate_urls (dict): A dictionary where keys are URLs and values are lists of file hashes.
        folder_paths (list): A list of possible directories where the files are located.
        threshold (float): The similarity threshold (between 0 and 1) to consider the files as similar.

    Returns:
        dict: Maps each URL to a list of ``(hash_a, hash_b, is_similar)``
        tuples, one per compared pair, in comparison order. A URL with fewer
        than two hashes maps to an empty list.
    """
    results = {}

    for url, file_hashes in duplicate_urls.items():
        print(f"Comparing files for URL: {url}")
        url_results = []

        # Compare each pair of files for this URL
        for i, first_hash in enumerate(file_hashes):
            for second_hash in file_hashes[i + 1:]:
                if first_hash == second_hash:
                    # Same hash means the same file name: comparing a file
                    # with itself only costs time.
                    print(f"Skipping {first_hash}.download: both entries share the same SHA256, so the content is identical.")
                    url_results.append((first_hash, second_hash, True))
                    continue

                file1_name = f"{first_hash}.download"
                file2_name = f"{second_hash}.download"

                print(f"Comparing {file1_name} with {file2_name}")

                # Compare the files and keep the verdict for the caller.
                is_similar = compare_files_with_similarity(folder_paths, file1_name, file2_name, threshold)
                url_results.append((first_hash, second_hash, is_similar))

        results[url] = url_results

    return results

# Example usage: look for the downloaded documents in both folders.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
folder_paths = [
    r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\documents", 
    r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_malpedia_downloads"
]  # List of possible directories

# Small hand-written sample of the expected input shape (URL -> list of
# SHA256 hashes), kept for reference. Uncomment to run on two URLs only
# instead of the full `duplicate_urls` computed in an earlier cell.
# duplicate_urls = {
#     'https://www.fireeye.com/content/dam/fireeye-www/global/en/current-threats/pdfs/wp-operation-ke3chang.pdf': [
#         'f66df25e85e87a89716affa3390147e946239d579911c6f855c33f6c51c3e1fb',
#         'e94bcd0505a3658455ccc516ba084c19042c550bfee458ef71b3bbc124d2e1df'
#     ],
#     'https://www.zdnet.com/article/source-code-of-iranian-cyber-espionage-tools-leaked-on-telegram/': [
#         '8a098de5ed486c2c4c4b0eac01a97c5b765c4b9278927973ddd904bc860bda04',
#         '2ca83af9f155da0378010e7f3fe580b7bdbe9e08533e3e6b3c3b6adfb31332ae'
#     ]
# }

# Relies on `duplicate_urls` from the duplicate-URL cell having been run first.
compare_files_for_urls(duplicate_urls, folder_paths, threshold=0.9)


Comparing files for URL: https://www.fireeye.com/content/dam/fireeye-www/global/en/current-threats/pdfs/wp-operation-ke3chang.pdf
Comparing f66df25e85e87a89716affa3390147e946239d579911c6f855c33f6c51c3e1fb.download with e94bcd0505a3658455ccc516ba084c19042c550bfee458ef71b3bbc124d2e1df.download


In [24]:
import os

def count_files_in_folders(folder_path, secondary_folder_path):
    """
    Counts the regular files directly inside two folders and prints a summary.

    Only direct children are considered; sub-directories are neither counted
    nor descended into. The per-folder counts and the combined total are
    printed.

    Parameters
    ----------
    folder_path : str
        The primary folder containing the files.
    secondary_folder_path : str
        The secondary folder containing the files.

    Returns
    -------
    int
        The total count of files in both folders.
    """
    per_folder_counts = []

    # Same counting rule for both folders, primary first.
    for directory in (folder_path, secondary_folder_path):
        file_total = 0
        for entry_name in os.listdir(directory):
            if os.path.isfile(os.path.join(directory, entry_name)):
                file_total += 1
        per_folder_counts.append(file_total)

    primary_count, secondary_count = per_folder_counts
    total_files_count = primary_count + secondary_count

    # Output the count
    print(f"Total number of files in {folder_path}: {primary_count}")
    print(f"Total number of files in {secondary_folder_path}: {secondary_count}")
    print(f"Total number of files in both folders: {total_files_count}")

    return total_files_count

# Example usage: count the files in the two "documents" download folders.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\documents"
secondary_folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_malpedia_downloads"

# Get total file count (per-folder counts are printed by the function)
total_files = count_files_in_folders(folder_path, secondary_folder_path)


Total number of files in C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\documents: 11363
Total number of files in C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_malpedia_downloads: 3620
Total number of files in both folders: 14983


In [7]:
# Example usage: same count for the two "iocs" folders.
# This rebinds `folder_path`, `secondary_folder_path` and `total_files`,
# replacing the values set for the "documents" folders.
folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\iocs"
secondary_folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\iocs"

# Get total file count (per-folder counts are printed by the function)
total_files = count_files_in_folders(folder_path, secondary_folder_path)

Total number of files in C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\iocs: 1058
Total number of files in C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\iocs: 590
Total number of files in both folders: 1648


In [8]:
import os
import re

def count_cumulative_unique_ttps(folder_path, secondary_folder_path):
    """
    Collect the unique TTP IDs found across all files in the given folder paths.

    Every regular file directly inside either folder is scanned line by line
    for IDs of the form ``T1234`` or ``T1234.001``. Entries whose name starts
    with an underscore are skipped, and sub-directories are not descended
    into. Files are decoded as UTF-8 with undecodable bytes dropped, so the
    result does not depend on the platform's default encoding.

    Parameters
    ----------
    folder_path : str
        Path to the first folder.
    secondary_folder_path : str
        Path to the second folder.

    Returns
    -------
    set
        A set of unique TTP IDs found in both folders. Use ``len()`` on it to
        get the count.
    """
    unique_ttps = set()

    # Regex pattern to capture full TTP IDs (e.g., T1003, T1003.001)
    ttp_pattern = re.compile(r'\bT\d{4}(?:\.\d+)?\b')

    # Loop through both folders
    for folder in [folder_path, secondary_folder_path]:
        for file_name in os.listdir(folder):
            file_path = os.path.join(folder, file_name)

            # Skip entries whose name starts with an underscore (reported
            # below as hidden/backup files).
            if file_name.startswith('_'):
                print(f"Skipping hidden or backup file: {file_name}")
                continue

            if os.path.isfile(file_path):
                try:
                    # Explicit UTF-8 keeps decoding identical on every
                    # platform; errors='ignore' tolerates binary content.
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                        for line in file:
                            # Add every TTP ID found in the line to the cumulative set
                            unique_ttps.update(ttp_pattern.findall(line))
                except OSError as e:
                    # Best effort: report an unreadable file and continue.
                    print(f"Error reading file {file_name}: {e}")

    return unique_ttps

# Example usage: scan both "iocs" folders for TTP IDs.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\iocs"
secondary_folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\iocs"

unique_ttps = count_cumulative_unique_ttps(folder_path, secondary_folder_path)

# Display how many distinct TTP IDs were found; uncomment the loop below to
# list the individual IDs as well.
print(f"Cumulative Unique TTPs found: {len(unique_ttps)}")
#for ttp in unique_ttps:
#   print(ttp)

Cumulative Unique TTPs found: 876
