In [5]:
import pandas as pd
import json
import os
from statistics import mode
from collections import Counter, defaultdict
import requests
import re

In [6]:
def load_config():
    """
    Load the project's ``config.json`` from the parent of the working directory.

    The notebook is expected to run from a sub-folder of the project, so the
    project root is taken to be one level above ``os.getcwd()``.

    Returns
    -------
    tuple
        ``(config, project_root)`` where ``config`` is the parsed JSON content
        and ``project_root`` is the absolute, normalized project root path.

    Raises
    ------
    FileNotFoundError
        If ``config.json`` does not exist in the project root.
    """
    # os.path.abspath already normalizes the path, so no extra normpath is needed.
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
    config_path = os.path.join(project_root, 'config.json')

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found at expected location: {config_path}")

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    return config, project_root


In [7]:
# Load the shared configuration once; project_root anchors the relative paths built in later cells.
config, project_root = load_config()

In [8]:
def extract_citations(citations_raw):
    """
    Extract the unique citation names from a raw citation string.

    The raw value is expected to look like
    ``"(Citation: Name A), (Citation: Name B)(Citation: Name C)"``. Each
    citation is stripped of parentheses and surrounding commas/spaces, and
    anything from the first period onwards is dropped.

    Args:
        citations_raw (str): The raw citation string. ``None``/NaN and the
            empty string yield an empty list; other non-string scalars are
            converted with ``str()``.

    Returns:
        list: Unique citation names in order of first appearance.
    """
    if pd.isna(citations_raw) or citations_raw == '':
        return []

    if not isinstance(citations_raw, str):
        citations_raw = str(citations_raw)

    # Drop the 'Citation: ' label so only the citation names remain.
    citations_raw = citations_raw.replace("Citation: ", "")

    # Split on ')(' with an optional comma and optional whitespace in between;
    # this single pattern covers both '), (' and ')('.
    citations = re.split(r"\)\s*,?\s*\(", citations_raw)

    cleaned_citations = []
    for citation in citations:
        citation = citation.replace("(", "").replace(")", "").strip(", ").strip()

        # Keep only the part before the first period (drops embedded sentences).
        citation = re.sub(r'\s*\..*$', '', citation)

        # Remove a comma that may be left dangling after the truncation above.
        citation = re.sub(r',\s*$', '', citation)

        if citation:
            cleaned_citations.append(citation)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs (a set would give an arbitrary order).
    return list(dict.fromkeys(cleaned_citations))

In [9]:
def build_group_citation_map(excel_file):
    """
    Map every group in a MITRE ATT&CK group workbook to its citations and URLs.

    Parameters
    ----------
    excel_file : str
        Path to the Excel workbook; it must contain a 'groups' sheet and a
        'citations' sheet.

    Returns
    -------
    dict
        ``{group_id: {"associated_groups_citations": {name: url},
        "relationship_citations": {name: url}}}``. A citation whose name is
        absent from the 'citations' sheet is mapped to "URL not found".
    """
    groups_sheet = pd.read_excel(excel_file, sheet_name='groups')
    citations_sheet = pd.read_excel(excel_file, sheet_name='citations')

    # Lookup table: citation reference name -> URL.
    url_by_reference = dict(zip(citations_sheet['reference'], citations_sheet['url']))

    def _citations_with_urls(raw_cell):
        """Parse one raw citation cell and attach the URL for each citation name."""
        names = extract_citations(raw_cell) if pd.notna(raw_cell) else []
        return {name: url_by_reference.get(name, "URL not found") for name in names}

    citations_by_group = {}
    for _, group_row in groups_sheet.iterrows():
        group_id = group_row['ID']
        raw_associated = group_row.get('associated groups citations')
        raw_relationship = group_row.get('relationship citations')

        citations_by_group[group_id] = {
            "associated_groups_citations": _citations_with_urls(raw_associated),
            "relationship_citations": _citations_with_urls(raw_relationship),
        }

    return citations_by_group

In [10]:
def merge_group_citation_maps(group_citations_map, new_data):
    """
    Merge new group citation data into the existing map, in place.

    Groups that are not yet present are added; for groups that already exist,
    the citation dictionaries are updated with the new entries (a citation
    name that appears in both takes the value from ``new_data``).

    ``new_data`` is never modified: the citation dictionaries of newly added
    groups are copied rather than shared, so later merges into
    ``group_citations_map`` cannot leak back into the caller's input.

    Parameters
    ----------
    group_citations_map : dict
        The existing group citation map. Updated in place.
    new_data : dict
        New citation data to merge in.

    Returns
    -------
    None
    """
    sections = ("associated_groups_citations", "relationship_citations")

    for group_id, data in new_data.items():
        if group_id not in group_citations_map:
            # Copy each citation dict so the stored entry does not alias new_data.
            group_citations_map[group_id] = {
                key: dict(value) if isinstance(value, dict) else value
                for key, value in data.items()
            }
            continue

        existing = group_citations_map[group_id]
        for section in sections:
            existing.setdefault(section, {}).update(data.get(section, {}))


In [11]:
# One group workbook per matrix, always in the order enterprise -> mobile -> ics.
group_paths = [
    os.path.normpath(
        os.path.join(
            project_root,
            config["data_directory"],
            config["file_paths_groups_v15"][matrix],
        )
    )
    for matrix in ("enterprise", "mobile", "ics")
]

In [12]:
# Accumulate the citations of every workbook into a single map keyed by group ID.
group_citations_map = {}

# NOTE: `file_path` and `new_data` stay bound at notebook scope after this loop
# (to the last workbook path and its data); any later cell that reuses these
# names must assign them first.
for file_path in group_paths:
    new_data = build_group_citation_map(file_path)
    merge_group_citation_maps(group_citations_map, new_data)


In [13]:
# Number of distinct group IDs collected across all workbooks.
len(group_citations_map)

152

In [14]:
# Count the citation entries of every group across both citation sections.
# Every parsed citation has an entry here, including those whose URL could not
# be resolved and therefore carry the "URL not found" placeholder.
total_urls = sum(
    len(group_data.get(section, {}))
    for group_data in group_citations_map.values()
    for section in ('associated_groups_citations', 'relationship_citations')
)
num_groups = len(group_citations_map)

print(f"Total number of URLs across the whole dictionary: {total_urls}")
print(f"Total number of groups: {num_groups}")


Total number of URLs across the whole dictionary: 1212
Totak number of groups: 152


In [15]:
# Function to load JSONL data
def load_jsonl(file_path):
    """
    Load a JSONL (JSON Lines) file into a list.

    Each non-blank line is parsed as one JSON value. Blank or whitespace-only
    lines (for example a trailing empty line) are skipped instead of raising.

    Parameters
    ----------
    file_path : str
        Path to the JSONL file, read as UTF-8.

    Returns
    -------
    list
        The parsed JSON values, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    json_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                json_data.append(json.loads(line))
    return json_data

In [16]:
# Path of the MITRE enterprise JSONL file, resolved against the project root.
mitre_jsonl_path = os.path.normpath(os.path.join(project_root, config['jsonl_files']['MITRE_enterprise']))

# Parse the file into a list with one entry per JSONL line.
jsonl_data = load_jsonl(mitre_jsonl_path)

In [17]:
# Number of records loaded from the JSONL file.
len(jsonl_data)

1417

In [18]:
def extract_file_hash_from_filename(filename):
    """
    Return the file hash encoded in an IOC filename.

    The hash is whatever precedes a recognised suffix:

    - ``<hash>.download.iocs`` -> ``<hash>``
    - ``<hash>.iocs``          -> ``<hash>``

    Parameters
    ----------
    filename : str
        The input filename.

    Returns
    -------
    str or None
        The part of the filename before the suffix, or None when the filename
        ends with neither suffix.
    """
    # The longer suffix is tried first so that '.download' is removed as well.
    for suffix in ('.download.iocs', '.iocs'):
        if filename.endswith(suffix):
            return filename[:len(filename) - len(suffix)]

    return None

In [19]:
# Folder holding the .iocs files to analyse. It currently points at the CVE folder.
folder_path = os.path.normpath(os.path.join(project_root, config['directory_paths_ioc']['CVE_MITRE']))

# To analyse TTPs instead, point folder_path at the TTP folder by enabling the line below.
#folder_path = os.path.normpath(os.path.join(project_root, config['directory_paths_ioc']['TTP_MITRE']))

In [20]:
# Index the JSONL records by their download SHA-256 once, instead of rescanning
# the whole list for every file. setdefault keeps the first record seen for a
# hash, which matches a linear scan that stops at the first match.
url_by_sha256 = {}
for entry in jsonl_data:
    url_by_sha256.setdefault(entry.get('download_sha256'), entry.get('url'))

# Dictionary to store file hash -> URL mapping
hash_to_url_map = {}
for filename in os.listdir(folder_path):
    # The hash is the filename without its '.iocs' / '.download.iocs' suffix.
    file_hash = extract_file_hash_from_filename(filename)
    if file_hash and file_hash in url_by_sha256:
        hash_to_url_map[file_hash] = url_by_sha256[file_hash]

In [21]:
# Number of file hashes for which a matching JSONL record was found.
print(len(hash_to_url_map))

266


In [22]:
# Spot check: URL recorded for one known file hash.
hash_to_url_map.get("e3394ddac9c861e5be26eb67123da606de98786a1f1c6cc06cfeaf151bc67ac8")

'https://www.dragos.com/wp-content/uploads/CRASHOVERRIDE2018.pdf'

In [23]:
# file hash -> {'group_id': ..., 'url': ...} for every hash whose URL is cited by a group.
hash_group_url_map = {}

for file_hash, url in hash_to_url_map.items():
    # Walk the groups in map order and stop at the first one citing this URL,
    # so a URL cited by several groups is attributed to the first group only.
    for group_id, group_data in group_citations_map.items():
        cited_as_associated = url in group_data.get('associated_groups_citations', {}).values()
        if cited_as_associated or url in group_data.get('relationship_citations', {}).values():
            hash_group_url_map[file_hash] = {'group_id': group_id, 'url': url}
            break

In [24]:
# Spot check: group and URL attributed to one known file hash.
hash_group_url_map.get("e3394ddac9c861e5be26eb67123da606de98786a1f1c6cc06cfeaf151bc67ac8")

{'group_id': 'G0034',
 'url': 'https://www.dragos.com/wp-content/uploads/CRASHOVERRIDE2018.pdf'}

In [25]:
# Hashes that have a URL but could not be attributed to any group.
missing_hashes = {
    file_hash: source_url
    for file_hash, source_url in hash_to_url_map.items()
    if file_hash not in hash_group_url_map
}

# List each unattributed hash together with its URL.
for hash_key, url in missing_hashes.items():
    print(f"Hash: {hash_key}, URL: {url}")

Hash: 05bc7a68fdfe54c95ba1fb7360f2cb73bbfbdbbe939c29d764abf59f975a6a3a, URL: https://www.infosecurity-magazine.com/news/microsoft-zero-day-traced-russian/
Hash: 2c9e582e0194bacc4e4bbb37ffe61ed7e89af5cc5748fdc001e9dd65ddfaa32f, URL: https://securelist.com/apt-trends-report-q1-2018/85280/
Hash: 5b6328ed41cb49229d8d47046caabae1fdb90045c467d6509ae1f459a9b5b518, URL: https://www.intezer.com/wp-content/uploads/2021/09/TeamTNT-Cryptomining-Explosion.pdf
Hash: 72beb22ceed285d666ec7912dfcb95e7107c4232e622026915ef1bcd3c593490, URL: https://unit42.paloaltonetworks.com/ukraine-targeted-outsteel-saintbot/
Hash: 73eac7a13e4c15ce849d7a12a8d56eb3d831b6b442bf9ce7bc43afc1caafde9c, URL: https://www.us-cert.gov/ncas/alerts/TA17-164A
Hash: cabd66802a057829a0113bc5e53ac0c2c48f91142e8a40e10aac0d9d6aebbe98, URL: https://www.bleepingcomputer.com/news/security/ukraine-links-members-of-gamaredon-hacker-group-to-russian-fsb/
Hash: e2f84d3c77547f31ba782c0bb5525980059f651931e2b1dbbcd0a81f4430a1db, URL: https://secu

In [26]:
def read_cves_in_folder(folder_path):
    """
    Read every ``.iocs`` file in a folder and collect the CVEs it lists.

    The file hash is the part of the filename before the first dot. Inside a
    file, every line starting with ``cve`` contributes one entry: the first
    four characters (``cve`` plus the separator) are removed and the rest is
    stripped. Lines that carry no value after the prefix are ignored, so the
    result never contains empty strings. Duplicate CVEs within a file are kept.

    Files are processed in sorted filename order so the resulting dictionary is
    reproducible; if two filenames share the same hash, the later one in that
    order wins.

    Args:
        folder_path (str): The path to the folder containing the .iocs files.

    Returns:
        dict: A dictionary where the key is the file hash and the value is the
        list of CVEs found in that file (possibly empty). Files that cannot be
        read or decoded as UTF-8 are reported on stdout and skipped.
    """
    iocs_dict = {}

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".iocs"):
            continue

        file_hash = filename.split(".")[0]
        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content_lines = file.read().strip().split('\n')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading file {filename}: {e}")
            continue

        cleaned_content = []
        for line in content_lines:
            if line.startswith("cve"):
                # Drop 'cve' plus the one-character separator, then trim whitespace.
                cleaned_line = line[4:].strip()
                if cleaned_line:
                    cleaned_content.append(cleaned_line)

        iocs_dict[file_hash] = cleaned_content

    return iocs_dict

In [27]:
# file hash -> list of CVEs, read from every .iocs file in folder_path.
cve_data = read_cves_in_folder(folder_path)

In [28]:
def read_ttps_in_folder(folder_path):
    """
    Collect the technique IDs (TTPs) listed in each ``.download.iocs`` file.

    - The file hash comes from the filename via ``extract_file_hash_from_filename``.
    - Only lines beginning with ``ttp\\tT`` are used; the ID is the second
      tab-separated field, cut at its first space (e.g. ``T1204``).

    Parameters
    ----------
    folder_path : str
        The path to the folder containing .download.iocs files.

    Returns
    -------
    dict
        File hash -> list of extracted TTP IDs. Files that fail to read or
        parse are reported on stdout and left out.
    """
    ttps_dict = {}

    for filename in os.listdir(folder_path):
        if not filename.endswith(".download.iocs"):
            continue

        file_hash = extract_file_hash_from_filename(filename)
        if not file_hash:
            # Nothing is left of the name once the suffix is removed.
            print(f"Skipping invalid filename: {filename}")
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                raw_lines = file.read().strip().split('\n')

            ttps = []
            for line in raw_lines:
                if line.startswith("ttp\tT"):
                    technique_field = line.split('\t')[1]
                    # Keep only the ID, dropping anything after the first space.
                    ttps.append(technique_field.split(' ')[0])

            ttps_dict[file_hash] = ttps

        except Exception as e:
            print(f"Error reading file {filename}: {e}")

    return ttps_dict


In [29]:
# file hash -> list of TTP IDs, read from the .download.iocs files in folder_path.
# NOTE(review): folder_path is the same folder used for the CVE read above; the
# assignment that points it at the TTP folder is commented out. Confirm this is intended.
ttps_data = read_ttps_in_folder(folder_path)

In [30]:
def combine_hash_data(iocs_data, hash_group_url_map, data_type='cves'):
    """
    Group hash-based IOC data by threat group.

    Only hashes that appear in ``hash_group_url_map`` are kept; each one is
    appended to the entry of the group it is attributed to.

    Args:
        iocs_data (dict): Dictionary where keys are file hashes and values are lists of CVEs or TTPs.
        hash_group_url_map (dict): Dictionary mapping file hashes to
            ``{'group_id': ..., 'url': ...}``.
        data_type (str): Key under which the IOC list is stored ('cves' or 'ttps').

    Returns:
        dict: ``{group_id: {'hashes': [{'hash': ..., data_type: [...], 'url': ...}, ...],
        'url': ...}}``. The group-level ``'url'`` is the URL of the first hash
        seen for that group (kept for backward compatibility); the per-hash
        ``'url'`` records the source of each individual hash, which would
        otherwise be lost when a group has hashes from several URLs.
    """
    group_data_map = {}

    for file_hash, data_list in iocs_data.items():
        if file_hash not in hash_group_url_map:
            continue  # hash could not be attributed to any group

        group_info = hash_group_url_map[file_hash]
        group_id = group_info['group_id']
        url = group_info['url']

        # Create the group entry on first sight; its URL stays that of the first hash.
        group_entry = group_data_map.setdefault(group_id, {'hashes': [], 'url': url})
        group_entry['hashes'].append({
            'hash': file_hash,
            data_type: data_list,
            'url': url,
        })

    return group_data_map

In [31]:
# Group the CVE lists by the threat group each file hash is attributed to.
group_cve_map = combine_hash_data(cve_data, hash_group_url_map, data_type='cves')

# TTP variant of the same grouping; enable it when analysing TTP data.
#group_ttp_map = combine_hash_data(ttps_data, hash_group_url_map, data_type='ttps')

In [32]:
# Number of groups that have at least one attributed file hash.
len(group_cve_map)

86

In [33]:
def dump_and_count_data(data, data_type='cve', file_path=None):
    """
    Dump the provided data to a JSON file and count its top-level keys.

    The destination is an explicit argument rather than a notebook-level
    variable, so the function cannot accidentally write to whatever an earlier
    cell happened to leave in a global ``file_path``.

    Args:
        data (dict): The data dictionary to be saved.
        data_type (str): The type of data ('cve' or 'ttp'); used to build the
            default filename.
        file_path (str, optional): Where to write the JSON file. Defaults to
            ``<cwd>/group_analysis_json_outputs/MITRE_<data_type>_group_analysis.json``.

    Returns:
        int: The number of top-level keys read back from the written JSON file.
    """
    if file_path is None:
        file_path = os.path.join(
            os.getcwd(),
            "group_analysis_json_outputs",
            f'MITRE_{data_type}_group_analysis.json',
        )

    # Create the output folder on first use instead of failing in open().
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    # Read the file back so the count reflects what was actually written.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        loaded_data = json.load(json_file)

    return len(loaded_data)

In [34]:
# Output location: group_analysis_json_outputs/ under the current working directory.
base_dir = os.getcwd()
json_output_data_dir = os.path.join(base_dir, "group_analysis_json_outputs")

# Data type being exported; it is part of the output filename.
data_type = "cve"
# For a TTP export, use the line below instead (and the TTP call further down).
#data_type = "ttp"
file_name = f'MITRE_{data_type}_group_analysis.json'
file_path = os.path.join(json_output_data_dir, file_name)

# Write the grouped data to JSON and report how many groups were written.
num_cve_keys = dump_and_count_data(group_cve_map, data_type='cve')
#num_ttp_keys = dump_and_count_data(group_ttp_map, data_type='ttp')
print(f"Number of groups in data: {num_cve_keys}")


Number of groups in data: 86


In [37]:
def analyze_data(group_data_map, data_type="cves", max_groups_shown=50, max_items_shown=5):
    """
    Analyze the given group data (CVE or TTP) and print/return summary statistics.

    Args:
        group_data_map (dict): Dictionary where keys are group IDs and values
            contain a ``"hashes"`` list of dicts holding the data lists.
        data_type (str): Either "cves" or "ttps"; the key read from each hash entry.
        max_groups_shown (int): How many groups with unique items to print.
        max_items_shown (int): How many unique items to print per group.

    Returns:
        dict: ``total_count`` (all occurrences), ``unique_count`` (distinct
        items), ``top_10_overall`` (by total occurrences), ``top_10_by_group``
        (by number of groups containing the item) and ``unique_per_group``
        (group ID -> items found in that group only, in first-seen order).
    """
    overall_counter = Counter()
    groupwise_counter = Counter()
    items_by_group = {}

    for group_id, group_data in group_data_map.items():
        # A dict is used as an insertion-ordered set so results are reproducible.
        group_items = {}
        for hash_data in group_data.get("hashes", []):
            items = hash_data.get(data_type, [])
            overall_counter.update(items)
            group_items.update(dict.fromkeys(items))

        # Count each item once per group.
        groupwise_counter.update(list(group_items))
        items_by_group[group_id] = group_items

    # An item is unique to a group exactly when only one group contains it.
    # The finished group counter answers that directly, so the items of all
    # other groups do not have to be rebuilt for every group.
    unique_items_per_group = {}
    for group_id, group_items in items_by_group.items():
        exclusive = [item for item in group_items if groupwise_counter[item] == 1]
        if exclusive:
            unique_items_per_group[group_id] = exclusive

    total_items = sum(overall_counter.values())
    total_unique_items = len(overall_counter)

    top_10_overall = overall_counter.most_common(10)
    top_10_by_group = groupwise_counter.most_common(10)

    # Print analysis results
    print(f"Total number of {data_type.upper()}: {total_items}")
    print(f"Total number of unique {data_type.upper()}: {total_unique_items}")
    print(f"Top 10 most common {data_type.upper()} (overall occurrences):")
    for item, count in top_10_overall:
        print(f"{item}: {count}")
    print(f"\nTop 10 most common {data_type.upper()} (across different groups):")
    for item, count in top_10_by_group:
        print(f"{item}: {count}")
    print(f"\nNumber of groups with unique {data_type.upper()}: {len(unique_items_per_group)}")

    print(f"\nExample groups with unique {data_type.upper()}:")
    for group_id, uniques in list(unique_items_per_group.items())[:max_groups_shown]:
        print(f"  Group: {group_id}")
        for item in uniques[:max_items_shown]:
            print(f"    - {item}")
        if len(uniques) > max_items_shown:
            print(f"    ... and {len(uniques) - max_items_shown} more")

    return {
        "total_count": total_items,
        "unique_count": total_unique_items,
        "top_10_overall": top_10_overall,
        "top_10_by_group": top_10_by_group,
        "unique_per_group": unique_items_per_group
    }

In [38]:
# Print the summary statistics for the CVE data and keep the returned results.
cve_analysis = analyze_data(group_cve_map, data_type="cves")
# TTP variant; enable it once group_ttp_map has been built.
#ttp_analysis = analyze_data(group_ttp_map, data_type="ttps")


Total number of CVES: 959
Total number of unique CVES: 325
Top 10 most common CVES (overall occurrences):
CVE-2012-0158: 29
CVE-2017-11882: 22
CVE-2017-0199: 21
CVE-2022-38028: 15
CVE-2024-3400: 12
CVE-2021-26855: 11
CVE-2021-27065: 11
CVE-2010-3333: 10
CVE-2014-6332: 10
CVE-2018-13379: 10

Top 10 most common CVES (across different groups):
CVE-2012-0158: 17
CVE-2017-0199: 16
CVE-2017-11882: 14
CVE-2022-38028: 9
CVE-2018-13379: 8
CVE-2016-4117: 8
CVE-2010-3333: 7
CVE-2018-0802: 7
CVE-2024-37085: 7
CVE-2023-3519: 6

Number of groups with unique CVES: 48

Example groups with unique CVES:
  Group: G0029
    - CVE-2010-2572
    - CVE-2012-4969
  Group: G1015
    - CVE-2015-2291
    - CVE-2021-3490
    - CVE-2021-35464
  Group: G0121
    - CVE-2019-2215
    - CVE-2020-0674
    - CVE-2024-9284
  Group: G1023
    - CVE-2022-27518
    - CVE-2021-20021
    - CVE-2021-20023
  Group: G0139
    - CVE-2024-6387
    - CVE-2019-5736
  Group: G0032
    - CVE-2021-1647
    - CVE-2018-202501
  Group: G0