In [123]:
import os
import json
import pandas as pd
import re

def extract_citations(citations_raw):
    """
    This function takes a raw string of citations, extracts individual citations,
    removes duplicates, cleans up irrelevant characters, and ensures no extra spaces or invalid characters.

    Args:
        citations_raw (str): The raw citation string from which citations are to be extracted.

    Returns:
        list: A list of unique citations.
    """
    #print(citations_raw)
    if pd.isna(citations_raw) or citations_raw == '':
        return []

    # Replace any occurrence of 'Citation: ' to standardize the string
    citations_raw = citations_raw.replace("Citation: ", "")
    
    # Split citations by both the delimiters '), (' and ')('
    citations = re.split(r"\)\s*,?\s*\(|\)\(", citations_raw)

    # Clean each citation by removing unwanted characters and extra spaces
    cleaned_citations = []
    for citation in citations:
        citation = citation.replace("(", "").replace(")", "").strip(", ").strip()
        
        # Use regex to extract only the relevant citation portion before any period or embedded sentence
        # Here, we match the first part (e.g., "US-CERT HIDDEN COBRA June 2017") and ignore the rest
        citation = re.sub(r'\s*\..*$', '', citation)  # Remove everything after the first period
        
        # We also use another regex to handle extra commas, which might still linger after cleaning
        citation = re.sub(r',\s*$', '', citation)  # Remove comma at the end of citation
        
        if citation:  # Only add to the list if there is any valid citation
            cleaned_citations.append(citation)
    
    # Remove duplicates by converting the list to a set and back to a list
    cleaned_citations = list(set(cleaned_citations))

    #print(cleaned_citations)

    return cleaned_citations


In [124]:
# Step 1: Load configuration from the JSON file located outside the current directory
config_file_path = os.path.join("..", "Malpedia Bib files Analysis", "config.json")

with open(config_file_path, "r") as config_file:
    config = json.load(config_file)

# Step 2: Extract the data directory and file paths for groups and software from the configuration
data_directory = config["data_directory"]
groups_file_paths = {key: os.path.normpath(os.path.join(data_directory, value)) for key, value in config["file_paths_groups_v15"].items()}

# Example to access one of the group files (you can select enterprise, ics, or mobile)
enterprise_groups_file = groups_file_paths['enterprise']
print(f"Enterprise Groups File Path: {enterprise_groups_file}")

Enterprise Groups File Path: ..\ATTACK Excel sheets\enterprise-attack-v15.1-groups.xlsx


In [125]:
# Step 3: Load Excel sheets for groups and citations
df_groups = pd.read_excel(enterprise_groups_file, sheet_name='groups')
df_citations = pd.read_excel(enterprise_groups_file, sheet_name='citations')

# Step 4: Extract citation key and URL mapping from 'citations' sheet
citation_map = dict(zip(df_citations['reference'], df_citations['url']))

# Step 5: Initialize a dictionary to store the final data structure
group_citations_map = {}

# Process each group in the 'groups' sheet
for index, row in df_groups.iterrows():
    group_id = row['ID']
    
    # Extract and clean associated groups citations
    associated_citations_raw = row['associated groups citations']
    associated_citations = extract_citations(associated_citations_raw) if pd.notna(associated_citations_raw) else []
    
    # Extract and clean relationship citations
    relationship_citations_raw = row['relationship citations']
    relationship_citations = extract_citations(relationship_citations_raw) if pd.notna(relationship_citations_raw) else []
    
    # Map citations to URLs with uniqueness check
    associated_citations_with_urls = {}
    for citation in associated_citations:
        url = citation_map.get(citation, "URL not found")
        # Only add to map if citation and URL are not already present
        if citation not in associated_citations_with_urls:
            associated_citations_with_urls[citation] = url

    relationship_citations_with_urls = {}
    for citation in relationship_citations:
        url = citation_map.get(citation, "URL not found")
        if citation not in relationship_citations_with_urls:
            relationship_citations_with_urls[citation] = url
    
    # Store the data in the final structure
    group_citations_map[group_id] = {
        "associated_groups_citations": associated_citations_with_urls,
        "relationship_citations": relationship_citations_with_urls
    }

In [126]:
#group_citations_map

In [127]:
# Initialize counters
total_urls = 0  # Total number of URLs across both associated_groups_citations and relationship_citations

# Iterate through each group in the dictionary
for group_id, group_data in group_citations_map.items():
    # Count URLs in 'associated_groups_citations'
    associated_urls = group_data.get('associated_groups_citations', {})
    total_urls += len(associated_urls)  # Add the number of URLs in associated_groups_citations
    
    # Count URLs in 'relationship_citations'
    relationship_urls = group_data.get('relationship_citations', {})
    total_urls += len(relationship_urls)  # Add the number of URLs in relationship_citations

# Output the total number of URLs
print(f"Total number of URLs across the whole dictionary: {total_urls}")
num_groups = len(group_citations_map)  
print(f"Totak number of groups: {num_groups}")


Total number of URLs across the whole dictionary: 1176
Totak number of groups: 148


In [128]:
import json

# Function to load JSONL data
def load_jsonl(file_path):
    """
    Loads a JSONL file into a list of dictionaries.
    Each line of the file is parsed as a JSON object.
    """
    with open(file_path, 'r') as file:
        json_data = [json.loads(line) for line in file]
    return json_data

In [129]:
jsonl_file_path = r"C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\20241008_downloads.jsonl"

# Step 1: Load the JSONL data
jsonl_data = load_jsonl(jsonl_file_path)

In [130]:
import os
import json

# Function to load the JSONL data into a Python dictionary
def load_jsonl(jsonl_file_path):
    with open(jsonl_file_path, 'r') as file:
        return [json.loads(line.strip()) for line in file.readlines()]

# Function to extract file hash from the filename (assuming the hash is the entire filename without extension)
def extract_file_hash_from_filename(filename):
    # Extract the file hash from the filename by removing the .iocs extension
    if filename.endswith('.iocs'):
        return filename[:-5]  # Remove the last 5 characters to strip the '.iocs' extension
    return None

# Folder where the files are stored
folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\CVEs"

# Dictionary to store file hash -> URL mapping
hash_to_url_map = {}

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Extract file hash from the filename
    file_hash = extract_file_hash_from_filename(filename)
    if file_hash:
        # Look for the matching download_sha256 in the JSONL data
        for entry in jsonl_data:
            if entry.get('download_sha256') == file_hash:
                # If the hashes match, store the URL
                hash_to_url_map[file_hash] = entry.get('url')
                break  # Exit loop once a match is found

In [131]:
print(len(hash_to_url_map))

263


In [132]:
# Create a new dictionary to store the hash, group_id, and URL
hash_group_url_map = {}

# Iterate through each hash, url pair in hash_to_url_map
for file_hash, url in hash_to_url_map.items():
    # Iterate through each group in group_citations_map
    for group_id, group_data in group_citations_map.items():
        # Search for the URL in associated_groups_citations
        if url in group_data.get('associated_groups_citations', {}).values():
            # If the URL is found, add the file_hash, group_id, and URL to the hash_group_url_map
            hash_group_url_map[file_hash] = {'group_id': group_id, 'url': url}
            break  # Exit once we find the match (no need to check further in this group)

        # Search for the URL in relationship_citations
        if url in group_data.get('relationship_citations', {}).values():
            # If the URL is found, add the file_hash, group_id, and URL to the hash_group_url_map
            hash_group_url_map[file_hash] = {'group_id': group_id, 'url': url}
            break  # Exit once we find the match (no need to check further in this group)

# Output the final dictionary with hash, group_id, and url
print(len(hash_group_url_map))


255


In [133]:
# Find the hashes that are in hash_to_url_map but not in hash_group_url_map
missing_hashes = {key: value for key, value in hash_to_url_map.items() if key not in hash_group_url_map}

# Print the list of hashes and URLs that are missing in hash_group_url_map
for hash_key, url in missing_hashes.items():
    print(f"Hash: {hash_key}, URL: {url}")

Hash: 05bc7a68fdfe54c95ba1fb7360f2cb73bbfbdbbe939c29d764abf59f975a6a3a, URL: https://www.infosecurity-magazine.com/news/microsoft-zero-day-traced-russian/
Hash: 2c9e582e0194bacc4e4bbb37ffe61ed7e89af5cc5748fdc001e9dd65ddfaa32f, URL: https://securelist.com/apt-trends-report-q1-2018/85280/
Hash: 5b6328ed41cb49229d8d47046caabae1fdb90045c467d6509ae1f459a9b5b518, URL: https://www.intezer.com/wp-content/uploads/2021/09/TeamTNT-Cryptomining-Explosion.pdf
Hash: 72beb22ceed285d666ec7912dfcb95e7107c4232e622026915ef1bcd3c593490, URL: https://unit42.paloaltonetworks.com/ukraine-targeted-outsteel-saintbot/
Hash: 73eac7a13e4c15ce849d7a12a8d56eb3d831b6b442bf9ce7bc43afc1caafde9c, URL: https://www.us-cert.gov/ncas/alerts/TA17-164A
Hash: cabd66802a057829a0113bc5e53ac0c2c48f91142e8a40e10aac0d9d6aebbe98, URL: https://www.bleepingcomputer.com/news/security/ukraine-links-members-of-gamaredon-hacker-group-to-russian-fsb/
Hash: e2f84d3c77547f31ba782c0bb5525980059f651931e2b1dbbcd0a81f4430a1db, URL: https://secu

In [136]:
 import os

def read_files_in_folder(folder_path):
    """
    Reads all the files in a specified folder with .iocs extension and creates a dictionary
    with the file hash (from the filename) as the key and the list of CVEs as the value.
    It also removes the 'cve' prefix from each CVE and handles the file content appropriately.

    Args:
        folder_path (str): The path to the folder containing the .iocs files.

    Returns:
        dict: A dictionary where the key is the file hash and the value is a list of CVEs.
    """
    iocs_dict = {}

    # Iterate through each file in the folder
    for filename in os.listdir(folder_path):
        # Check if the file has a '.iocs' extension
        if filename.endswith(".iocs"):
            file_hash = filename.split(".")[0]  # Extract file hash from the filename
            
            # Construct the full path to the file
            file_path = os.path.join(folder_path, filename)
            
            try:
                # Open the file and read its content
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read().strip()  # Read and strip any unwanted whitespace

                    # Remove "cve" prefix and the tab character, and store CVEs in a list
                    content_lines = content.split('\n')  # Split content by lines
                    cleaned_content = []
                    for line in content_lines:
                        if line.startswith("cve"):
                            # Remove 'cve' and the tab character '\t'
                            cleaned_line = line[4:].strip()  # Strip the 'cve' prefix and any leading/trailing spaces
                            cleaned_content.append(cleaned_line)

                # Store the file hash and list of CVEs in the dictionary
                iocs_dict[file_hash] = cleaned_content

            except Exception as e:
                print(f"Error reading file {filename}: {e}")

    return iocs_dict

In [137]:
folder_path = r"C:\Users\ricewater\Documents\CTIDownloads\CVEs"  
iocs_data = read_files_in_folder(folder_path)

In [138]:
# Step 1: Initialize a new dictionary to store the combined data
hash_cve_map = {}

# Step 2: Iterate through the hashes in iocs_data
for file_hash, cve_list in iocs_data.items():
    # Check if the file_hash exists in hash_group_url_map
    if file_hash in hash_group_url_map:
        # Retrieve the group information from hash_group_url_map
        group_info = hash_group_url_map[file_hash]
        
        # Combine the data: CVEs, group_id, and url
        hash_cve_map[file_hash] = {
            'cves': cve_list,
            'group_id': group_info['group_id'],
            'url': group_info['url']
        }

# Step 3: Print the combined data (or return it for further use)
#print(combined_data)

In [140]:
# Initialize the new structure where the key is group_id and the values are dictionaries
# containing hashes, CVEs, and URLs
group_cve_map = {}

# Iterate over each entry in the hash_cve_map
for hash_val, data in hash_cve_map.items():
    group_id = data['group_id']
    cves = data['cves']
    url = data['url']
    
    # If the group_id doesn't exist in group_cve_map, initialize it
    if group_id not in group_cve_map:
        group_cve_map[group_id] = {
            'hashes': [],
            'url': url,
        }
    
    # Add the hash and associated CVEs to the group
    group_cve_map[group_id]['hashes'].append({
        'hash': hash_val,
        'cves': cves,
    })


In [143]:
total_group_ids = len(group_cve_map)
print(f"Total number of group IDs with CVE data: {total_group_ids}")

Total number of group IDs with CVE data: 85


In [144]:
#Initialize a list to store all CVEs across groups for counting and determining common CVEs
all_cves = []

# Go through each group and gather all CVEs from the hashes
for group_id, group_data in group_cve_map.items():
    for hash_data in group_data['hashes']:
        all_cves.extend(hash_data['cves'])

# Calculate the total number of CVEs
total_cves = len(all_cves)

# Calculate unique CVEs
unique_cves = set(all_cves)
total_unique_cves = len(unique_cves)

# Calculate the top 10 most common CVEs
cve_counter = Counter(all_cves)
top_10_cves = cve_counter.most_common(10)

# Output the results
print(f"Total number of CVEs: {total_cves}")
print(f"Total number of unique CVEs: {total_unique_cves}")
print("Top 10 most common CVEs:")
for cve, count in top_10_cves:
    print(f"{cve}: {count}")

Total number of CVEs: 955
Total number of unique CVEs: 322
Top 10 most common CVEs:
CVE-2012-0158: 29
CVE-2017-11882: 22
CVE-2017-0199: 21
CVE-2022-38028: 15
CVE-2024-3400: 12
CVE-2021-26855: 11
CVE-2021-27065: 11
CVE-2010-3333: 10
CVE-2014-6332: 10
CVE-2018-13379: 10


In [145]:
unique_cves_per_group = {}

for group_id, data in group_cve_map.items():
    group_cves = set()
    for hash_entry in data['hashes']:
        group_cves.update(hash_entry['cves'])

    # Find CVEs unique to this group (not shared with any other group)
    other_groups_cves = set()
    for other_group_id, other_data in group_cve_map.items():
        if other_group_id != group_id:
            for other_hash_entry in other_data['hashes']:
                other_groups_cves.update(other_hash_entry['cves'])

    unique_cves = group_cves - other_groups_cves
    if unique_cves:
        unique_cves_per_group[group_id] = list(unique_cves)

print(len(unique_cves_per_group))
# Output unique CVEs per group
for group_id, unique_cves in unique_cves_per_group.items():
    print(f"Group ID: {group_id} - Unique CVEs: {unique_cves}")


47
Group ID: G0029 - Unique CVEs: ['CVE-2012-4969', 'CVE-2010-2572']
Group ID: G1015 - Unique CVEs: ['CVE-2015-2291', 'CVE-2021-3490', 'CVE-2021-35464']
Group ID: G0121 - Unique CVEs: ['CVE-2019-2215', 'CVE-2020-0674', 'CVE-2024-9284']
Group ID: G1023 - Unique CVEs: ['CVE-2021-20023', 'CVE-2021-20021', 'CVE-2022-27518']
Group ID: G0139 - Unique CVEs: ['CVE-2019-5736', 'CVE-2024-6387']
Group ID: G0032 - Unique CVEs: ['CVE-2018-202501', 'CVE-2021-1647']
Group ID: G0069 - Unique CVEs: ['CVE-2017-01995']
Group ID: G0027 - Unique CVEs: ['CVE-2014-6324', 'CVE-2017-15303', 'CVE-2017-0213', 'CVE-2011-3544', 'CVE-2010-0738']
Group ID: G1017 - Unique CVEs: ['CVE-2021-27860']
Group ID: G0046 - Unique CVEs: ['CVE-2024-47076', 'CVE-2024-47176', 'CVE-2024-29847', 'CVE-2024-47175', 'CVE-2024-47177']
Group ID: G0034 - Unique CVEs: ['CVE-2018-8405', 'CVE-2014-3828', 'CVE-2015-5374', 'CVE-2018-8406', 'CVE-2018-4878']
Group ID: G0040 - Unique CVEs: ['CVE-2017-8570']
Group ID: G0125 - Unique CVEs: ['CVE-2