In [42]:
import pandas as pd
import json
import math
import os
import csv

# Step 1: Load configuration from a JSON file
with open("config.json", "r") as config_file:
    config = json.load(config_file)


# Step 2: Extract the data directory and file paths from the configuration
data_directory = config["data_directory"]
file_paths = {key: os.path.join(data_directory, value) for key, value in config["file_paths_software_v16"].items()}

In [26]:
# Function to load software from an Excel file and create a dictionary
def load_software_from_excel(file_path, sheet_name):
    # Read the Excel file
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    # Extract the relevant columns: 'ID', 'name', 'aliases', 'type'
    df_subset = df[['ID', 'name', 'aliases', 'type']]
    # Create a dictionary for the software
    software_dict = {}
    for _, row in df_subset.iterrows():
        software_id = row['ID']
        software_name = row['name']
        aliases = row['aliases']
        type_software = row['type']

        # Check if aliases is NaN (or empty) and replace it with an empty list
        if isinstance(aliases, float) and math.isnan(aliases):
            aliases = []  # Replace NaN with an empty list
        # If aliases is a string, convert it to a list
        elif isinstance(aliases, str):
            aliases = [aliases]  # Convert single string to list
        
        # Add the software information to the dictionary
        software_dict[software_id] = {
            'name': software_name,
            'aliases': aliases, 
            'type': type_software
        }
    return software_dict

In [27]:
# Step 3: Define a function to load the software mappings for the given attack type
def load_attack_software(attack_type, sheet_name="software"):
    file_path = file_paths.get(attack_type)
    if file_path:
        return load_software_from_excel(file_path, sheet_name=sheet_name)
    else:
        raise ValueError(f"Invalid attack type: {attack_type}")

# Step 4: Load the software mappings by specifying the attack type
enterprise_software = load_attack_software("enterprise")
ics_software = load_attack_software("ics")
mobile_software = load_attack_software("mobile")


# Step 4: Identify and add unique ICS and Mobile software IDs not in Enterprise
for software_id, software_info in ics_software.items():
    if software_id not in enterprise_software:
        # Add the unique ICS software to the enterprise software mapping
        enterprise_software[software_id] = software_info

for software_id, software_info in mobile_software.items():
    if software_id not in enterprise_software:
        # Add the unique Mobile software to the enterprise software mapping
        enterprise_software[software_id] = software_info

In [28]:
# Step 6: Display the counts and first few entries
print(f"\nTotal software in updated mapping: {len(enterprise_software)}")
print("\nSome example software mappings:")
for software_id, software_info in list(enterprise_software.items())[:5]:  # Display first 5 for example
    print(f"ID: {software_id}, Name: {software_info['name']}, Type: {software_info['type']}, Aliases: {software_info['aliases']}")



Total software in updated mapping: 826

Some example software mappings:
ID: S0066, Name: 3PARA RAT, Type: malware, Aliases: []
ID: S0065, Name: 4H RAT, Type: malware, Aliases: []
ID: S0677, Name: AADInternals, Type: tool, Aliases: []
ID: S0469, Name: ABK, Type: malware, Aliases: []
ID: S0045, Name: ADVSTORESHELL, Type: malware, Aliases: ['AZZY, EVILTOSS, NETUI, Sedreco']


In [29]:
# # Step 5: Optionally, export the updated software mapping to a new JSON file (relative path)
# output_directory = "data/output"
# os.makedirs(output_directory, exist_ok=True)  # Create the output directory if it doesn't exist
# output_file_path = os.path.join(output_directory, 'software_mapping_MITRE.json')
# with open(output_file_path, 'w') as f:
#     json.dump(enterprise_software, f, indent=4)


In [31]:
# Initialize counters for malware and tool
malware_count = 0
tool_count = 0

# Loop through the enterprise_software to count malware and tool types
for software_info in enterprise_software.values():
    if software_info['type'] == 'malware':
        malware_count += 1
    elif software_info['type'] == 'tool':
        tool_count += 1

# Step 6: Display the counts and first few entries
print(f"\nTotal software in updated mapping: {len(enterprise_software)}")
print(f"Total malware count: {malware_count}")
print(f"Total tool count: {tool_count}")



Total software in updated mapping: 826
Total malware count: 736
Total tool count: 90


In [32]:
# Function to read malware families from the JSON file
def read_malware_families_from_file():
    try:
        with open('malware_families.json', 'r') as json_file:
            families_data = json.load(json_file)
            return families_data
    except FileNotFoundError:
        print("Malware families JSON file not found.")
        return None

In [33]:
# Load the MITRE software mappings (from the generated JSON file)
def read_mitre_software_file():
    software_mappings_file_path = 'data/output/software_mapping_MITRE.json'  # Replace with actual path to MITRE software mappings
    try:
        with open(software_mappings_file_path, 'r') as f:
            mitre_software_mappings = json.load(f)
            return mitre_software_mappings
    except FileNotFoundError:
        print("MITRE Software Mapping JSON file not found.")
        return None            

In [34]:
import re

# Helper function to normalize names for better matching
def normalize_name(name):
    # Convert to lowercase
    name = name.lower()
    
    # Replace underscores followed by 'rat' with ' rat'
    name = re.sub(r'_rat$', ' rat', name)
    
    # Replace underscores in general with spaces
    name = name.replace('_', ' ')
    
    # Remove common malware prefixes like 'trojan.' or 'win.'
    name = re.sub(r'^(trojan\.|win\.)', '', name)
    
    return name

In [38]:
def find_software_intersection_with_malpedia(malpedia_families_data, mitre_software_mappings):
    intersection = []  # Store MITRE software that intersect with Malpedia actors
    unique_group_ids = set()  # To store unique group IDs for intersection
    mitre_software_not_in_malpedia = []  # Store MITRE software not found in Malpedia

    # Loop through each MITRE software in mitre_software_mappings
    for software_id, software_info in mitre_software_mappings.items():
        software_name = normalize_name(software_info['name'])# Normalize to lowercase for comparison

        # Handle cases where 'aliases' might be NaN or other non-list types
        aliases = software_info.get('aliases', [])
        if isinstance(aliases, float) or aliases is None:  # Check if it's NaN or None
            aliases = []  # Set to empty list
        else:
            aliases = [normalize_name(alias) for alias in aliases]   # Normalize aliases to lowercase

        found_match = False

        # Loop through each Malpedia family in malpedia_families_data
        for family_id, family_info in malpedia_families_data.items():
            family_name = normalize_name(family_id)  # Normalize Malpedia family name
            common_name = normalize_name(family_info.get('common_name', ''))  # Normalize common name
            alt_names = [normalize_name(alt_name) for alt_name in family_info.get('alt_names', [])]  # Normalize alt_names

            # Check if software_name or aliases match with Malpedia family_name, common_name, or alt_names
            if software_name == family_name or software_name == common_name or software_name in alt_names:
                intersection.append((software_name, family_name, software_id))  # Found intersection with software_name
                unique_group_ids.add(software_id)
                found_match = True
                break  # No need to check further if match is found
            elif any(alias == family_name or alias == common_name or alias in alt_names for alias in aliases):
                intersection.append((software_name, family_name, software_id))  # Found intersection with alias
                unique_group_ids.add(software_id)
                found_match = True
                break

        # If no match was found for this MITRE software, add it to the "not in Malpedia" list
        if not found_match:
            mitre_software_not_in_malpedia.append((software_name, software_id))

    return intersection, unique_group_ids, mitre_software_not_in_malpedia


In [39]:
malpedia_families_data = read_malware_families_from_file()
mitre_software_mappings = read_mitre_software_file()
# Find the intersection
intersection, unique_group_ids, mitre_software_not_in_malpedia = find_software_intersection_with_malpedia(malpedia_families_data, mitre_software_mappings)



In [40]:
# # # Print the intersection results
# print(f"Total intersection based on unique software IDs: {len(unique_group_ids)}")
# if intersection:
#     print("\nIntersection found between MITRE software and Malpedia families:")
#     for software_name, family_name, software_id in intersection:
#         print(f"MITRE Software: {software_name} | Malpedia Family: {family_name} | Software ID: {software_id}")
# else:
#     print("No intersection found between MITRE software and Malpedia families.")

# # Initialize counters for malware and tool
# malware_count = 0
# tool_count = 0

# # Print the MITRE software not present in Malpedia
# if mitre_software_not_in_malpedia:
#     print(f"\nTotal MITRE software not present in Malpedia: {len(mitre_software_not_in_malpedia)}")

#     for software_name, software_id in mitre_software_not_in_malpedia:
#         # Get the type of software from mitre_software_mappings using the software_id
#         software_type = mitre_software_mappings.get(software_id, {}).get('type', 'Unknown').strip().lower()

#         # Increment the malware/tool counters based on the type
#         if software_type == 'malware':
#             malware_count += 1
#         elif software_type == 'tool':
#             tool_count += 1

#         print(f"MITRE Software: {software_name.strip()} | Software ID: {software_id} | Type: {software_type.capitalize()}")

#     # Print the summary of malware and tools count
#     print(f"\nTotal Malwares: {malware_count}")
#     print(f"Total Tools: {tool_count}")


In [45]:
# Load the MITRE software mappings file (this should be the mappings created from the Excel files software ID: alias : Type)
mitre_software_mappings = read_mitre_software_file()
malpedia_families_data = read_malware_families_from_file()

# Open file in append mode (or write mode, if needed) with utf-8 encoding
with open('software_name_alias_normalization.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Check if the file is empty and write the header if necessary
        try:
            # Try to move to the beginning of the file
            csvfile.seek(0, 2)  # Go to the end of the file
            if csvfile.tell() == 0:  # If file size is 0, write header
                csvwriter.writerow(['Software Name', 'Aliases', 'Normalized Name'])
        except Exception as e:
            print(f"Error checking file for header: {e}")

        # Loop through each MITRE software in mitre_software_mappings
        for software_id, software_info in mitre_software_mappings.items():
            original_software_name = software_info['name']
            normalized_software_name = normalize_name(software_info['name'])

                    # Handle cases where 'aliases' might be NaN or other non-list types
            aliases = software_info.get('aliases', [])
            if isinstance(aliases, float) or aliases is None:  # Check if it's NaN or None
               aliases = []  # Set to empty list
            else:
               aliases = [(alias) for alias in aliases]
            

             # Write MITRE group name, aliases (comma-separated), and normalized name to file
            csvwriter.writerow([f"mitre_{original_software_name}", ', '.join(aliases), normalized_software_name])


                # Loop through each Malpedia family in malpedia_families_data
        for family_id, family_info in malpedia_families_data.items():
            original_family_name = family_id 
            common_name = family_info.get('common_name', '')
            normalize_family_name = normalize_name(family_id)  # Normalize Malpedia family name
            alt_names = [(alt_name) for alt_name in family_info.get('alt_names', [])]  

            # Write Malpedia group name, aliases (comma-separated), and normalized name to file
            csvwriter.writerow([f"malpedia_{common_name}", ', '.join(alt_names), normalize_family_name])

In [None]:
## normalize aliases? 
## malpedia has a common name and another name with apk, win, elf prefix...