In [37]:
from statistics import StatisticsError, mode

import pandas as pd

def apt_group_software_mapping(file_path: str):
    """
    Map each APT group to the software it uses and summarise software counts per group.

    Reads the 'associated groups' sheet of the workbook at ``file_path``, groups its
    rows by 'source name' and collects the 'target ID' values of each group.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        dict: Mapping of each 'source name' to the list of its 'target ID' values.
            The list has one entry per sheet row, so repeated rows yield repeated IDs.
        int: Count of unique APT groups ('source name' values).
        int: Count of unique software ('target ID' values).
        dict: Statistics over the number of *distinct* software per group, with keys
            'mean_count', 'median_count' (float), 'mode_count' (int, or the string
            'No unique mode'), 'max_count', 'min_count' (int), and
            'max_apt_group', 'min_apt_group' (the first group reaching that count).

    Raises:
        ValueError: If the required columns are missing, or if the sheet contains
            no group rows to summarise.
    """
    # Passing the path straight to read_excel lets pandas open *and close* the
    # workbook; a bare pd.ExcelFile(...) handle would stay open after the call.
    df = pd.read_excel(file_path, sheet_name='associated groups')

    # Guard clause: fail early when the sheet does not have the expected layout.
    if 'source name' not in df.columns or 'target ID' not in df.columns:
        raise ValueError("The required columns 'source name' and 'target ID' are not found in the sheet.")

    software_by_group = df.groupby('source name')['target ID']

    # Number of distinct software entries per APT group.
    software_counts = software_by_group.nunique()

    # Without any group, idxmax()/idxmin() below would fail with an unhelpful
    # message, so report the actual problem instead.
    if software_counts.empty:
        raise ValueError("The sheet 'associated groups' contains no APT group rows to summarise.")

    # One list entry per row of the sheet (not de-duplicated).
    apt_to_software = software_by_group.apply(list).to_dict()

    try:
        # tolist() hands statistics.mode plain Python ints.
        mode_count = mode(software_counts.tolist())
    except StatisticsError:
        # Python < 3.8 raises when several values tie for most common.
        mode_count = 'No unique mode'

    # Convert NumPy scalars to builtins so the dict prints cleanly and all
    # numeric entries share the same kind of type as mode_count.
    stats = {
        'mean_count': float(software_counts.mean()),
        'median_count': float(software_counts.median()),
        'mode_count': mode_count,
        'max_count': int(software_counts.max()),
        'max_apt_group': software_counts.idxmax(),
        'min_count': int(software_counts.min()),
        'min_apt_group': software_counts.idxmin()
    }

    # Overall unique counts across the whole sheet.
    unique_apt_groups = df['source name'].nunique()
    unique_software = df['target ID'].nunique()

    return apt_to_software, unique_apt_groups, unique_software, stats



In [38]:
# Run the APT-group/software summary on the workbook below and print the headline figures.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell only runs on a
# machine that has the file at exactly this location — consider a configurable data directory.
file_path = r"C:\Users\Aakanksha Saha\Documents\CTITTP\enterprise-attack-v16.1-software.xlsx"  # Replace with your Excel file path
try:
    # The call returns, in order: the group -> software mapping, the unique group count,
    # the unique software count, and the statistics dict.
    apt_mapping, unique_apt_groups, unique_software, stats = apt_group_software_mapping(file_path)
    # Printing the full mapping is disabled; only the summary values are shown.
    #print(f"APT Group to Software Mapping:\n{apt_mapping}")
    print(f"\nUnique APT Groups Count: {unique_apt_groups}")
    print(f"Unique Software Count: {unique_software}")
    print(f"\nStatistics:\n{stats}")
except ValueError as e:
    # Only ValueError (raised by the function when the required columns are missing) is
    # reported here; any other exception propagates as a normal traceback.
    print(e)


Unique APT Groups Count: 145
Unique Software Count: 523

Statistics:
{'mean_count': np.float64(6.889655172413793), 'median_count': np.float64(4.0), 'mode_count': 2, 'max_count': np.int64(48), 'max_apt_group': 'APT29', 'min_count': np.int64(1), 'min_apt_group': 'APT-C-23'}


In [39]:
import pandas as pd
from collections import Counter

def get_top_software(file_path: str, top_n: int = 10):
    """
    Identify the top N software by number of rows and attach a readable name to each.

    Reads the 'associated groups' sheet of the workbook at ``file_path``, counts how
    many rows reference each 'target ID', keeps the ``top_n`` most frequent IDs and
    adds one 'target name' per ID.

    Args:
        file_path (str): Path to the Excel file.
        top_n (int): Number of top software to identify.

    Returns:
        pd.DataFrame: Columns 'target ID', 'count', 'target name', ordered by
            descending count, with exactly one row per 'target ID'.

    Raises:
        ValueError: If 'source name', 'target ID' or 'target name' is missing
            from the sheet.
    """
    # Passing the path straight to read_excel lets pandas open *and close* the
    # workbook; a bare pd.ExcelFile(...) handle would stay open after the call.
    df = pd.read_excel(file_path, sheet_name='associated groups')

    # Guard clause: fail early when the sheet does not have the expected layout.
    required_columns = ('source name', 'target ID', 'target name')
    if any(column not in df.columns for column in required_columns):
        raise ValueError("Required columns ('source name', 'target ID', 'target name') are not found in the sheet.")

    # Count occurrences of each target ID and keep the most frequent ones.
    software_counts = df['target ID'].value_counts().head(top_n).reset_index()
    software_counts.columns = ['target ID', 'count']

    # Keep a single name per ID (the first one seen). De-duplicating on the
    # (ID, name) pair instead would leave several rows for an ID whose name is
    # spelled inconsistently, and the merge would then repeat that ID.
    software_names = df[['target ID', 'target name']].drop_duplicates(subset='target ID')

    # A left merge keeps every counted ID, in count order.
    return software_counts.merge(software_names, on='target ID', how='left')



In [40]:
# Example usage: print the 20 most frequently referenced software entries.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell only runs on a
# machine that has the file at exactly this location — consider a configurable data directory.
file_path = r"C:\Users\Aakanksha Saha\Documents\CTITTP\enterprise-attack-v16.1-software.xlsx" 
try:
    # Returns a DataFrame with the software ID, its row count and its name.
    top_software = get_top_software(file_path, top_n=20)
    print("Top Software:")
    print(top_software)
except ValueError as e:
    # Only ValueError (raised by the function when the required columns are missing) is
    # reported here; any other exception propagates as a normal traceback.
    print(e)

Top Software:
   target ID  count    target name
0      S0002     48       Mimikatz
1      S0029     35         PsExec
2      S0039     31            Net
3      S0154     26  Cobalt Strike
4      S0363     17         Empire
5      S0013     14          PlugX
6      S0012     14      PoisonIvy
7      S0100     13       ipconfig
8      S0357     13       Impacket
9      S0097     13           Ping
10     S0349     12        LaZagne
11     S0104     11        netstat
12     S0160     11       certutil
13     S0032     11      gh0st RAT
14     S0057     11       Tasklist
15     S0096     10     Systeminfo
16     S0552      9         AdFind
17     S0590      9        NBTscan
18     S0194      9    PowerSploit
19     S0075      8            Reg
