In [None]:
# Import Package And Function, Load Data
import os
import time
import logging
import pickle
from copy import copy
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import math
import tqdm
import numpy as np
from math import log, e
from multiprocessing import Pool
from functools import partial

def filter_graph_by_behave_conf(graph):
    """Build an undirected graph from the confidently-behavioural edges.

    An edge is copied over, together with all of its attributes, when its
    ``behave_conf`` attribute (taken as 0 when absent) is strictly greater
    than 0.5.  The result is always a fresh ``nx.Graph``.
    """
    confident = nx.Graph()
    for edge in graph.edges:
        u, v = edge[0], edge[1]
        attrs = graph[u][v]
        if attrs.get("behave_conf", 0) > 0.5:
            confident.add_edge(u, v, **attrs)
    # Defensive clean-up: drop any node that has no incident edge.
    lonely = list(nx.isolates(confident))
    confident.remove_nodes_from(lonely)
    return confident

def get_graph_tactics(graph):
    """Collect, edge by edge, the indices of the tactics flagged on that edge.

    Each edge's ``tactic_conf`` attribute is read as a one-hot style sequence;
    the positions whose value equals 1 are the edge's tactics.

    Args:
        graph: Object exposing ``graph.edges`` and ``graph[u][v]`` attribute
            dictionaries (e.g. a networkx graph).

    Returns:
        A list with one entry per edge, in ``graph.edges`` order.  Each entry
        is the list of flagged indices; an edge without a ``tactic_conf``
        attribute contributes an empty list instead of raising.
    """
    tactic_list = []
    for edge in graph.edges:
        one_hot = graph[edge[0]][edge[1]].get("tactic_conf")
        if one_hot is None:
            # Keep the output aligned with the edges even when the attribute
            # is missing; enumerate(None) would raise TypeError.
            tactic_list.append([])
            continue
        tactic_list.append([i for i, x in enumerate(one_hot) if x == 1])
    return tactic_list

def check_unique_tactic(tactic_list):
    """Return how many distinct tactics appear across all per-edge lists."""
    distinct = {tactic for per_edge in tactic_list for tactic in per_edge}
    return len(distinct)

def check_graph_by_unique_article_id(graph):
    """Tell whether the graph's edges come from at least two different articles.

    Edges whose ``article_id`` attribute is missing or ``None`` are ignored.
    """
    seen_ids = set()
    for edge in graph.edges:
        article = graph[edge[0]][edge[1]].get("article_id")
        if article is not None:
            seen_ids.add(article)
    return len(seen_ids) >= 2

def check_graph_has_tactic(graph):
    """Tell whether any edge of ``graph`` has at least one tactic flagged.

    An edge counts when its ``tactic_conf`` attribute is present and contains
    an element equal to 1.  This is the same element-wise test used when the
    tactics are extracted elsewhere in this file; the previous substring test
    on ``str(tactic_conf)`` also matched values such as ``0.1`` or ``10``.

    Args:
        graph: Object exposing ``graph.edges`` and ``graph[u][v]`` attribute
            dictionaries (e.g. a networkx graph).

    Returns:
        ``True`` as soon as one flagged edge is found, otherwise ``False``.
    """
    for edge in graph.edges:
        tactic_conf = graph[edge[0]][edge[1]].get("tactic_conf")
        if tactic_conf is not None and any(x == 1 for x in tactic_conf):
            return True
    return False

def new_draw_graph(
    graph, draw_edges=True, save_folder="defaultgraph", saveorshow="show"
):
    """Draw ``graph`` on a circular layout and show or save the figure.

    Args:
        graph: networkx graph to draw.
        draw_edges: When true, every edge carrying a ``relation`` attribute
            gets a text label made of the relation, an alias for its
            ``article_id`` (``from article0``, ``from article1``, ... in order
            of first appearance) and the names of the tactics whose
            ``tactic_conf`` entry equals 1.
        save_folder: Folder receiving the PNG when ``saveorshow == "save"``;
            created if missing.
        saveorshow: ``"show"`` displays the figure, ``"save"`` writes it to
            ``save_folder`` under a random integer file name and closes it.
            Any other value only draws.

    Edges that have a ``relation`` but no ``article_id`` are labelled without
    the article line instead of raising ``KeyError``.  Tactic names are looked
    up in the module-level ``big_label_list``.
    """
    # This section's import block does not import ``random``; importing it
    # here keeps the function usable on its own.
    import random

    pos = nx.circular_layout(graph)
    plt.figure(num=None, figsize=(20, 20), dpi=100)
    nx.draw_networkx(
        graph,
        pos,
        with_labels=True,
        node_size=20,
        arrowsize=90,
        linewidths=1.5,
        arrowstyle="->",
        edge_color="red",
        node_shape="o",
        bbox=dict(facecolor="black", edgecolor="black", boxstyle="round,pad=0.3"),
        node_color="black",
        font_size=15,
        font_color="white",
    )
    if draw_edges:
        edge_labels = nx.get_edge_attributes(graph, "relation")
        articles_labels = nx.get_edge_attributes(graph, "article_id")
        one_hot = nx.get_edge_attributes(graph, "tactic_conf")

        # Indices of the flagged tactics for every edge that has the attribute.
        true_tactic = {
            key: [i for i, x in enumerate(flags) if x == 1]
            for key, flags in one_hot.items()
            if flags is not None
        }

        # Short, stable alias per article id, numbered by first appearance.
        article_alias = {}
        for value in articles_labels.values():
            if value not in article_alias:
                article_alias[value] = "from article" + str(len(article_alias))

        for key, value in edge_labels.items():
            new_value = f"relation:{value}"
            if key in articles_labels:
                new_value += f"\n{article_alias[articles_labels[key]]}"
            tactics = true_tactic.get(key)
            if tactics:
                new_value = new_value + "\nattack tactic:\n"
                for tactic in tactics:
                    new_value += f"{big_label_list[tactic]},\n"
            # Only values are replaced, so mutating during iteration is safe.
            edge_labels[key] = new_value
        nx.draw_networkx_edge_labels(
            graph,
            pos,
            edge_labels=edge_labels,
            font_color="red",
            font_size=12,
        )
    if saveorshow == "show":
        plt.show()
    elif saveorshow == "save":
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        # NOTE(review): random names can collide and silently overwrite an
        # earlier figure; kept because existing outputs follow this scheme.
        plt.savefig(save_folder + "/" + str(random.randint(0, 1000000)) + ".png")
        plt.close()

def calculate_percentage(folder_path):
    """Return the percentage of ``.png`` files in a folder marked meaningful.

    The character right before the ``.png`` suffix is the marker: ``"y"``
    counts as meaningful, ``"n"`` does not, and anything else is reported with
    a warning while still counting toward the total.  A folder without PNG
    files yields 0.
    """
    png_names = [name for name in os.listdir(folder_path) if name.endswith(".png")]
    meaningful = 0
    for name in png_names:
        marker = name[-5]
        if marker == "y":
            meaningful += 1
        elif marker != "n":
            print(f"Warning: unexpected filename {name}")
    if not png_names:
        return 0
    return meaningful / len(png_names) * 100

def entropy(lst):
    """Shannon entropy, in bits, of the empirical distribution of ``lst``.

    Items must be hashable.  An empty input yields 0.
    """
    total = len(lst)
    tally = {}
    for item in lst:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    weighted_logs = [
        p * math.log2(p) for p in (occurrences / total for occurrences in tally.values())
    ]
    return -sum(weighted_logs)

def entropy_one_hot(list):
    """Sum of per-column binary entropies (natural log) of one-hot rows.

    Args:
        list: Sequence of equally long one-hot rows.  ``None`` entries (rows
            for which no vector is available) are skipped rather than breaking
            the conversion to a 2-D array.

    Returns:
        For every column, with ``p`` the share of non-zero entries, the value
        ``-p*ln(p) - (1-p)*ln(1-p)`` is added when ``0 < p < 1``.  Returns 0
        when there are no usable rows.
    """
    # NOTE: the parameter name shadows the builtin; it is kept because it is
    # part of the function's signature.
    rows = [row for row in list if row is not None]
    if len(rows) == 0:
        return 0
    arr = np.array(rows)
    n_rows, n_cols = arr.shape
    ent = 0
    for col in arr.T:
        ones = np.count_nonzero(col)
        zeros = n_rows - ones
        p_ones = ones / n_rows
        p_zeros = zeros / n_rows
        # Columns that are all zero or all one carry no entropy.
        if p_ones > 0 and p_zeros > 0:
            ent += -p_ones * log(p_ones, e) - p_zeros * log(p_zeros, e)
    return ent

def calculate_graph_stats(graph):
    """Summarise how much of a graph is confidently labelled.

    Returns a 6-tuple ``(avg_percent, entity_percent, behave_percent,
    tactic_percent, article_entropy, tactic_entropy)`` where

    * ``entity_percent`` is the share of nodes whose ``entity_conf`` equals 1,
    * ``behave_percent`` is the share of edges whose ``behave_conf`` exceeds 0.5,
    * ``tactic_percent`` is the share of edges whose ``tactic_conf`` is truthy
      and differs from a list of ten zeros,
    * ``avg_percent`` is the mean of the three shares (0 for a node-less graph),
    * the two entropies are computed over the edges' ``article_id`` and
      ``tactic_conf`` values via ``entropy`` and ``entropy_one_hot``.
    """

    def is_entity(attrs):
        return attrs.get("entity_conf") == 1

    def is_behaviour(attrs):
        return attrs.get("behave_conf") and attrs.get("behave_conf") > 0.5

    def has_tactic(attrs):
        return attrs.get("tactic_conf") and attrs.get("tactic_conf") != [0] * 10

    node_frame = pd.DataFrame(graph.nodes(data=True), columns=["node", "data"])
    edge_frame = pd.DataFrame(
        graph.edges(data=True), columns=["source", "target", "data"]
    )
    edge_attrs = edge_frame["data"]

    entity_count = node_frame["data"].apply(is_entity).sum()
    behave_count = edge_attrs.apply(is_behaviour).sum()
    tactic_count = edge_attrs.apply(has_tactic).sum()

    has_nodes = len(graph.nodes()) > 0
    has_edges = len(graph.edges()) > 0
    entity_percent = entity_count / len(graph.nodes()) if has_nodes else 0
    behave_percent = behave_count / len(graph.edges()) if has_edges else 0
    tactic_percent = tactic_count / len(graph.edges()) if has_edges else 0
    if has_nodes:
        avg_percent = (entity_percent + behave_percent + tactic_percent) / 3
    else:
        avg_percent = 0

    article_ids = edge_attrs.apply(lambda attrs: attrs.get("article_id")).tolist()
    tactic_vectors = edge_attrs.apply(lambda attrs: attrs.get("tactic_conf")).tolist()

    return (
        avg_percent,
        entity_percent,
        behave_percent,
        tactic_percent,
        entropy(article_ids),
        entropy_one_hot(tactic_vectors),
    )

def _min_max_normalize(values):
    """Rescale a numeric Series to [0, 1]; constant or empty input maps to zeros."""
    values = values.astype(float)
    low = values.min()
    high = values.max()
    if values.empty or high == low:
        # Avoid 0/0: with no spread every element gets the same score, 0.
        return pd.Series(0.0, index=values.index)
    return (values - low) / (high - low)


def calculate_community_scores(listofcommunities, inputG):
    """Score every community and return the scores as a sorted DataFrame.

    For each community (a collection of node ids) the induced sub-graph is
    built with ``create_new_graph`` and summarised with
    ``calculate_graph_stats``.  The two entropies are min-max normalised
    across all communities, ``avg_score`` is the mean of the three percentages
    and the two normalised entropies, and ``normalized_avg_score`` is the
    min-max normalised ``avg_score``.

    All three normalisations yield 0 when every community has the same value;
    previously the article-entropy one divided by zero in that case.  An empty
    ``listofcommunities`` yields an empty frame with the full set of columns.

    Args:
        listofcommunities: List of communities; it is copied and the copy is
            shuffled with ``np.random.shuffle`` before scoring.
        inputG: Graph from which the community sub-graphs are taken.

    Returns:
        DataFrame with one row per community, sorted by ``avg_score`` in
        descending order.
    """
    columns = [
        "community_nodes",
        "entity_percent",
        "behave_percent",
        "tactic_percent",
        "avg_precent",
        "article_entropy",
        "tactic_entropy",
    ]
    listofcommunities = listofcommunities.copy()
    np.random.shuffle(listofcommunities)

    # Collect plain rows and build the frame once instead of concatenating a
    # one-row frame per community.
    rows = []
    for one_community in listofcommunities:
        graph_one_community = create_new_graph(one_community, inputG)
        (
            avg_precent,
            entity_percent,
            behave_percent,
            tactic_percent,
            article_entropy,
            tactic_entropy,
        ) = calculate_graph_stats(graph_one_community)
        rows.append(
            {
                "community_nodes": one_community,
                "entity_percent": entity_percent,
                "behave_percent": behave_percent,
                "tactic_percent": tactic_percent,
                "avg_precent": avg_precent,
                "article_entropy": article_entropy,
                "tactic_entropy": tactic_entropy,
            }
        )
    df_community_and_scores = pd.DataFrame(rows, columns=columns)

    df_community_and_scores["normalized_article_entropy"] = _min_max_normalize(
        df_community_and_scores["article_entropy"]
    )
    df_community_and_scores["normalized_tactic_entropy"] = _min_max_normalize(
        df_community_and_scores["tactic_entropy"]
    )

    df_community_and_scores["avg_score"] = (
        df_community_and_scores["entity_percent"]
        + df_community_and_scores["behave_percent"]
        + df_community_and_scores["tactic_percent"]
        + df_community_and_scores["normalized_article_entropy"]
        + df_community_and_scores["normalized_tactic_entropy"]
    ) / 5

    df_community_and_scores = df_community_and_scores.sort_values(
        by=["avg_score"], ascending=False
    )

    df_community_and_scores["normalized_avg_score"] = _min_max_normalize(
        df_community_and_scores["avg_score"]
    )

    return df_community_and_scores

def create_new_graph(node_list, graph):
    """Return an independent copy of the sub-graph induced by ``node_list``.

    Nodes that end up without any edge inside the sub-graph are removed.
    """
    induced = graph.subgraph(node_list).copy()
    induced.remove_nodes_from(list(nx.isolates(induced)))
    return induced

def get_community_size(folder_name):
    """Parse the integer that follows the last underscore of ``folder_name``."""
    _, _, suffix = folder_name.rpartition("_")
    return int(suffix)

### Load data
# Tactic names; tactic indices found on edges are used to index this list
# when edge labels are drawn.
big_label_list=['Initial Access', 'Execution', 'Defense Evasion', 'Command and Control', 'Privilege Escalation', 'Persistence','Lateral Movement','DataLeak','Exfiltration','Impact']
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source. The path is machine-specific.
with open('/home/local/XXXXAD/user/Dropbox (XXXX)/code/G_Value_forXXXX.pkl', 'rb') as f:
    G_Value = pickle.load(f)
# Maps pickle file name -> unpickled content of that file.
communities = {}

# Candidate result files in the current working directory, selected by their
# name prefix and the '.pkl' suffix.
files1 = [f for f in os.listdir('.') if f.startswith('communities_value_') and f.endswith('.pkl')]
files2 = [f for f in os.listdir('.') if f.startswith('ownAlgorithm_threshold_') and f.endswith('.pkl')]
files=files1+files2

for file in files:
    with open(file, 'rb') as f:
        # NOTE(review): the name `community` is later rebound by
        # `import community`; avoid relying on this variable after the loop.
        community = pickle.load(f)
        communities[file] = community

In [None]:
# CTIKG Community Detection: Base Function
import community
import random
from collections import deque
import networkx as nx
import os
def clear_file(file_path):
    """Reset ``file_path`` to an empty file.

    Any existing file is removed and an empty one is created in its place.
    Missing parent directories are created first, so a path such as
    ``./tmp/part1_.txt`` works even when ``./tmp`` does not exist yet.

    Args:
        file_path: Path of the file to (re)create empty.
    """
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    if os.path.exists(file_path):
        os.remove(file_path)
    with open(file_path, 'w'):
        pass

def CTIKG_Community_Detection_GetEdgesIntoDict(inputG):
    """Turn the edges of ``inputG`` into a symmetric adjacency dictionary.

    Returns a dict mapping every node that has at least one non-self-loop edge
    to the set of its neighbours.  Self-loops are ignored.
    """
    adjacency = {}
    for pair in inputG.edges():
        if len(pair) == 0:
            continue
        a, b = pair[0], pair[1]
        if a != b:
            adjacency.setdefault(a, set()).add(b)
            adjacency.setdefault(b, set()).add(a)
    return adjacency

def CTIKG_Community_Detection_FirstPartition(edges,first_part_file):
    """Partition each node's neighbourhood and write the local communities.

    For every node ``n`` of the adjacency dict, the graph formed by the edges
    among ``n``'s neighbours is partitioned with ``community.best_partition``.
    Each resulting group is written to ``first_part_file`` as one line: the
    group's members separated by spaces, followed by ``n`` itself.  Nodes
    whose neighbours share no edge among themselves produce no output.

    Args:
        edges: Dict mapping node -> set of neighbours; every neighbour must
            itself be a key (as produced by a symmetric adjacency builder).
        first_part_file: Path of the text file to (over)write.
    """
    # The context manager guarantees the file is closed even if the
    # partitioning raises part-way through.
    with open(first_part_file, "w") as out:
        for n in edges:
            # Neighbours are renumbered 0..k-1 for the local graph.
            index = {}
            reverse_index = {}
            for position, neighbor in enumerate(edges[n]):
                index[position] = neighbor
                reverse_index[neighbor] = position

            # Edges among the neighbours, each taken once (higher -> lower).
            to_add_edges = [
                (reverse_index[m], reverse_index[k])
                for m in reverse_index
                for k in edges[m]
                if k in reverse_index and reverse_index[k] < reverse_index[m]
            ]
            if not to_add_edges:
                continue

            G = nx.Graph()
            G.add_nodes_from(index)
            G.add_edges_from(to_add_edges)
            dict_H = community.best_partition(G)

            # Group local node numbers by their partition label.
            H = {}
            for node, label in dict_H.items():
                H.setdefault(label, set()).add(node)

            for comm in H.values():
                for c in comm:
                    out.write(str(index[int(c)]) + " ")
                out.write(str(n))
                out.write("\n")

def CTIKG_Community_Detection_Jaccard(set1, set2):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two collections.

    Args:
        set1: Iterable of hashable items.
        set2: Iterable of hashable items.

    Returns:
        A float in ``[0, 1]``.  When both inputs are empty the similarity is
        defined here as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    set1 = set(set1)
    set2 = set(set2)
    union = set1 | set2
    if not union:
        return 0.0
    return float(len(set1 & set2)) / float(len(union))

def CTIKG_Community_Detection_GetMembership(first_part_file, membership_file, min_size=None):
    """Write, for every node, the indices of the communities it belongs to.

    ``first_part_file`` holds one community per line as whitespace-separated
    node names.  Lines are numbered from 0; only lines with at least
    ``min_size`` names contribute.  ``membership_file`` receives one line per
    node: the node name followed by the line numbers of its communities, each
    followed by a space.

    The input is read as UTF-8 text so that node names are written verbatim;
    reading it as bytes made ``str()`` emit ``b'name'`` representations.

    Args:
        first_part_file: Path of the community file to read.
        membership_file: Path of the membership file to (over)write.
        min_size: Minimum number of names a line needs to be counted.  When
            ``None`` the module-level ``min_comm_size`` is used.
    """
    if min_size is None:
        # Backward-compatible default: the module-level setting.
        min_size = min_comm_size

    node_membership = {}
    with open(first_part_file, "r", encoding="utf-8") as first_part:
        # Every line advances the counter, including the ones that are too
        # short, so indices keep matching line numbers in the file.
        for count, read_line in enumerate(first_part):
            t = read_line.rstrip().split()
            if len(t) >= min_size:
                for mem in t:
                    node_membership.setdefault(mem, set()).add(count)

    with open(membership_file, "w", encoding="utf-8") as out:
        for n, in_comms in node_membership.items():
            out.write(str(n) + " ")
            for c in in_comms:
                out.write(str(c) + " ")
            out.write("\n")


def get_subset_graph(in_undirected_G, percent_to_keep):
    """Build a random sub-sample of a graph as a new ``nx.Graph``.

    A fraction ``percent_to_keep`` of the nodes and, independently, the same
    fraction of the edges are drawn with ``random.sample``.  Both samples are
    added to the result, so end points of sampled edges are included even when
    they were not among the sampled nodes.
    """
    all_nodes = list(in_undirected_G.nodes())
    all_edges = list(in_undirected_G.edges())
    node_quota = int(len(all_nodes) * percent_to_keep)
    edge_quota = int(len(all_edges) * percent_to_keep)
    # Nodes are sampled before edges; the order fixes the random stream.
    kept_nodes = random.sample(all_nodes, node_quota)
    kept_edges = random.sample(all_edges, edge_quota)
    subset = nx.Graph()
    subset.add_nodes_from(kept_nodes)
    subset.add_edges_from(kept_edges)
    return subset



In [None]:
# CTIKG Community Detection: Type-based Grouping
import random
import networkx as nx
# Scratch files used by the partitioning steps.
first_part_file = "./tmp/part1_.txt"
membership_file = "./tmp/membership_.txt"
sim_file = "./tmp/simfile_.txt"
# Start from empty scratch files.
clear_file(first_part_file)
clear_file(membership_file)
clear_file(sim_file)
# NOTE(review): G_Value_undirected_subset is not assigned at module level in
# the visible code (the same name is only a local inside get_subset_graph);
# presumably it is created in another cell - confirm before running.
edges = CTIKG_Community_Detection_GetEdgesIntoDict(G_Value_undirected_subset)
# Module-level settings; min_comm_size is read as a global by
# CTIKG_Community_Detection_GetMembership.
min_comm_size=3
sim_threshold=0.25
global_overlap_threshold=0.25
# Writes one line per local community into first_part_file.
CTIKG_Community_Detection_FirstPartition(edges,first_part_file)
def CTIKG_Community_Detection_Filter_only_MalwareCVEActor(file_path, in_entity_set):
    """Keep only the lines of ``file_path`` whose last token is in ``in_entity_set``.

    Each line is stripped and split on single spaces.  The untouched original
    content is saved next to the file as ``file_path + '.bak'`` (replacing any
    previous backup), the file is rewritten with the surviving lines joined by
    single spaces, and a summary of the line counts is printed.
    """
    backup_path = file_path + '.bak'
    with open(file_path, 'r') as source:
        lines = source.readlines()

    tokenized = (raw.strip().split(' ') for raw in lines)
    filtered_lines = [tokens for tokens in tokenized if tokens[-1] in in_entity_set]

    if os.path.exists(backup_path):
        os.remove(backup_path)
    with open(backup_path, 'w') as backup:
        backup.writelines(lines)
    with open(file_path, 'w') as target:
        target.writelines(' '.join(tokens) + '\n' for tokens in filtered_lines)

    removed = len(lines) - len(filtered_lines)
    print('The length of original file:', len(lines), 'The length of filtered file:', len(filtered_lines), 'The difference:', removed)

# Keep only the communities whose last token (the node the community was
# built around) is in entity_conf_set, then derive the node membership file.
# NOTE(review): entity_conf_set is not defined in the visible code -
# presumably it is built in another cell; confirm before running.
CTIKG_Community_Detection_Filter_only_MalwareCVEActor(first_part_file, entity_conf_set)
CTIKG_Community_Detection_GetMembership(first_part_file, membership_file)

In [None]:
# CTIKG Community Detection: Community Fusion
import pickle
# NOTE(review): absolute, machine-specific path; other cells refer to this
# file through the relative path 'tmp/part1_.txt'.
with open('/home/local/XXXXAD/user/Dropbox (XXXX)/code/tmp/part1_.txt', 'r') as f:
    lines = f.readlines()
# One embedding per community line, in file order.
lines_result=[]
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
with open('Embedding_BY_BERT.pickle', 'rb') as handle:
    embeddings_dict = pickle.load(handle)
# The dictionary is keyed by the raw line text, trailing newline included, as
# returned by readlines(); a line without an entry raises KeyError.
for i in lines:
    lines_result.append(embeddings_dict[i])
    
# Keep only lines containing at least two spaces, together with their
# embeddings; the two lists stay index-aligned.
lines_result_possible=[]
lines_result_possible_truevalue=[]
for i in range(len(lines_result)):
    if lines[i].count(' ')>=2:
        lines_result_possible_truevalue.append(lines[i])
        lines_result_possible.append(lines_result[i])
        
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from numpy.linalg import norm

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``."""
    scale = norm(a) * norm(b)
    return np.dot(a, b) / scale

# Pairwise values between all kept embeddings. Because the custom metric is
# cosine_similarity, `dist` holds similarities (higher = closer) despite its
# name.
dist = pdist(lines_result_possible, metric=cosine_similarity)

# Expand the condensed vector into a full symmetric matrix.
# NOTE(review): squareform presumably fills the diagonal with 0, not with the
# self-similarity 1 - confirm against the SciPy documentation.
dist_mat = squareform(dist)

# Mean similarity of each community to the others (column means of the matrix).
sims = np.mean(dist_mat, axis=0)

# Histogram of the per-community mean similarity.
plt.hist(sims, bins=50, color='blue', edgecolor='black')
plt.xlabel('Cosine similarity')
plt.ylabel('Frequency')
plt.title('Similarity distribution')
plt.show()

# Passed to threshold_percentile as the `percentile` argument, and also used
# in the name of the result pickle.
threshold_V=0.15

def threshold_percentile(sims, percentile):
    """Return the value found ``percentile`` percent into ``sims`` sorted descending.

    ``percentile=0`` yields the largest value.  The position is clamped to the
    last element, so ``percentile=100`` yields the smallest value instead of
    indexing one past the end.

    Args:
        sims: Non-empty iterable of comparable values.
        percentile: Position in percent, counted from the largest value.

    Returns:
        The element at that position of the descending ordering.

    Raises:
        IndexError: If ``sims`` is empty.
    """
    ordered = sorted(sims, reverse=True)
    position = min(int(len(ordered) * percentile / 100), len(ordered) - 1)
    return ordered[position]
import os
# Remove a similarity file left over from a previous run.
if os.path.exists('tmp/simfile_.txt'):
    os.remove('tmp/simfile_.txt')

# Map each kept line's text to its line number in the first-partition file.
# list.index returns the first match, so identical lines share one index.
value_p1file_index={}
for line_this in lines_result_possible_truevalue:
    index_loc=lines.index(line_this)
    value_p1file_index[line_this]=index_loc
    
result_newp1=[]
# Upper triangle of the matrix, diagonal included, flattened.
total_sim=[]
for i in range(len(dist_mat)):
    for j in range(i,len(dist_mat)):
        total_sim.append(dist_mat[i][j])

# Similarity cut-off taken from the flattened values at position threshold_V
# percent of their descending ordering.
threshold_this=threshold_percentile(total_sim, threshold_V)
print('threshold_this:',threshold_this)
import tqdm
for i in tqdm.tqdm(range(len(dist_mat))):
    for j in range(i,len(dist_mat)):
        if i!=j:
            if dist_mat[i][j]>threshold_this:
                # Line numbers of the two communities in the partition file.
                aindex=value_p1file_index[lines_result_possible_truevalue[i]]
                bindex=value_p1file_index[lines_result_possible_truevalue[j]]
                aindex_split=lines_result_possible_truevalue[i].split(' ')
                bindex_split=lines_result_possible_truevalue[j].split(' ')
                # Drop the newline that readlines() left on the last token.
                aindex_split[-1]=aindex_split[-1].replace('\n','')
                bindex_split[-1]=bindex_split[-1].replace('\n','')
                # Despite its name this is the size of the intersection, i.e.
                # the number of members the two communities share.
                union_len=len(set(aindex_split).intersection(set(bindex_split)))
                a_len=len(aindex_split)
                b_len=len(bindex_split)
                # Only pair up small communities (fewer than 10 tokens each)
                # that share at least one member.
                if a_len <10 and b_len <10 and union_len>=1:
                    # Fields: index_a index_b similarity shared size_a size_b similarity
                    result_newp1.append(str(aindex)+' '+str(bindex)+' '+str(dist_mat[i][j])+' '+str(union_len)+' '+str(a_len)+' '+str(b_len)+' '+str(dist_mat[i][j]))
# Save the result as a text file, one pair per line, in tmp/simfile_.txt.
with open('tmp/simfile_.txt', 'w') as f:
    for item in result_newp1:
        f.write("%s\n" % item)
        
def CTIKG_Community_Detection_SecondPartition(overlap_threshold, sim_file, first_part_file):
    """Run the second clustering stage and return its result unchanged.

    Thin wrapper that forwards its arguments, reordered, to
    ``CTIKG_Community_Detection_ModClusteringSingleBig``.
    """
    return CTIKG_Community_Detection_ModClusteringSingleBig(
        sim_file, first_part_file, overlap_threshold
    )

def CTIKG_Community_Detection_ModClusteringSingleBig(sim_file, first_part_file, overlap_threshold=0):
    """Merge first-stage communities by clustering their similarity graph.

    ``sim_file`` holds one pair per line: two integer line numbers into
    ``first_part_file`` followed by a similarity used as edge weight (further
    fields are ignored).  The weighted graph of these pairs is partitioned
    with ``community.best_partition``; for every resulting group the members
    of all referenced lines of ``first_part_file`` are united.

    Args:
        sim_file: Path of the similarity file.
        first_part_file: Path of the first-stage community file; read as
            bytes, so the returned members are ``bytes`` objects.
        overlap_threshold: Accepted for interface compatibility; the value is
            not used by the computation.

    Returns:
        Dict mapping a running group number to the set of member names.
    """
    # Both input files are read inside context managers so their handles are
    # released even when parsing or partitioning fails.
    comm_edges = {}
    to_add_edges = []
    with open(sim_file, "r") as sim_in:
        for read_line in sim_in:
            t = read_line.rstrip().split()
            if len(t) == 0:
                continue
            node1 = int(t[0])
            node2 = int(t[1])
            comm_edges.setdefault(node1, set())
            comm_edges.setdefault(node2, set())
            # Each unordered pair contributes one edge, weighted by the first
            # similarity seen for it.
            if node2 not in comm_edges[node1]:
                comm_edges[node1].add(node2)
                comm_edges[node2].add(node1)
                to_add_edges.append((node1, node2, {"weight": float(t[2])}))

    G = nx.Graph()
    # NOTE(review): this adds nodes 0..k-1 (k = number of distinct ids seen)
    # regardless of which line numbers actually occur in sim_file, so lines
    # that never appear in a pair can still end up as singleton groups.
    # Kept as-is because the intended behaviour cannot be confirmed here.
    G.add_nodes_from(range(len(comm_edges)))
    G.add_edges_from(to_add_edges)
    dict_H = community.best_partition(G)

    # Group the graph nodes (line numbers) by partition label.
    H1 = {}
    for node, label in dict_H.items():
        H1.setdefault(label, set()).add(node)

    with open(first_part_file, "rb") as part_in:
        part_lines = part_in.readlines()

    all_comms = {}
    for i, big_comm in enumerate(H1.values()):
        members = set()
        for comm in big_comm:
            for t1 in part_lines[int(comm)].rstrip().split():
                members.add(t1)
        all_comms[i] = members

    return all_comms

def CTIKG_Community_Detection_GetModComms(G):
    """Partition ``G`` with ``community.best_partition`` and return the groups.

    Returns a list of node sets, one per partition label, in order of each
    label's first appearance.
    """
    groups = {}
    for node, label in community.best_partition(G).items():
        groups.setdefault(label, set()).add(node)
    return list(groups.values())

def CTIKG_Community_Detection_CleanComms(to_clean):
    """Drop empty and repeated communities and decode member names.

    ``to_clean`` maps arbitrary keys to collections of ``bytes`` names.  The
    values are visited in order; empty ones are skipped and a community whose
    member set equals that of an earlier one is dropped.  Each surviving
    community is returned as a list of UTF-8 decoded strings.
    """
    coms = []
    already_emitted = set()
    for raw in to_clean.values():
        if len(raw) == 0:
            continue
        members = set(raw)
        signature = frozenset(members)
        if signature in already_emitted:
            continue
        already_emitted.add(signature)
        coms.append([name.decode("utf-8") for name in members])
    return coms

import networkx as nx
import community
# Inputs of the second stage: the pair similarities written earlier and the
# first-stage community file.
sim_file='tmp/simfile_.txt'
first_part_file='tmp/part1_.txt'
# Module-level setting, reset to 0 for this run.
global_overlap_threshold=0
return_vals = CTIKG_Community_Detection_SecondPartition(0, sim_file, first_part_file)
# Remove empty/duplicate communities and decode the member names.
coms = CTIKG_Community_Detection_CleanComms(return_vals)
print('Find communities number:',len(coms))

import pickle
# The result file name embeds the percentile setting used for the cut-off.
fianlname='ownAlgorithm_threshold_'+str(threshold_V)+'.pkl'
with open(fianlname, 'wb') as f:
    pickle.dump(coms, f)
print('save as pkl file name:',fianlname)

In [None]:
# CTIKG Community Detection: Community Filters
import tqdm
# Number of communities that pass both filters below; it has to start at 0,
# otherwise the first increment raises NameError.
count = 0
# The loop variable is deliberately not called `community`: that name is the
# imported partitioning module, and rebinding it would break any later call
# to community.best_partition.
for candidate_nodes in tqdm.tqdm(coms):
    graph_com = create_new_graph(candidate_nodes, G_Value)
    article_id = check_graph_by_unique_article_id(graph_com)
    tactic_number = check_unique_tactic(get_graph_tactics(graph_com))
    # Keep communities that span at least two articles and at least three
    # distinct tactics, and save a drawing of each.
    if tactic_number >= 3 and article_id:
        count = count + 1
        new_draw_graph(graph_com, save_folder='xxxxx', saveorshow='save')