In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn import random_projection
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import kneighbors_graph

# Phase 1: Preprocessing
1. Load the dataset with the static features
2. Apply one hot encoding to the categorical features
3. Apply `VarianceThreshold` to delete 'constant' columns i.e. columns with 0 variance

In [2]:
with open("/home/luca/ml-malware-concept-drift/data/dataset/dataset.pickle", "rb") as f:
    df_1 = pickle.load(f).reset_index()
    columns_to_drop = [c for c in df_1.columns if c.startswith("ngram_")]
    print(f"Dropping {len(columns_to_drop)} columns")
    df_1 = df_1.drop(columns_to_drop, axis=1)
    print(df_1.shape)

with open(
    "/home/luca/ml-malware-concept-drift/data/dataset/dataset_opcodes.pickle", "rb"
) as f:
    df_opcodes = pickle.load(f).reset_index()
    print(df_opcodes.shape)

with open(
    "/home/luca/ml-malware-concept-drift/data/dataset/dataset_final_1_1.pickle", "rb"
) as f:
    df_ngrams = pickle.load(f)
    # df_ngrams = df_ngrams.drop([c for c in df_ngrams.columns if not c.startswith("ngram_")], axis=1).reset_index()
    print(df_ngrams.shape)

df = pd.merge(
    left=df_1,
    right=pd.merge(left=df_opcodes, right=df_ngrams, on="sample_hash"),
    on="sample_hash",
)
df = df.drop(["ms_elapsed"], axis=1)
df.head()

Dropping 13000 columns
(67000, 29232)
(67000, 2501)
(67000, 13000)


Unnamed: 0,sample_hash,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,...,ngram_b'\x00\x00\x84\xc0',ngram_b'\x0c\x8bD$\x08\xd1',ngram_b'\xfd\xff\xff\x85\xc0t',ngram_b'ltiB',ngram_b'e\x00a\x00',ngram_b'ExW\x00',ngram_b'iday\x00\x00',ngram_b'\xa4\xfe\xff\xff',ngram_b'\x83\xc8\xff\xeb',ngram_b'eAlloc'
0,cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302...,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,...,True,False,False,False,False,False,False,False,False,False
1,00ebebc75f61527282cee19ab7aed80693b63fbb969e71...,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,...,False,False,False,False,True,False,False,False,False,False
2,f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e69...,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,...,False,False,False,False,False,False,False,False,False,False
3,d694a203bb211751669f8742db877e8ebd5eff5b126abc...,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,...,True,True,True,True,True,True,True,True,True,True
4,15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f...,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,...,True,False,True,True,False,False,False,False,True,False


In [3]:
categorical_columns = ["pesectionProcessed_entrypointSection_name"]
print(
    f"Unique number of elements of categorical features: {[len(set(df[c])) for c in categorical_columns]}"
)

Unique number of elements of categorical features: [1622]


In [4]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int32)
array_hot_encoded = enc.fit_transform(df[categorical_columns])
cat_df = pd.DataFrame(
    array_hot_encoded,
    index=df.index,
    columns=enc.get_feature_names_out(categorical_columns),
)
cat_df

Unnamed: 0,pesectionProcessed_entrypointSection_name_,pesectionProcessed_entrypointSection_name_����0,pesectionProcessed_entrypointSection_name_����a,pesectionProcessed_entrypointSection_name_����oc,pesectionProcessed_entrypointSection_name_����ta,pesectionProcessed_entrypointSection_name_���,pesectionProcessed_entrypointSection_name_���0,pesectionProcessed_entrypointSection_name_���1,pesectionProcessed_entrypointSection_name_��@.data,pesectionProcessed_entrypointSection_name_���,...,pesectionProcessed_entrypointSection_name_zsjkdbik,pesectionProcessed_entrypointSection_name_zvmqxlbk,pesectionProcessed_entrypointSection_name_zydklvkj,pesectionProcessed_entrypointSection_name_zyefvnbm,pesectionProcessed_entrypointSection_name_zzbszero,pesectionProcessed_entrypointSection_name_| ⚛️,pesectionProcessed_entrypointSection_name_}@,pesectionProcessed_entrypointSection_name_┞ Quiz,pesectionProcessed_entrypointSection_name_▪️GO,pesectionProcessed_entrypointSection_name_📍SWED
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df = df.drop(categorical_columns, axis=1)
X = pd.merge(left=df, right=cat_df, left_index=True, right_index=True)
X = X.set_index("sample_hash")
X.head()

Unnamed: 0_level_0,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,header_BaseOfCode,...,pesectionProcessed_entrypointSection_name_zsjkdbik,pesectionProcessed_entrypointSection_name_zvmqxlbk,pesectionProcessed_entrypointSection_name_zydklvkj,pesectionProcessed_entrypointSection_name_zyefvnbm,pesectionProcessed_entrypointSection_name_zzbszero,pesectionProcessed_entrypointSection_name_| ⚛️,pesectionProcessed_entrypointSection_name_}@,pesectionProcessed_entrypointSection_name_┞ Quiz,pesectionProcessed_entrypointSection_name_▪️GO,pesectionProcessed_entrypointSection_name_📍SWED
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,4096,...,0,0,0,0,0,0,0,0,0,0
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,4096,...,0,0,0,0,0,0,0,0,0,0
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,4096,...,0,0,0,0,0,0,0,0,0,0
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,4096,...,0,0,0,0,0,0,0,0,0,0
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,4096,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold()
X_vt = sel.fit_transform(X)
X_vt.shape

(67000, 46181)

In [7]:
features_mask = sel.get_support()
zero_var_columns = set(X.columns) - set(X.loc[:, features_mask].columns)
# with open("zero_var_columns.txt", 'w') as file:
#     for c in zero_var_columns:
#         file.write(c + '\n')

In [8]:
sdf = pd.DataFrame(X_vt, columns=list(set(X.columns) - zero_var_columns), index=X.index)
sdf.head()

Unnamed: 0_level_0,imp___vbafreevar,str_Invalid,str_|$$},str_D$(p,imp_mmioseek,ngram_b'ndCl',str_1!1+12161<1@1F1P1Z1d1n1u1y1,str_L$@H3,opcode_xor mov jmp,str_l$@E,...,ngram_b'a\xff\xff\xff',ngram_b'\x8aF\x03\x88G\x03',ngram_b'\xaa\x01\x00\x00',str_-tvf,str_June,str_~DWP,ngram_b'EndOfF',str_u39},str_;K$|,str_D$$;T$
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,1666137.0,7.996651,1024.0,5358807.0,4194304.0,5365760.0,310272.0,56832.0,0.0,4096.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,10543104.0,4.821304,4096.0,4512.0,4194304.0,118784.0,94208.0,20480.0,0.0,4096.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,57368.0,4.979988,4096.0,9759.0,4194304.0,57344.0,8192.0,45056.0,0.0,4096.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,7922688.0,7.950081,1024.0,155508.0,4194304.0,7950336.0,570880.0,7350784.0,0.0,4096.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,159744.0,5.911521,4096.0,86524.0,4194304.0,159744.0,86016.0,69632.0,0.0,4096.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Scaler: Apply MinMax scaling before PCA

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
sdf_normalized = scaler.fit_transform(sdf)
sdf_normalized = np.nan_to_num(sdf_normalized)

array([[2.85713512e-03, 9.99581391e-01, 1.46094805e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.80892343e-02, 6.02390637e-01, 9.25267101e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.66281839e-05, 6.22239813e-01, 9.25267101e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.28784385e-04, 5.62105944e-01, 1.46094805e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.28784385e-04, 5.62147088e-01, 1.46094805e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.13787262e-04, 7.54936930e-01, 9.25267101e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [12]:
sdf_normalized = pd.DataFrame(sdf_normalized, columns=sdf.columns, index=sdf.index)

In [13]:
sdf_normalized.head()

Unnamed: 0_level_0,imp___vbafreevar,str_Invalid,str_|$$},str_D$(p,imp_mmioseek,ngram_b'ndCl',str_1!1+12161<1@1F1P1Z1d1n1u1y1,str_L$@H3,opcode_xor mov jmp,str_l$@E,...,ngram_b'a\xff\xff\xff',ngram_b'\x8aF\x03\x88G\x03',ngram_b'\xaa\x01\x00\x00',str_-tvf,str_June,str_~DWP,ngram_b'EndOfF',str_u39},str_;K$|,str_D$$;T$
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,0.002857,0.999581,1.460948e-07,0.020755,0.002018,0.003215,7.2e-05,1.3e-05,0.0,1.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,0.018089,0.602391,9.252671e-07,1.6e-05,0.002018,7.1e-05,2.2e-05,5e-06,0.0,1.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,9.7e-05,0.62224,9.252671e-07,3.6e-05,0.002018,3.4e-05,2e-06,1e-05,0.0,1.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,0.013593,0.993756,1.460948e-07,0.000601,0.002018,0.004763,0.000133,0.001711,0.0,1.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,0.000272,0.738761,9.252671e-07,0.000333,0.002018,9.5e-05,2e-05,1.6e-05,0.0,1.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
sdf_normalized.to_csv("normalized_dataset.csv", escapechar="\\")

# Phase 2: Apply Incremental PCA

In [14]:
del cat_df
del df_opcodes
del df_ngrams
del df_1
del X_vt
del sdf
del X
del df

In [None]:
from sklearn.decomposition import IncrementalPCA

pca_arr = [
    IncrementalPCA(n_components=n_components) for n_components in [300, 400, 500]
]
for pca in pca_arr:
    print("Fitting")
    pca.fit(sdf_normalized)

Fitting


In [None]:
# from sklearn.decomposition import SparsePCA
# sparse_pca = SparsePCA(n_components=500)
# X_std_scaler_spca = sparce_pca.fit_transform(X_vt_df)

In [12]:
np.sum(pca.explained_variance_ratio_)

0.8560958970577225

In [13]:
loading_matrix = pca.components_
loading_matrix.shape

(300, 46181)

In [14]:
fea_magnitude = np.linalg.norm(loading_matrix, axis=0)
fea_magnitude.shape

(46181,)

In [15]:
feature_idx = np.argsort(-1 * fea_magnitude)
fea_magnitude[feature_idx[0:8000]]

array([0.22063661, 0.22054637, 0.22030525, ..., 0.11959986, 0.11959809,
       0.11959116])

In [16]:
print(feature_idx[0:8000])

[40072 37917 33304 ... 40767 23999 41776]


In [17]:
print(df.columns[feature_idx[0:8000]])  ### print out the top 8000 featuers' names

Index(['str_InitializeAcl', 'ngram_b'apFree'', 'str_u)j,',
       'ngram_b'\x04\x00\x00\x83'', 'str_Buginese',
       'pesectionProcessed_entrypointSection_name_aedcqpmu', 'opcode_and pop',
       'ngram_b'\x00\x00\x00\x00\x00\xd6'', 'str_t 9^', 'str_QQQQP',
       ...
       'str_\$ W', 'str_DockClient', 'str_TClass', 'ngram_b'\x033\xc0@'',
       'ngram_b'\x8bE\x0cS'', 'str_NoRun', 'str_;$;(;.;2;8;B;L;V;`;g;k;q;u;{;',
       'ngram_b'\x00\x00\x81\x9f'', 'str_TRttiInstancePropertyEx',
       'ngram_b'lign''],
      dtype='object', length=8000)


In [18]:
sdf_subcol = sdf_normalized[
    :, feature_idx[0:8000]
]  ### choosing the top features only for clustering
sdf_subcol.shape

(6700, 8000)

In [1]:
#### clustering
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt


def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(
    linkage="complete", metric="cosine", distance_threshold=0, n_clusters=None
)
# model = model.fit(sdf_subcol)
model = model.fit(sdf_normalized)
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=5)

NameError: name 'sdf_normalized' is not defined

In [None]:
model = AgglomerativeClustering(
    linkage="complete", metric="cosine", n_clusters=670
).fit(sdf_normalized)

labels = model.labels_
true_labels = []
for k in range(670):
    true_labels.extend(10 * [k])

from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

print(normalized_mutual_info_score(true_labels, labels))
print(adjusted_mutual_info_score(true_labels, labels))