<a href="https://colab.research.google.com/github/stwater20/AIS3-2024-Material/blob/main/AIS3_Lab7_apt_group_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [2]:
import pandas as pd

# Load the uploaded file
# In the repo, the file is in ./Datasets/ThreatActor
file_path = 'ThreatActor_cleaned.xlsx'

# Read the Excel file
df = pd.read_excel(file_path)
df

Unnamed: 0,source name,mapping description
0,APT-C-36,APT-C-36 has downloaded binary data from a spe...
1,APT-C-36,APT-C-36 has prompted victims to accept macros...
2,APT-C-36,APT-C-36 has disguised its scheduled tasks as ...
3,APT-C-36,APT-C-36 has used port 4050 for C2 communicati...
4,APT-C-36,APT-C-36 has used ConfuserEx to obfuscate its ...
...,...,...
3382,menuPass,menuPass has used and modified open-source too...
3383,menuPass,menuPass has used legitimate access granted to...
3384,menuPass,menuPass has used valid accounts including sha...
3385,menuPass,menuPass executes commands using a command-lin...


In [3]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,source name,mapping description
0,APT3,APT3 has a tool that can delete files.
1,Equation,Equation has used an encrypted virtual file sy...
2,OilRig,OilRig has used LinkedIn to send spearphishing...
3,APT18,APT18 uses cmd.exe to execute commands on the ...
4,Mofang,Mofang has compressed the ShimRat executable w...


In [4]:
len(set(df["source name"].to_list()))

143

In [5]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['mapping description'])

# Labels
y = df['source name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Use RandomForestClassifier for classification
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Evaluate the classification performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9867256637168141
Classification Report:
                    precision    recall  f1-score   support

          APT-C-36       1.00      1.00      1.00         2
              APT1       1.00      1.00      1.00         4
             APT12       1.00      0.33      0.50         3
             APT17       1.00      1.00      1.00         1
             APT18       1.00      1.00      1.00         2
             APT19       1.00      1.00      1.00         1
             APT28       1.00      1.00      1.00        17
             APT29       1.00      1.00      1.00        13
              APT3       1.00      1.00      1.00         7
             APT32       1.00      1.00      1.00         7
             APT33       1.00      1.00      1.00         4
             APT37       1.00      1.00      1.00         8
             APT38       1.00      1.00      1.00         4
             APT39       1.00      1.00      1.00        12
             APT41       0.93      1.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the text descriptions using SentenceBERT
sentence_embeddings = model.encode(df['mapping description'],device=device)

# Labels
y = df['source name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, y, test_size=0.2, shuffle=False)

# Use RandomForestClassifier for classification
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Evaluate the classification performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Accuracy: 0.7876106194690266
Classification Report:
                    precision    recall  f1-score   support

          APT-C-36       1.00      0.50      0.67         2
              APT1       1.00      0.25      0.40         4
             APT12       0.00      0.00      0.00         3
             APT17       1.00      1.00      1.00         1
             APT18       1.00      1.00      1.00         2
             APT19       0.00      0.00      0.00         1
             APT28       0.74      1.00      0.85        17
             APT29       0.61      0.85      0.71        13
              APT3       0.75      0.86      0.80         7
             APT32       0.37      1.00      0.54         7
             APT33       1.00      0.75      0.86         4
             APT37       1.00      0.25      0.40         8
             APT38       0.50      0.25      0.33         4
             APT39       0.85      0.92      0.88        12
             APT41       0.58      1.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np



model = SentenceTransformer('all-MiniLM-L6-v2')

sentence_embeddings = model.encode(df['mapping description'])

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['source name'])

X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, y_encoded, test_size=0.2, shuffle=False)

train_embeddings = {label: [] for label in np.unique(y_train)}
for emb, label in zip(X_train, y_train):
    train_embeddings[label].append(emb)

y_pred = []
for test_emb in X_test:
    similarities = []
    for label in np.unique(y_train):
        avg_train_emb = np.mean(train_embeddings[label], axis=0)
        similarity = cosine_similarity([test_emb], [avg_train_emb])[0][0]
        similarities.append((label, similarity))
    best_label = max(similarities, key=lambda x: x[1])[0]
    y_pred.append(best_label)

y_test = label_encoder.inverse_transform(y_test)
y_pred = label_encoder.inverse_transform(y_pred)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9660766961651918
Classification Report:
                    precision    recall  f1-score   support

          APT-C-36       1.00      1.00      1.00         2
              APT1       1.00      0.75      0.86         4
             APT12       0.00      0.00      0.00         3
             APT17       1.00      1.00      1.00         1
             APT18       1.00      1.00      1.00         2
             APT19       1.00      1.00      1.00         1
             APT28       0.94      0.94      0.94        17
             APT29       1.00      1.00      1.00        13
              APT3       1.00      1.00      1.00         7
             APT30       0.00      0.00      0.00         0
             APT32       0.70      1.00      0.82         7
             APT33       1.00      1.00      1.00         4
             APT37       1.00      0.88      0.93         8
             APT38       1.00      1.00      1.00         4
             APT39       1.00      0.92      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
