# Live Locked Shields

In [1]:
# Identifier of the training run whose fine-tuned checkpoint is used below.
configuration = "fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no"
dataset_path = "datasets"
# The checkpoint directory is derived from the run identifier above.
model_path = (
    "trained_models/"
    f"fine_tune_all_years_ded_coa_med_seq-128_old_{configuration}"
    "/finetuned_model"
)

In [None]:
import os

# Per-source-host reader state, kept across calls so that each call only
# consumes what was appended to the documents since the previous call.
# Layout: ip_documents[ip1] = {"position": <file offset>, ip2: <last sentence>, ...}
ip_documents = {}

def get_next_sentence_pairs(document_path):
    """Collect sentence pairs from the lines newly appended to each host document.

    Every regular file in ``document_path`` is treated as the document of one
    host; the host id (``ip1``) is the file name up to ``".txt"``. Each
    non-blank line must look like ``ip2,start_time,end_time,sentence``.
    Only the first three commas are treated as separators, so the sentence
    itself may contain commas.

    For each line a tuple
    ``(first_sentence, second_sentence, ip1, ip2, start_time, end_time)``
    is produced:

    * if a sentence was already seen for ``(ip1, ip2)``, the pair is
      ``(previous sentence, current sentence)``;
    * otherwise the pair is ``(current sentence, "3 \\n")``.

    The read offset and the last sentence per ``(ip1, ip2)`` are stored in
    the module-level ``ip_documents`` dict, so repeated calls are
    incremental.

    Args:
        document_path: Directory containing the per-host documents.

    Returns:
        List of the tuples described above, files visited in sorted name
        order and lines in file order.

    Raises:
        ValueError: If a non-blank line has fewer than four comma-separated
            fields.
    """
    labelled_sentence_pairs = []
    # Sorted so the order of the result does not depend on the arbitrary
    # order in which os.listdir() returns directory entries.
    for file in sorted(os.listdir(document_path)):
        full_path = os.path.join(document_path, file)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as documents.
            continue

        ip1 = file.split(".txt")[0]
        state = ip_documents.setdefault(ip1, {"position": 0})
        with open(full_path, 'r') as f:
            # Resume where the previous call stopped (offset 0 the first time).
            f.seek(state["position"])
            lines = f.readlines()
            state["position"] = f.tell()

        for line in lines:
            if not line.strip():
                # Blank lines carry no record.
                continue
            fields = line.split(",", 3)
            if len(fields) != 4:
                raise ValueError(
                    "Malformed line in " + full_path + ": " + repr(line)
                )
            # The sentence deliberately keeps its trailing newline: the
            # previous `line.strip()` call discarded its result, so consumers
            # have always received newline-terminated sentences (the "3 \n"
            # placeholder below is newline-terminated as well).
            ip2, start_time, end_time, sentence = fields
            if ip2 in state:
                last_sentence = state[ip2]
                labelled_sentence_pairs.append((last_sentence, sentence, ip1, ip2, start_time, end_time))
            else:
                # First sentence seen for this host pair: pair it with the placeholder.
                labelled_sentence_pairs.append((sentence, "3 \n", ip1, ip2, start_time, end_time))
            state[ip2] = sentence

    return labelled_sentence_pairs

In [None]:
from bert_inference_dataset import *

# Directory holding one document per host for the live exploration phase.
live_documents_dir = "ls24_live_dataset/exploration_full"

# Read whatever is new in the documents and wrap it for BERT inference.
sentence_pairs = get_next_sentence_pairs(live_documents_dir)
data = BERTInferenceDataset(sentence_pairs, seq_length=128)

In [None]:
from evaluate_bert import *

# Same value as the model_path assembled in the configuration cell; it is
# re-assigned here after the star import.
model_path = "trained_models/fine_tune_all_years_ded_coa_med_seq-128_old_fi-100_ft-15000000_nb-30_ht-100000_di-uni_mx-1500_lg-no/finetuned_model"

# One embedding per entry of `data`, computed without labels.
embeddings = get_embeddings(
    model_path,
    data,
    batch_size=1024,
    resample=False,
    use_labels=False,
)

In [None]:
from evaluate_bert import *

# Single group: every embedding computed from the live exploration traffic.
embedding_groups = [embeddings]
group_names = ["All Traffic"]
visualize_embeddings(
    embedding_groups,
    group_names,
    "Embeddings during Exploration",
    method="tsne",
    sample=20000,
)

In [None]:
# One IP per line; surrounding whitespace (including the newline) is removed.
with open("rt_24_ips.txt", "r") as f:
    rt_ip_list = [raw_ip.strip() for raw_ip in f.readlines()]

# Set for constant-time membership tests in the partitioning step.
rt_ips = set(rt_ip_list)

In [None]:
# Parallel lists: index k of the three lists of a group describes the same flow.
rt_embeddings, rt_metadata, rt_data = [], [], []
other_embeddings, other_metadata, other_data = [], [], []

rt_group = (rt_embeddings, rt_metadata, rt_data)
other_group = (other_embeddings, other_metadata, other_data)

for embedding, meta, d in zip(embeddings, data.metadata, data):
    # Only the first metadata field (ip1) decides the group.
    ip1, ip2, start_time, end_time = meta
    target_group = rt_group if ip1 in rt_ips else other_group
    for store, item in zip(target_group, (embedding, meta, d)):
        store.append(item)

In [None]:
# numpy is needed from here on; importing it explicitly keeps this cell
# working on a fresh kernel instead of relying on `np` leaking out of an
# earlier star import.
import numpy as np

# Convert the per-group lists of embeddings into 2-D arrays so they can be
# passed to the pairwise similarity computation.
rt_embeddings = np.array(rt_embeddings)
other_embeddings = np.array(other_embeddings)

In [None]:
# Calculate the pairwise cosine similarities between the embeddings of the
# red-team group (rows) and of the remaining traffic (columns).
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = cosine_similarity(rt_embeddings, other_embeddings)

# Sanity check: expected to be (number of rt embeddings, number of other embeddings).
print(cosine_similarities.shape)

In [None]:
# Indices of the 50 "other" embeddings that are, on average, most similar to
# the rt embeddings (the variable keeps its historical name `top_5`).
mean_similarity_to_rt = np.mean(cosine_similarities, axis=0)
ascending_order = np.argsort(mean_similarity_to_rt)
top_5 = ascending_order[::-1][:50]


In [None]:
# For every selected index (`top_5` holds 50 entries), print the flow
# metadata with its mean similarity to the rt embeddings, followed by the
# corresponding dataset item.
for other_index in top_5:
    mean_similarity = np.mean(cosine_similarities[:, other_index])
    print(other_metadata[other_index], mean_similarity)
    print(other_data[other_index])

In [None]:
# NOTE(review): when the notebook is run top to bottom, `rt_embeddings` at this
# point holds the flows whose source IP is listed in rt_24_ips.txt and
# `other_embeddings` holds everything else; no classifier has been applied yet
# (the SVM cell comes next). The labels below ("Classified as Malicious" for
# `other_embeddings`, "Classified as Benign" for `rt_embeddings`) and the title
# therefore look inconsistent with the data being plotted — confirm the
# intended groups/labels before relying on this figure.
from evaluate_bert import *
visualize_embeddings([other_embeddings, rt_embeddings], ["Classified as Malicious", "Classified as Benign"], "Embeddings during Exploration by Classification", method="tsne", sample=20000)

In [None]:
import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
svm = joblib.load("svm_model.pkl")

# Classify the embeddings in batches. A row gets label 1 when its class-1
# probability is at least 0.01 and label 0 otherwise; label-0 rows are
# collected in `rt_embeddings`, label-1 rows in `bt_embeddings`.
# Note that `rt_embeddings` is re-bound here and no longer refers to the
# IP-based split from the earlier cells.
batch_size = 5000
bt_embeddings = []
rt_embeddings = []
labels = []
for batch_start in range(0, embeddings.shape[0], batch_size):
    print("Processing embedding ", batch_start, " of ", embeddings.shape[0])
    batch = embeddings[batch_start:batch_start + batch_size]
    class_one_probability = svm.predict_proba(batch)[:, 1]
    batch_labels = (class_one_probability >= 0.01).astype(int)

    labels.extend(batch_labels)
    rt_embeddings.extend(batch[batch_labels == 0])
    bt_embeddings.extend(batch[batch_labels == 1])

In [None]:
# Column layout of the CSV rows produced from the predictions.
columns = [
    "external_host",
    "internal_host",
    "start_time",
    "end_time",
    "label",
]

In [None]:
columns = ["external_host", "internal_host", "start_time", "end_time", "label"]
file_name = "ls24_live_dataset/predictions.csv"
all_data_file_name = "ls24_live_dataset/all_data.csv"

# Persist the classification results.
#   * all_data.csv    : every flow, five columns (the four metadata fields
#                       plus the predicted label).
#   * predictions.csv : only the flows with label 0, four columns (no label
#                       column) — this matches how the file is read back with
#                       `names=columns[:4]`.
# Previously the ",0" suffix was written to predictions.csv instead of
# all_data.csv, which left all_data.csv with ragged rows and gave
# predictions.csv one more column than the reader expects.
# Both files are opened in append mode, so re-running this cell appends the
# same rows again.
with open(file_name, 'a') as f, open(all_data_file_name, 'a') as f2:
    for label, metadata in zip(labels, data.metadata):
        ip1, ip2, start_time, end_time = metadata
        row = ip1 + "," + ip2 + "," + start_time + "," + end_time
        if label == 0:
            f2.write(row + ",0" + "\n")
            f.write(row + "\n")
        else:
            f2.write(row + ",1" + "\n")

In [None]:
import pandas as pd

# Load the stored predictions, naming the first four columns of the layout.
df = pd.read_csv(file_name, names=columns[:4])

# The 30 external hosts that appear most often in the predictions.
external_host_counts = df["external_host"].value_counts()
top_external_hosts = external_host_counts.head(30)
top_external_hosts

In [159]:
# Re-import `similarity_dataset` so that edits made to the module on disk are
# picked up without restarting the kernel. The reloaded module object is the
# cell's displayed value.
import importlib
import similarity_dataset
importlib.reload(similarity_dataset)

<module 'similarity_dataset' from '/cluster/raid/home/sosi/repos/Bert/similarity_dataset.py'>

In [162]:
from similarity_dataset import SimilarityDataset


def _load_similarity_dataset(year):
    """Build the SimilarityDataset of one year with that year's IP label file."""
    return SimilarityDataset(
        year,
        fine=True,
        label_path=f"ip_labels_{year}.txt",
        only_rt=False,
    )


data_2021 = _load_similarity_dataset(2021)
data_2022 = _load_similarity_dataset(2022)

In [163]:
import numpy as np


def _stack_with_labels(rt_vectors, benign_vectors):
    """Stack rt rows above benign rows and label them 1 (rt) and 0 (benign)."""
    features = np.concatenate([rt_vectors, benign_vectors])
    targets = np.concatenate([
        np.ones(rt_vectors.shape[0]),
        np.zeros(benign_vectors.shape[0]),
    ])
    return features, targets


# Host-level embeddings (X_*) together with the value returned alongside them (X_*_ip_*).
X_rt_21, X_rt_ip_21 = data_2021.get_rt_host_embeddings()
X_b_21, X_b_ip_21 = data_2021.get_benign_host_embeddings()
X_rt_22, X_rt_ip_22 = data_2022.get_rt_host_embeddings()
X_b_22, X_b_ip_22 = data_2022.get_benign_host_embeddings()

# Host-to-host embeddings (XX_*), same layout as above.
XX_rt_21, XX_rt_ip_21 = data_2021.get_rt_host_host_embeddings()
XX_b_21, XX_b_ip_21 = data_2021.get_benign_host_host_embeddings()
XX_rt_22, XX_rt_ip_22 = data_2022.get_rt_host_host_embeddings()
XX_b_22, XX_b_ip_22 = data_2022.get_benign_host_host_embeddings()

# Feature matrices and 0/1 targets per year and per embedding kind.
X_21, y_21 = _stack_with_labels(X_rt_21, X_b_21)
X_22, y_22 = _stack_with_labels(X_rt_22, X_b_22)
XX_21, yy_21 = _stack_with_labels(XX_rt_21, XX_b_21)
XX_22, yy_22 = _stack_with_labels(XX_rt_22, XX_b_22)

In [164]:
# Sanity check: each feature matrix must have as many rows as its target vector.
X_21.shape, y_21.shape, X_22.shape, y_22.shape, XX_21.shape, yy_21.shape, XX_22.shape, yy_22.shape

((576, 512),
 (576,),
 (27344, 512),
 (27344,),
 (17712, 512),
 (17712,),
 (373472, 512),
 (373472,))

In [177]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

def train_model(X, y, model, split_size=0.2, max_samples=100000, random_state=42):
    """Evaluate ``model`` on a stratified hold-out split, then refit it on all rows.

    If ``X`` has more than ``max_samples`` rows, a random subset of
    ``max_samples`` rows (drawn without replacement) is used for both the
    evaluation and the final fit. The subset is drawn from a generator
    seeded with ``random_state`` so that repeated runs report the same
    metrics and return the same training set; the same seed is used for the
    train/test split.

    The ROC AUC and a classification report (probabilities thresholded at
    0.5) for the hold-out split are printed.

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D array of binary targets aligned with ``X``.
        model: Estimator exposing ``fit`` and ``predict_proba``.
        split_size: Fraction of the (sub-sampled) rows held out for testing.
        max_samples: Row cap applied before splitting; ``None`` disables
            sub-sampling.
        random_state: Seed for the sub-sampling and the train/test split.

    Returns:
        ``model``, refit on all (sub-sampled) rows.
    """
    if max_samples is not None and X.shape[0] > max_samples:
        rng = np.random.default_rng(random_state)
        idx = rng.choice(X.shape[0], max_samples, replace=False)
        X = X[idx]
        y = y[idx]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_size, random_state=random_state, stratify=y
    )
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, y_pred))
    print(classification_report(y_test, y_pred >= 0.5))
    # The hold-out rows were only needed for the report above; the returned
    # model is trained on everything that survived the sub-sampling.
    model.fit(X, y)
    return model

import joblib


def _report_on_holdout(classifier, X_holdout, y_holdout):
    """Print ROC AUC and a 0.5-threshold classification report; return the scores."""
    scores = classifier.predict_proba(X_holdout)[:, 1]
    print(roc_auc_score(y_holdout, scores))
    print(classification_report(y_holdout, scores >= 0.5))
    return scores


# Host-level classifier: train on 2022, evaluate on 2021.
# random_state is fixed so that the stored model can be reproduced.
model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
clf = train_model(X_22, y_22, model)

# store classifier
joblib.dump(clf, "rf_host_model_22.pkl")

y_pred = _report_on_holdout(clf, X_21, y_21)


# Host-to-host classifier: same procedure on the host-host embeddings.
model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
clf = train_model(XX_22, yy_22, model)

joblib.dump(clf, "rf_host_host_model_22.pkl")

y_pred = _report_on_holdout(clf, XX_21, yy_21)

0.7950550986319069
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      5241
         1.0       0.98      0.28      0.44       228

    accuracy                           0.97      5469
   macro avg       0.98      0.64      0.71      5469
weighted avg       0.97      0.97      0.96      5469

0.7952184989222026
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       567
         1.0       0.00      0.00      0.00         9

    accuracy                           0.98       576
   macro avg       0.49      0.50      0.50       576
weighted avg       0.97      0.98      0.97       576

0.7811811180034491
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     19430
         1.0       1.00      0.19      0.32       570

    accuracy                           0.98     20000
   macro avg       0.99      0.60      0.65     20000
weighted avg     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
