In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [None]:
# Root folder of the FEEL scenario datasets. The FEEL_DATA_DIR environment
# variable overrides it, so the notebook is not tied to one machine's layout;
# when the variable is unset the original location is used.
data_dir = os.environ.get('FEEL_DATA_DIR', '/opt/Malware-Project/BigDataset/FEELScenarios/')

In [None]:
# Load one benign feature file (Client4, Day1) as a reference frame; later
# cells reuse `df.columns` as the list of feature names.
benign_sample_path = os.path.join(
    data_dir, 'Processed', 'Client4', 'Day1', "comb_features_ben.csv"
)
df = pd.read_csv(benign_sample_path)
df.head()

In [None]:
# Feature names present in the benign reference frame.
df.columns

In [None]:
# Summary statistics (count/mean/std/quantiles) of the benign reference frame.
df.describe()

In [None]:
# Load one malware capture (Botnet-327-2, Day1) to compare against the benign sample.
malware_sample_path = os.path.join(
    data_dir, 'Raw', 'Malware', 'CTU-Malware-Capture-Botnet-327-2', 'Day1', "comb_features.csv"
)
df2 = pd.read_csv(malware_sample_path)
df2.head()

In [None]:
# Distribution of the SNI_equal_DstIP feature in the benign sample.
df["SNI_equal_DstIP"].describe()

In [None]:
# Distribution of the self_signed_ratio feature in the malware sample.
df2["self_signed_ratio"].describe()

In [None]:
# Distribution of the ssl_ratio feature in the malware sample.
df2["ssl_ratio"].describe()

In [None]:
# List benign-sample features whose mean is exactly -1 or exactly 0.
# NOTE(review): presumably these are features that were never populated — confirm.
for column in df.columns:
    column_mean = df[column].describe()["mean"]
    if column_mean in (-1, 0):
        print(column)

In [None]:
# List malware-sample features whose mean is exactly -1.
for column in df2.columns:
    column_stats = df2[column].describe()
    if column_stats["mean"] == -1:
        print(column)

# Anomaly detection tests

## Create benign dataset for each day

In [None]:
# Build one benign DataFrame per day (Day1..Day5) by stacking the feature
# files of the ten clients (Client1..Client10).
# All client frames are read first and concatenated once: growing a frame
# with repeated pd.concat copies the accumulated rows on every step, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
data = dict()
for j in range(1, 6):
    day_key = "Day" + str(j)
    client_frames = [
        pd.read_csv(os.path.join(data_dir, 'Processed', 'Client' + str(i), day_key, "comb_features_ben.csv"))
        for i in range(1, 11)
    ]
    data[day_key] = pd.concat(client_frames, ignore_index=True)
    

In [None]:
# Row count of the concatenated benign frame for each day.
for i in range(1, 6):
    day_frame = data["Day" + str(i)]
    print(f'Size of Day{i} dataframe: {len(day_frame)}')

In [None]:
# For every day, report the features whose mean is exactly -1.
# The frame is looked up with the loop variable `i`; using the leftover `j`
# from the loading cell would inspect the same day on every iteration.
# NOTE(review): presumably -1 marks a feature that was never computed — confirm.
for i in range(1, 6):
    day_frame = data["Day" + str(i)]
    for column in df.columns:
        if day_frame[column].describe()["mean"] == -1:
            print(i, column)

In [None]:
# Remove five SSL/certificate features from each day's benign frame and then
# discard duplicate rows. The frames in `data` are replaced, so running this
# cell a second time raises KeyError because the columns are already gone.
dropped_features = [
    "ssl_ratio",
    "self_signed_ratio",
    "SNI_equal_DstIP",
    "ratio_certificate_path_error",
    "ratio_missing_cert_in_cert_path",
]
for i in range(1, 6):
    day_key = "Day" + str(i)
    data[day_key] = data[day_key].drop(columns=dropped_features).drop_duplicates()

In [None]:
# Row counts per day after dropping columns and duplicate rows.
for i in range(1, 6):
    remaining_rows = len(data["Day" + str(i)])
    print(f'Size of Day{i} dataframe: {remaining_rows}')


## Create malware dataset (Day1)

In [None]:
# Load the Day1 feature file of each malware capture into `mal_data`, keyed by
# capture folder name. Each capture has a single file, so the CSV is read
# directly instead of being concatenated onto an empty DataFrame.
mal_folders = [
    'CTU-Malware-Capture-Botnet-346-1',
    'CTU-Malware-Capture-Botnet-327-2',
    'CTU-Malware-Capture-Botnet-230-1',
    'CTU-Malware-Capture-Botnet-219-2',
]

mal_data = dict()
for folder in mal_folders:
    mal_data[folder] = pd.read_csv(os.path.join(data_dir, 'Raw', 'Malware', folder, 'Day1', "comb_features.csv"))

In [None]:
# Row count of each malware capture as loaded.
for folder in mal_folders:
    capture_rows = len(mal_data[folder])
    print(f'Size of {folder} dataframe: {capture_rows}')

In [None]:
# For every malware capture, report the features (named after the benign
# reference frame's columns) whose mean is exactly -1.
for folder in mal_folders:
    capture = mal_data[folder]
    for column in df.columns:
        column_mean = capture[column].describe()["mean"]
        if column_mean == -1:
            print(folder, column)

In [None]:
# Remove the same five SSL/certificate features from every malware capture and
# then discard duplicate rows. The frames in `mal_data` are replaced, so
# running this cell a second time raises KeyError on the missing columns.
dropped_features = [
    "ssl_ratio",
    "self_signed_ratio",
    "SNI_equal_DstIP",
    "ratio_certificate_path_error",
    "ratio_missing_cert_in_cert_path",
]
for folder in mal_folders:
    trimmed = mal_data[folder].drop(columns=dropped_features)
    mal_data[folder] = trimmed.drop_duplicates()

In [None]:
# Row count of each malware capture after dropping columns and duplicates.
for folder in mal_folders:
    remaining_rows = len(mal_data[folder])
    print(f'Size of {folder} dataframe: {remaining_rows}')

## Anomaly detection

### Isolation Forests

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing

In [None]:
# Scale Day1 benign traffic to [0, 1] and fit the Isolation Forest on it.
# (preprocessing.StandardScaler is the alternative scaler that was tried.)
scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(data["Day1"])

isolation_params = dict(n_estimators=50, contamination=0.01, random_state=1337)
iso = IsolationForest(**isolation_params).fit(X)

In [None]:
# Share of Day2 benign samples the Isolation Forest labels as outliers (-1).
X_test = scaler.transform(data["Day2"])
benign_labels = iso.predict(X_test)
flagged = (benign_labels == -1).sum()
print(f'False positives: {100*flagged / len(X_test):.2f}%')

In [None]:
# Share of each malware capture the Isolation Forest labels as outliers (-1).
for folder in mal_folders:
    X_test_mal = scaler.transform(mal_data[folder])
    flagged = (iso.predict(X_test_mal) == -1).sum()
    print(f'Detection {folder}: {100*flagged / len(X_test_mal):.2f}%')

### Local Outlier Factor (LOF)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# novelty=True is required so that predict() can be called on unseen data.
lof = LocalOutlierFactor(
    n_neighbors=5,
    contamination=0.02,
    novelty=True,
)

In [None]:
# Scale Day1 benign traffic to [0, 1] and fit the LOF novelty detector on it.
# (preprocessing.StandardScaler is the alternative scaler that was tried.)
scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(data["Day1"])
lof.fit(X)

In [None]:
# Share of Day2 benign samples that LOF labels as outliers (-1).
X_test = scaler.transform(data["Day2"])
benign_labels = lof.predict(X_test)
flagged = (benign_labels == -1).sum()
print(f'False positives: {100*flagged / len(X_test):.2f}%')

In [None]:
# Share of each malware capture that LOF labels as outliers (-1).
# predict() is called once per capture and the count reused for both the
# percentage and the absolute number, instead of predicting twice.
for folder in mal_folders:
    X_test_mal = scaler.transform(mal_data[folder])
    n_flagged = int(np.sum(lof.predict(X_test_mal) == -1))
    n_samples = len(X_test_mal)
    print(f'Detection {folder}: {100*n_flagged / n_samples:.2f}% ({n_flagged} out of {n_samples})')

### One-class SVM

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM

In [None]:
# Scale Day1 benign traffic to [0, 1] and fit a linear one-class SVM trained
# with SGD. random_state is fixed (same seed as the Isolation Forest) so the
# fitted model, and the rates reported below, are reproducible across runs.
# It does better with nu=0.02 but higher FP
scaler = preprocessing.MinMaxScaler().fit(data["Day1"])
X = scaler.transform(data["Day1"])
# svm_ = OneClassSVM(nu=0.02, kernel="linear", gamma=0.1).fit(X)
svm_ = SGDOneClassSVM(nu=0.01, random_state=1337).fit(X)

In [None]:
# Share of Day2 benign samples the one-class SVM labels as outliers (-1).
# The prediction is computed once and reused for both printed figures.
X_test = scaler.transform(data["Day2"])
n_flagged = int(np.sum(svm_.predict(X_test) == -1))
print(f'False positives: {100*n_flagged / len(X_test):.2f}% ({n_flagged} out of {len(X_test)})')

In [None]:
# Share of each malware capture the one-class SVM labels as outliers (-1).
# The prediction is computed once per capture and reused for both figures.
for folder in mal_folders:
    X_test_mal = scaler.transform(mal_data[folder])
    n_flagged = int(np.sum(svm_.predict(X_test_mal) == -1))
    n_samples = len(X_test_mal)
    print(f'{folder} detected: {100*n_flagged / n_samples:.2f}% ({n_flagged} out of {n_samples})')

### Autoencoders (NN)

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [None]:
# Symmetric dense autoencoder (n -> 32 -> 20 -> 10 -> 20 -> 32 -> n) with
# dropout after every hidden layer, trained to reconstruct its input (MSE).
# The input/output width is taken from the prepared benign frame instead of
# being hard-coded, and the input shape is passed as a real tuple:
# `(36)` is just the integer 36, which newer Keras versions reject.
# NOTE(review): no TensorFlow seed is set in this notebook section, so weights
# and dropout masks differ between runs.
n_features = data["Day4"].shape[1]

model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(32, activation='elu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(20, activation='elu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(10, activation='elu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(20, activation='elu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(32, activation='elu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(n_features, activation='elu')
    ]
)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mse")

In [None]:
# Training configuration for the autoencoder.
EPOCHS = 8
BATCH_SIZE = 64

# Fit the scaler on Day4 benign traffic; Day5 is the "next day" test set.
# (Day1/Day2 was the earlier train/test pairing.)
scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(data["Day4"])
X_test = scaler.transform(data["Day5"])

# Hold out 20% of the training day for validation.
X_train, X_val = train_test_split(X, test_size=0.2, random_state=8181)

# The network reconstructs its own input, so features double as targets.
# No callbacks are passed.
fit_options = dict(
    shuffle=True,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, X_val),
)
history = model.fit(X_train, X_train, **fit_options)


In [None]:
# Training vs. validation reconstruction loss per epoch.
loss_curves = (("loss", "train loss"), ("val_loss", "val loss"))
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

In [None]:
# Per-sample reconstruction error (mean squared error over features) for the
# benign test day ...
rec_ben = model.predict(X_test)
mse_ben = ((X_test - rec_ben) ** 2).mean(axis=1)

# ... and for every malware capture, scaled with the scaler fitted on the
# benign training day. num_malware accumulates the total malware sample count.
rec_mal, mse_mal = dict(), dict()
num_malware = 0
for folder in mal_folders:
    X_test_mal = scaler.transform(mal_data[folder])
    reconstruction = model.predict(X_test_mal)
    rec_mal[folder] = reconstruction
    mse_mal[folder] = ((X_test_mal - reconstruction) ** 2).mean(axis=1)
    num_malware += len(X_test_mal)

num_malware

In [None]:
# Histogram of reconstruction errors for a single malware capture.
# The benign ("clean") histogram and the other three captures
# (346-1, 327-2, 230-1) are currently not drawn.
plotted_capture = "CTU-Malware-Capture-Botnet-219-2"

fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(mse_mal[plotted_capture], bins=10, density=False, label=plotted_capture, alpha=.6)
ax.set_title("Distribution of the Reconstruction Loss")
ax.legend()

In [None]:
# Choose the reconstruction-error threshold on the held-out validation split:
# the smallest multiple of TH_STEP for which at most TARGET_FPR (1%) of the
# benign validation samples exceed it. The threshold is therefore only
# resolved to TH_STEP granularity.
TARGET_FPR = 0.01
TH_STEP = 0.001

rec_val = model.predict(X_val)
mse_val = np.mean(np.power(X_val - rec_val, 2), axis=1)

# Maximum number of validation samples allowed above the threshold.
num = TARGET_FPR * len(X_val)

th = TH_STEP
while np.count_nonzero(mse_val > th) > num:
    th += TH_STEP
print(f"Calculated threshold: {th:.5f}")

# Measure the false-positive rate on the benign test day with that threshold.
rec_ben = model.predict(X_test)
mse_ben = np.mean(np.power(X_test - rec_ben, 2), axis=1)
anomalies_ben = int(np.count_nonzero(mse_ben > th))
num_examples_test = X_test.shape[0]
print(f'False positives on next day: { 100*anomalies_ben / num_examples_test:.2f}% ({anomalies_ben} out of {num_examples_test})')

In [None]:
# Count, per malware capture, the samples whose reconstruction error exceeds
# the threshold; anomalies_mal accumulates the total over all captures.
anomalies_mal = 0
for folder in mal_folders:
    capture_errors = mse_mal[folder]
    detected = (capture_errors > th).sum()
    anomalies_mal += detected
    print(f'{folder} detected: {100*detected / len(capture_errors):.2f}% ({detected} out of {len(capture_errors)})')

In [None]:
# Confusion-matrix counts: benign test-day samples are the negatives,
# malware samples are the positives.
tp, fp = anomalies_mal, anomalies_ben
fn = num_malware - tp
tn = num_examples_test - fp

total_samples = num_examples_test + num_malware
accuracy = (tp + tn) / total_samples
tpr = tp / num_malware
fpr = fp / num_examples_test

# Metrics on the test set for both malware and benign data
print(f"Centralized accuracy: {100*accuracy:.2f}%")
print(f"Centralized tpr: {100*tpr:.2f}%")
print(f"Centralized fpr: {100*fpr:.2f}%")