In [None]:
# Mount Google Drive into the Colab runtime so that files under
# /content/drive (including the dataset CSV read below) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix, classification_report, confusion_matrix, accuracy_score,average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.under_sampling import (
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours
)

# Load Data

In [None]:
# Dataset used for this experiment. An alternative file in the same folder is
# DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv.
DATA_CSV = '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv'

# Rows whose 'Category' is one of these values are dropped before modelling.
EXCLUDED_CATEGORIES = [
    'Unknown',
    'Trojan Monitoring Software',
    'Backdoor',
    'Potentially Unwanted Software',
]

df = pd.read_csv(DATA_CSV)
df = df[~df['Category'].isin(EXCLUDED_CATEGORIES)]
df.head()

Unnamed: 0,file_name,mov,add,push,pop,inc,call,xor,dec,cmp,...,insb,jae,outsb,popa,jo,ja,name,Category,Category Target,Family Target
0,VirusShare_703bb560ffbbdef898582d51935eed03,0.033368,0.392889,0.045533,0.022447,0.029113,0.007043,0.01288,0.010922,0.020978,...,0.021354,0.00177,0.002335,0.001732,0.006026,0.001017,Beebone,Trojan Downloader,5,8
1,VirusShare_ead649131199ce961265ff8f81bf4e56,0.067794,0.29211,0.059742,0.042351,0.034622,0.010789,0.034783,0.012238,0.006119,...,0.001449,0.000161,0.060386,0.002093,0.000805,0.000644,Obfuscator,Tool,4,4
2,VirusShare_ab921e0669670594c699391875c8f72f,0.045229,0.130716,0.158549,0.046223,0.01839,0.052187,0.041252,0.021372,0.004473,...,0.015905,0.013917,0.011928,0.007952,0.003976,0.000497,OnLineGames,Password Stealer,3,11
3,VirusShare_5d8a0e4f29a9bc1677bbad18156bf3cb,0.117216,0.032492,0.048115,0.040838,0.028375,0.005653,0.032425,0.030511,0.022121,...,0.003338,0.004006,0.003628,0.003405,0.004028,0.004206,Allaple.A,Worm,2,13
4,VirusShare_8d21dc13e0302c61eaa40e3b7d3755f0,0.117504,0.073477,0.194514,0.048658,0.026541,0.070122,0.031469,0.01615,0.033933,...,0.005314,0.004542,0.005938,0.005433,0.003711,0.002672,Enterak.A,Password Stealer,3,10


In [None]:
# Number of samples per integer category label (absolute counts; dividing by
# len(df) would give proportions instead).
df['Category Target'].value_counts()#/len(df)

Category Target
1    8590
2    5497
3    5252
4    4948
5     256
6     227
8      25
7      25
Name: count, dtype: int64

# Features/Targets

In [None]:
# Identifier and label columns that must not be fed to the models.
NON_FEATURE_COLUMNS = ["file_name", "name", "Category", "Category Target", "Family Target"]

# Model inputs: every remaining column, cast to float.
features = df.drop(columns=NON_FEATURE_COLUMNS).astype(float)

# Model target: the category label, cast to int.
targets = df["Category Target"].astype(int)

In [None]:
# Sanity check: features and targets should have the same number of rows.
print(features.shape)
targets.shape

(24820, 40)


(24820,)

# Split Train/Test Data

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# target and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=0, stratify=targets
)

(X_train.shape, X_test.shape)

((19856, 40), (4964, 40))

# Metrics Functions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred):
    """Print a per-class and overall evaluation of hard label predictions.

    Prints, in order: the scikit-learn classification report, a one-vs-rest
    AUC-ROC and PR AUC (average precision) for every class that occurs in
    ``y_true``, and the overall accuracy.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels (hard predictions, not scores).

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    The AUC values are computed from binarised hard predictions
    (``y_pred == label``), not from probabilities or decision scores, so they
    summarise a single operating point per class rather than a full curve.
    """
    # Work on arrays so that ``== label`` is element-wise for any array-like
    # input (a plain list would otherwise compare to a single bool).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Take the classes from the labels being scored instead of from the global
    # ``df``: this removes the hidden dependency on notebook state and the
    # assumption that labels are exactly 1..N. Only classes present in
    # ``y_true`` are scored, because the one-vs-rest AUC needs positives.
    for label in np.unique(y_true):
        is_true = y_true == label
        is_pred = y_pred == label
        print(f"AUC-ROC for class {label}: {roc_auc_score(is_true, is_pred)}")
        print(f"PR AUC for class {label}: {average_precision_score(is_true, is_pred)}")

    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [None]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Build a one-row table of macro-averaged metrics for one model.

    Note the argument order: the predictions come *before* the true labels.

    Parameters
    ----------
    description : str
        Text stored in the ``Model`` column to identify the row.
    y_pred : array-like
        Predicted class labels.
    y_test : array-like
        Ground-truth class labels.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``Model``, ``Precision``,
        ``Recall``, ``F1-Score`` (all macro averages) and ``Accuracy``.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    macro = report['macro avg']

    summary = {
        'Model': description,
        'Precision': macro['precision'],
        'Recall': macro['recall'],
        'F1-Score': macro['f1-score'],
        'Accuracy': accuracy_score(y_test, y_pred),
    }
    return pd.DataFrame(summary, index=[0])
# Accumulates the one-row summary DataFrame of each evaluated model; the rows
# are concatenated into one comparison table at the end of the notebook.
# NOTE: re-running this cell empties the list.
final_result = []

In [None]:
def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Return one 2x2 one-vs-rest confusion table per class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : iterable, optional
        Classes to report, in the desired order. When omitted, the sorted set
        of labels that occur in ``y_true`` or ``y_pred`` is used. Pass it
        explicitly to also report classes that occur in neither.

    Returns
    -------
    dict
        Maps each label to a ``pandas.DataFrame`` with rows
        ``Actual -`` / ``Actual +`` and columns ``Predicted -`` /
        ``Predicted +``.
    """
    if labels is None:
        # Derive the classes from the data being scored rather than from the
        # global ``df``; this removes the hidden dependency on notebook state
        # and the assumption that labels are exactly 1..N.
        observed = np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
        labels = np.unique(observed).tolist()
    else:
        labels = list(labels)

    cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)

    # Each cm is [[TN, FP], [FN, TP]] for "this class" vs. "any other class".
    return {
        label: pd.DataFrame(
            cm,
            index=['Actual -', 'Actual +'],
            columns=['Predicted -', 'Predicted +'],
        )
        for label, cm in zip(labels, cms)
    }

# Balance Data

## 1. ENN

In [None]:
# Edited Nearest Neighbours with 1 neighbour, fitted on the training split
# only; X_test / y_test are never resampled.
enn1 = EditedNearestNeighbours(
    sampling_strategy='majority',
    n_neighbors=1,
    kind_sel='all',
)
X_train_enn1, y_train_enn1 = enn1.fit_resample(X_train, y_train)

In [None]:
# Same Edited Nearest Neighbours setup as above but with 4 neighbours; again
# fitted on the training split only.
enn4 = EditedNearestNeighbours(
    sampling_strategy='majority',
    n_neighbors=4,
    kind_sel='all',
)
X_train_enn4, y_train_enn4 = enn4.fit_resample(X_train, y_train)

## 2. OSS

In [None]:
# One-Sided Selection with 3 neighbours and a fixed seed, fitted on the
# training split only.
oss3 = OneSidedSelection(
    sampling_strategy='majority',
    n_neighbors=3,
    random_state=0,
)
X_train_oss3, y_train_oss3 = oss3.fit_resample(X_train, y_train)

# Models

## KNN model

In [None]:
# k-nearest-neighbours classifier (k=5) trained on the ENN (n_neighbors=1)
# resampled training set.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_enn1, y_train_enn1)

In [None]:
# Predict the labels for the test set (the test split was not resampled)
y_pred_knn = knn.predict(X_test)
# One-row macro-average summary for the final comparison table.
# calculate_MacroAvg_metrics takes (description, y_pred, y_test) in that order.
# NOTE: re-running this cell appends a duplicate row to final_result.
metrics_knn = calculate_MacroAvg_metrics("KNN (ENN 'majority' n =1)",y_pred_knn, y_test)
final_result.append(metrics_knn)

# Full per-class report printed under a banner naming the configuration.
print("-----------------------------KNN (ENN 'majority' n =1)-----------------------------\n")
calculate_metrics(y_test, y_pred_knn)

-----------------------------KNN (ENN 'majority' n =1)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.92      0.90      1718
           2       0.97      0.99      0.98      1100
           3       0.87      0.88      0.88      1050
           4       0.83      0.76      0.80       990
           5       0.84      0.73      0.78        51
           6       1.00      1.00      1.00        45
           7       0.83      1.00      0.91         5
           8       1.00      0.20      0.33         5

    accuracy                           0.89      4964
   macro avg       0.90      0.81      0.82      4964
weighted avg       0.89      0.89      0.89      4964

AUC-ROC for class 1: 0.925606298286348
PR AUC for class 1: 0.8374247180339442
AUC-ROC for class 2: 0.9903416149068324
PR AUC for class 2: 0.9605359548751007
AUC-ROC for class 3: 0.9221850743363262
PR AUC for class 3: 0.7939107132026473
AUC-R

### Confusion_Matrix_KNN

In [None]:
# Print the 2x2 one-vs-rest confusion table of every class for the KNN
# predictions, each followed by a blank line.
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label, frame in confusion_knn.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3037          209
Actual +          145         1573

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3828           36
Actual +           11         1089

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3782          132
Actual +          128          922

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3823          151
Actual +          234          756

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4906            7
Actual +           14           37

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4958            1
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## RF model

In [None]:
# Train a random forest (default hyper-parameters, fixed seed) on the
# ENN (n_neighbors=1) resampled training set.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_enn1, y_train_enn1)

In [None]:
# Predict the labels for the test set (the test split was not resampled)
y_pred_rf = rf.predict(X_test)
# One-row macro-average summary for the final comparison table.
# calculate_MacroAvg_metrics takes (description, y_pred, y_test) in that order.
# NOTE: re-running this cell appends a duplicate row to final_result.
metrics_rf = calculate_MacroAvg_metrics("Random Forest (ENN 'majority' n =1)",y_pred_rf, y_test)
final_result.append(metrics_rf)
# Full per-class report printed under a banner naming the configuration.
print("-----------------------------Random Forest (ENN 'majority' n =1)-----------------------------\n")
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (ENN 'majority' n =1)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.90      0.92      1718
           2       0.99      0.99      0.99      1100
           3       0.93      0.91      0.92      1050
           4       0.79      0.89      0.84       990
           5       1.00      0.73      0.84        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       0.00      0.00      0.00         5

    accuracy                           0.92      4964
   macro avg       0.83      0.80      0.81      4964
weighted avg       0.92      0.92      0.92      4964

AUC-ROC for class 1: 0.9350343971303088
PR AUC for class 1: 0.8827144523927718
AUC-ROC for class 2: 0.9941605495953323
PR AUC for class 2: 0.9839153308781908
AUC-ROC for class 3: 0.9432761515439083
PR AUC for class 3: 0.85945069496

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_RF

In [None]:
# Print the 2x2 one-vs-rest confusion table of every class for the random
# forest predictions, each followed by a blank line.
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label, frame in confusion_rf.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3153           93
Actual +          174         1544

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3854           10
Actual +           10         1090

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3839           75
Actual +           99          951

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3745          229
Actual +          105          885

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4913            0
Actual +           14           37

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## SVM model

In [None]:
# Train an RBF-kernel SVM (no class weighting, gamma='scale', fixed seed) on
# the OSS (n_neighbors=3) resampled training set.
svm = SVC(kernel='rbf', random_state=0,class_weight= None, gamma= 'scale')
svm.fit(X_train_oss3, y_train_oss3)

In [None]:
# Predict the labels for the test set (the test split was not resampled)
y_pred_svm = svm.predict(X_test)
# One-row macro-average summary for the final comparison table.
# calculate_MacroAvg_metrics takes (description, y_pred, y_test) in that order.
# NOTE: re-running this cell appends a duplicate row to final_result.
metrics_svm = calculate_MacroAvg_metrics("SVM (OSS 'majority' n =3)",y_pred_svm, y_test)
final_result.append(metrics_svm)
# Full per-class report printed under a banner naming the configuration.
print("-----------------------------SVM (OSS 'majority' n =3)-----------------------------\n")
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (OSS 'majority' n =3)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.80      0.83      0.81      1718
           2       0.93      0.98      0.95      1100
           3       0.77      0.70      0.73      1050
           4       0.64      0.65      0.65       990
           5       0.57      0.16      0.25        51
           6       1.00      1.00      1.00        45
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         5

    accuracy                           0.79      4964
   macro avg       0.59      0.54      0.55      4964
weighted avg       0.79      0.79      0.79      4964

AUC-ROC for class 1: 0.8572363083928136
PR AUC for class 1: 0.7186952544002029
AUC-ROC for class 2: 0.9773122529644269
PR AUC for class 2: 0.9107770426581514
AUC-ROC for class 3: 0.8223719492907026
PR AUC for class 3: 0.6029007868407867
AUC-

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_SVM

In [None]:
# Print the 2x2 one-vs-rest confusion table of every class for the SVM
# predictions, each followed by a blank line.
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label, frame in confusion_svm.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         2886          360
Actual +          300         1418

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3780           84
Actual +           26         1074

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3694          220
Actual +          314          736

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3608          366
Actual +          343          647

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4907            6
Actual +           43            8

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            5            0

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## MLP Model

In [None]:
# Train a multi-layer perceptron with three hidden layers of 200 units on the
# ENN (n_neighbors=4) resampled training set. Early stopping is enabled with
# n_iter_no_change=5, and the seed is fixed.
mlp = MLPClassifier(hidden_layer_sizes=(200,200, 200),early_stopping=True,random_state=0, n_iter_no_change= 5)
mlp.fit(X_train_enn4, y_train_enn4)

In [None]:
# Predict the labels for the test set (the test split was not resampled)
y_pred_mlp = mlp.predict(X_test)
# One-row macro-average summary for the final comparison table.
# calculate_MacroAvg_metrics takes (description, y_pred, y_test) in that order.
# NOTE: re-running this cell appends a duplicate row to final_result.
metrics_mlp = calculate_MacroAvg_metrics("MLP (ENN 'majority' n =4)",y_pred_mlp, y_test)
final_result.append(metrics_mlp)
# Full per-class report printed under a banner naming the configuration.
print("-----------------------------MLP (ENN 'majority' n =4)-----------------------------\n")
calculate_metrics(y_test, y_pred_mlp)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------MLP (ENN 'majority' n =4)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.80      0.85      1718
           2       0.99      0.99      0.99      1100
           3       0.81      0.83      0.82      1050
           4       0.67      0.82      0.74       990
           5       0.74      0.69      0.71        51
           6       1.00      1.00      1.00        45
           7       1.00      0.40      0.57         5
           8       0.00      0.00      0.00         5

    accuracy                           0.85      4964
   macro avg       0.77      0.69      0.71      4964
weighted avg       0.86      0.85      0.85      4964

AUC-ROC for class 1: 0.8803551178238892
PR AUC for class 1: 0.802600417046913
AUC-ROC for class 2: 0.9925381140598533
PR AUC for class 2: 0.9800106995864475
AUC-ROC for class 3: 0.8876910723410467
PR AUC for class 3: 0.7096984063993298
AUC-R

### Confusion_Matrix_MLP

In [None]:
# Print the 2x2 one-vs-rest confusion table of every class for the MLP
# predictions, each followed by a blank line.
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label, frame in confusion_mlp.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3123          123
Actual +          346         1372

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3852           12
Actual +           13         1087

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3717          197
Actual +          183          867

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3576          398
Actual +          176          814

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4901           12
Actual +           16           35

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            3            2

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

# Macro Avg Results

In [None]:
# Stack the per-model one-row summaries into a single comparison table with a
# fresh 0..n-1 index (row-wise concatenation is pd.concat's default axis).
df_concat = pd.concat(final_result, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (ENN 'majority' n =1),0.904155,0.809103,0.821572,0.892023
1,Random Forest (ENN 'majority' n =1),0.831929,0.801847,0.813709,0.91801
2,SVM (OSS 'majority' n =3),0.588123,0.539137,0.54856,0.791297
3,MLP (ENN 'majority' n =4),0.767244,0.690124,0.710994,0.850524
