In [None]:
# Mount Google Drive inside the Colab runtime so the dataset CSVs stored
# under /content/drive can be read by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix, classification_report, confusion_matrix, accuracy_score,average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import (
    SMOTE,
    BorderlineSMOTE,
    ADASYN
)
from imblearn.under_sampling import (
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours
)

# Load Data

In [None]:
# Alternative dataset (not used in this run):
# '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv'

# Category labels that are removed from the frame before modelling.
excluded_categories = [
    'Unknown',
    'Trojan Monitoring Software',
    'Backdoor',
    'Potentially Unwanted Software',
]

df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv')
# Keep only the rows whose 'Category' is not one of the excluded labels.
df = df[~df['Category'].isin(excluded_categories)]
df.head()

Unnamed: 0,file_name,mov,add,push,pop,inc,call,xor,dec,cmp,...,insb,jae,outsb,popa,jo,ja,name,Category,Category Target,Family Target
0,VirusShare_703bb560ffbbdef898582d51935eed03,0.033368,0.392889,0.045533,0.022447,0.029113,0.007043,0.01288,0.010922,0.020978,...,0.021354,0.00177,0.002335,0.001732,0.006026,0.001017,Beebone,Trojan Downloader,5,8
1,VirusShare_ead649131199ce961265ff8f81bf4e56,0.067794,0.29211,0.059742,0.042351,0.034622,0.010789,0.034783,0.012238,0.006119,...,0.001449,0.000161,0.060386,0.002093,0.000805,0.000644,Obfuscator,Tool,4,4
2,VirusShare_ab921e0669670594c699391875c8f72f,0.045229,0.130716,0.158549,0.046223,0.01839,0.052187,0.041252,0.021372,0.004473,...,0.015905,0.013917,0.011928,0.007952,0.003976,0.000497,OnLineGames,Password Stealer,3,11
3,VirusShare_5d8a0e4f29a9bc1677bbad18156bf3cb,0.117216,0.032492,0.048115,0.040838,0.028375,0.005653,0.032425,0.030511,0.022121,...,0.003338,0.004006,0.003628,0.003405,0.004028,0.004206,Allaple.A,Worm,2,13
4,VirusShare_8d21dc13e0302c61eaa40e3b7d3755f0,0.117504,0.073477,0.194514,0.048658,0.026541,0.070122,0.031469,0.01615,0.033933,...,0.005314,0.004542,0.005938,0.005433,0.003711,0.002672,Enterak.A,Password Stealer,3,10


In [None]:
# Number of samples per numeric category label (class distribution).
# Divide the result by len(df) to see proportions instead of raw counts.
df['Category Target'].value_counts()

Category Target
1    8590
2    5497
3    5252
4    4948
5     256
6     227
8      25
7      25
Name: count, dtype: int64

# Features/Targets

In [None]:
# Identifier and label columns that are excluded from the model inputs.
non_feature_columns = [
    "file_name",
    "name",
    "Category",
    "Category Target",
    "Family Target",
]

# Feature matrix: every remaining column, cast to float.
features = df.drop(columns=non_feature_columns).astype(float)

# Target vector: the integer category code.
targets = df["Category Target"].astype(int)

In [None]:
# Show both shapes to confirm features and targets have the same row count.
print(features.shape)
targets.shape

(24820, 40)


(24820,)

# Split Train/Test Data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the targets keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets,
    test_size=0.2,
    stratify=targets,
    random_state=0,
)

X_train.shape, X_test.shape

((19856, 40), (4964, 40))

# Metrics Functions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred, labels=None):
    """Print a classification report, per-class one-vs-rest scores and accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels (hard predictions, not scores).
    labels : iterable, optional
        Class labels to report one-vs-rest scores for. Defaults to the sorted
        labels present in ``y_true``, so the function no longer depends on a
        global DataFrame or on the labels being exactly ``1..n``.

    Notes
    -----
    The "AUC-ROC" and "PR AUC" values are computed from boolean hard
    predictions (``y_pred == label``), not from probabilities, so each one
    reflects a single operating point. For binary predictions the ROC value
    equals the mean of the true-positive and true-negative rates.
    """
    # Arrays make ``== label`` element-wise even when plain lists are passed.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.unique(y_true).tolist()

    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # One-vs-rest scores for each class.
    for i in labels:
        is_true = y_true == i
        is_pred = y_pred == i
        if is_true.all() or not is_true.any():
            # roc_auc_score raises when the binary target has a single class.
            print(f"AUC-ROC for class {i}: undefined (only one class present in y_true)")
        else:
            print(f"AUC-ROC for class {i}: {roc_auc_score(is_true, is_pred)}")
        print(f"PR AUC for class {i}: {average_precision_score(is_true, is_pred)}")

    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [None]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Summarise one model's macro-averaged scores as a single-row DataFrame.

    Note the argument order: predictions come before the true labels.

    Parameters
    ----------
    description : str
        Text stored in the 'Model' column.
    y_pred : array-like
        Predicted class labels.
    y_test : array-like
        Ground-truth class labels.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns 'Model', 'Precision', 'Recall',
        'F1-Score' (all macro averages) and 'Accuracy'.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_avg = report['macro avg']

    summary = {
        'Model': description,
        'Precision': macro_avg['precision'],
        'Recall': macro_avg['recall'],
        'F1-Score': macro_avg['f1-score'],
        'Accuracy': accuracy_score(y_test, y_pred),
    }
    return pd.DataFrame(summary, index=[0])
# Collects one single-row summary DataFrame per evaluated model.
final_result = list()

In [None]:
def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Build a one-vs-rest 2x2 confusion table for each class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : iterable, optional
        Class labels to build tables for, in output order. Defaults to the
        sorted labels observed in ``y_true`` or ``y_pred``, so the function no
        longer depends on a global DataFrame or on labels being exactly
        ``1..n``. Pass ``labels`` explicitly to include a class that appears
        in neither input.

    Returns
    -------
    dict
        Maps each label to a DataFrame with rows 'Actual -' / 'Actual +' and
        columns 'Predicted -' / 'Predicted +'.
    """
    if labels is None:
        observed = np.concatenate([np.ravel(y_true), np.ravel(y_pred)])
        labels = np.unique(observed).tolist()
    else:
        # Materialise once: the labels are used for the call and for the zip.
        labels = list(labels)

    cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)
    confusion_matrices = {}
    for label, cm in zip(labels, cms):
        # Each per-class matrix is laid out as [[tn, fp], [fn, tp]].
        (tn, fp), (fn, tp) = cm
        confusion_matrices[label] = pd.DataFrame(
            {'Predicted -': [tn, fn], 'Predicted +': [fp, tp]},
            index=['Actual -', 'Actual +'],
        )
    return confusion_matrices

# Balance Data

## 1. SMOTE + ENN

In [None]:
# Step 1: oversample the training split with SMOTE (8 nearest neighbours).
sm_8 = SMOTE(
    sampling_strategy='auto',
    k_neighbors=8,
    random_state=0,
)
X_resampled, y_resampled = sm_8.fit_resample(X_train, y_train)

# Step 2: clean the oversampled set with Edited Nearest Neighbours
# (4 neighbours, kind_sel='all'). The result is the training set used for
# the Random Forest model.
enn4 = EditedNearestNeighbours(
    kind_sel='all',
    n_neighbors=4,
    sampling_strategy='auto',
)
X_train_rf, y_train_rf = enn4.fit_resample(X_resampled, y_resampled)

## 2. BSMOTE + TL

In [None]:
# Step 1: oversample the training split with Borderline-SMOTE
# (k_neighbors=3, m_neighbors=18).
bsm_3_18 = BorderlineSMOTE(
    k_neighbors=3,
    m_neighbors=18,
    sampling_strategy='auto',
    random_state=0,
)
X_resampled, y_resampled = bsm_3_18.fit_resample(X_train, y_train)

# Step 2: remove Tomek links using the 'auto' strategy. The result is the
# training set used for the KNN model.
tl_auto = TomekLinks(sampling_strategy="auto")
X_train_knn, y_train_knn = tl_auto.fit_resample(X_resampled, y_resampled)

In [None]:
# Step 1: oversample the training split with Borderline-SMOTE
# (k_neighbors=7, m_neighbors=18).
bsm_7_18 = BorderlineSMOTE(
    k_neighbors=7,
    m_neighbors=18,
    sampling_strategy='auto',
    random_state=0,
)
X_resampled, y_resampled = bsm_7_18.fit_resample(X_train, y_train)

# Step 2: remove Tomek links using the 'majority' strategy. The result is the
# training set used for the SVM model.
# NOTE(review): after oversampling, the classes may be (near) equal in size,
# so confirm which class 'majority' actually selects here.
tl_maj = TomekLinks(sampling_strategy="majority")
X_train_svm, y_train_svm = tl_maj.fit_resample(X_resampled, y_resampled)

## 3. SMOTE + TL

In [None]:
# Step 1: oversample the training split with SMOTE (13 nearest neighbours).
sm_13 = SMOTE(
    sampling_strategy='auto',
    k_neighbors=13,
    random_state=0,
)
X_resampled, y_resampled = sm_13.fit_resample(X_train, y_train)

# Step 2: remove Tomek links using the 'majority' strategy. The result is the
# training set used for the MLP model.
# NOTE(review): after oversampling, the classes may be (near) equal in size,
# so confirm which class 'majority' actually selects here.
tl_maj = TomekLinks(sampling_strategy="majority")
X_train_mlp, y_train_mlp = tl_maj.fit_resample(X_resampled, y_resampled)

# Models

## KNN model

In [None]:
# 5-nearest-neighbour classifier trained on the BSMOTE + Tomek-links set.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_knn, y=y_train_knn)

In [None]:
# Score KNN on the untouched test split, record its macro-averaged summary
# row, then print the detailed report.
knn_label = "KNN (BSMOTE + TL 'auto')"
y_pred_knn = knn.predict(X_test)

metrics_knn = calculate_MacroAvg_metrics(knn_label, y_pred_knn, y_test)
final_result.append(metrics_knn)

print(f"-----------------------------{knn_label}-----------------------------\n")
calculate_metrics(y_test, y_pred_knn)

-----------------------------KNN (BSMOTE + TL 'auto')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.93      0.88      0.90      1718
           2       0.98      0.99      0.98      1100
           3       0.86      0.91      0.88      1050
           4       0.82      0.81      0.81       990
           5       0.64      0.84      0.73        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       0.67      0.40      0.50         5

    accuracy                           0.90      4964
   macro avg       0.86      0.85      0.85      4964
weighted avg       0.90      0.90      0.90      4964

AUC-ROC for class 1: 0.9205521688016485
PR AUC for class 1: 0.8558690571165045
AUC-ROC for class 2: 0.9922195558065122
PR AUC for class 2: 0.9707059027197207
AUC-ROC for class 3: 0.933846266150814
PR AUC for class 3: 0.7974156724001422
AUC-RO

### Confusion_Matrix_KNN

In [None]:
# One-vs-rest confusion table per class for the KNN predictions.
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label, frame in confusion_knn.items():
    # Heading, table and a trailing blank line in a single print call.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3127          119
Actual +          210         1508

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3839           25
Actual +           10         1090

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3754          160
Actual +           96          954

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3792          182
Actual +          184          806

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4889           24
Actual +            8           43

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4958          

## RF model

In [None]:
# Random Forest with a fixed seed, trained on the SMOTE + ENN set.
rf = RandomForestClassifier(random_state=0)
rf.fit(X=X_train_rf, y=y_train_rf)

In [None]:
# Score the Random Forest on the untouched test split, record its
# macro-averaged summary row, then print the detailed report.
rf_label = "Random Forest (SMOTE + ENN 'auto')"
y_pred_rf = rf.predict(X_test)

metrics_rf = calculate_MacroAvg_metrics(rf_label, y_pred_rf, y_test)
final_result.append(metrics_rf)

print(f"-----------------------------{rf_label}-----------------------------\n")
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (SMOTE + ENN 'auto')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.85      0.96      0.90      1718
           2       0.98      0.99      0.99      1100
           3       0.94      0.85      0.89      1050
           4       0.87      0.78      0.82       990
           5       0.89      0.82      0.86        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       1.00      0.60      0.75         5

    accuracy                           0.90      4964
   macro avg       0.94      0.87      0.90      4964
weighted avg       0.91      0.90      0.90      4964

AUC-ROC for class 1: 0.9332279291356712
PR AUC for class 1: 0.8280835379033807
AUC-ROC for class 2: 0.9918911161302466
PR AUC for class 2: 0.9755841060908634
AUC-ROC for class 3: 0.9162143222133001
PR AUC for class 3: 0.831565062982

### Confusion_Matrix_RF

In [None]:
# One-vs-rest confusion table per class for the Random Forest predictions.
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label, frame in confusion_rf.items():
    # Heading, table and a trailing blank line in a single print call.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         2958          288
Actual +           77         1641

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3847           17
Actual +           13         1087

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3862           52
Actual +          162          888

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3857          117
Actual +          216          774

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4908            5
Actual +            9           42

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## SVM model

In [None]:
# RBF-kernel SVM (gamma='scale', no class weighting), trained on the
# BSMOTE + Tomek-links ('majority') set.
svm = SVC(
    kernel='rbf',
    gamma='scale',
    class_weight=None,
    random_state=0,
)
svm.fit(X=X_train_svm, y=y_train_svm)

In [None]:
# Score the SVM on the untouched test split, record its macro-averaged
# summary row, then print the detailed report.
svm_label = "SVM (BSMOTE + TL 'majority')"
y_pred_svm = svm.predict(X_test)

metrics_svm = calculate_MacroAvg_metrics(svm_label, y_pred_svm, y_test)
final_result.append(metrics_svm)

print(f"-----------------------------{svm_label}-----------------------------\n")
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (BSMOTE + TL 'majority')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.65      0.75      1718
           2       0.93      0.90      0.91      1100
           3       0.66      0.79      0.72      1050
           4       0.58      0.68      0.63       990
           5       0.24      0.82      0.37        51
           6       1.00      1.00      1.00        45
           7       0.50      1.00      0.67         5
           8       0.25      0.40      0.31         5

    accuracy                           0.74      4964
   macro avg       0.63      0.78      0.67      4964
weighted avg       0.78      0.74      0.75      4964

AUC-ROC for class 1: 0.801776091214978
PR AUC for class 1: 0.6965920988930443
AUC-ROC for class 2: 0.9386693017127801
PR AUC for class 2: 0.8592073007515377
AUC-ROC for class 3: 0.8408528603061051
PR AUC for class 3: 0.5677141835129417
AU

### Confusion_Matrix_SVM

In [None]:
# One-vs-rest confusion table per class for the SVM predictions.
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label, frame in confusion_svm.items():
    # Heading, table and a trailing blank line in a single print call.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3106          140
Actual +          607         1111

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3794           70
Actual +          115          985

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3492          422
Actual +          221          829

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3482          492
Actual +          313          677

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4780          133
Actual +            9           42

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4954            5
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4953          

## MLP Model

In [None]:
# Three hidden layers of 200 units with early stopping (n_iter_no_change=3),
# trained on the SMOTE + Tomek-links ('majority') set.
mlp = MLPClassifier(
    hidden_layer_sizes=(200, 200, 200),
    early_stopping=True,
    n_iter_no_change=3,
    random_state=0,
)
mlp.fit(X=X_train_mlp, y=y_train_mlp)

In [None]:
# Score the MLP on the untouched test split, record its macro-averaged
# summary row, then print the detailed report.
mlp_label = "MLP (SMOTE + TL 'majority')"
y_pred_mlp = mlp.predict(X_test)

metrics_mlp = calculate_MacroAvg_metrics(mlp_label, y_pred_mlp, y_test)
final_result.append(metrics_mlp)

print(f"-----------------------------{mlp_label}-----------------------------\n")
calculate_metrics(y_test, y_pred_mlp)

-----------------------------MLP (SMOTE + TL 'majority')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.75      0.82      1718
           2       0.98      0.98      0.98      1100
           3       0.87      0.79      0.83      1050
           4       0.61      0.86      0.71       990
           5       0.62      0.78      0.70        51
           6       1.00      1.00      1.00        45
           7       0.83      1.00      0.91         5
           8       0.40      0.40      0.40         5

    accuracy                           0.83      4964
   macro avg       0.78      0.82      0.79      4964
weighted avg       0.86      0.83      0.84      4964

AUC-ROC for class 1: 0.8561132282806025
PR AUC for class 1: 0.7732609686569868
AUC-ROC for class 2: 0.9863735177865612
PR AUC for class 2: 0.9642911792451854
AUC-ROC for class 3: 0.878828381633696
PR AUC for class 3: 0.7327484924422878
AUC

### Confusion_Matrix_MLP

In [None]:
# One-vs-rest confusion table per class for the MLP predictions.
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label, frame in confusion_mlp.items():
    # Heading, table and a trailing blank line in a single print call.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3130          116
Actual +          433         1285

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3843           21
Actual +           24         1076

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3793          121
Actual +          222          828

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3428          546
Actual +          139          851

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4889           24
Actual +           11           40

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4958            1
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4956          

# Macro Avg Results

In [None]:
# Stack the per-model single-row summaries into one comparison table.
# The previous line (", Randf_concat = ...") was a syntax error and never
# defined the `df_concat` that is displayed below.
df_concat = pd.concat(final_result, axis=0, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (BSMOTE + TL 'auto'),0.860632,0.854316,0.851415,0.897059
1,Random Forest (SMOTE + ENN 'auto'),0.942786,0.874303,0.90111,0.903505
2,SVM (BSMOTE + TL 'majority'),0.631692,0.779879,0.669537,0.744561
3,MLP (SMOTE + TL 'majority'),0.779757,0.819828,0.793711,0.832393
