In [None]:
# Mount Google Drive so the dataset CSV under /content/drive/MyDrive can be read.
# This cell requires a Google Colab runtime (google.colab is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix, classification_report, confusion_matrix, accuracy_score,average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import (
    SMOTE,
    BorderlineSMOTE,
    ADASYN
)

# Load Data

In [None]:
# Alternative dataset file, kept for reference (not loaded below):
# '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv'
df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv')

# Categories removed before modelling: samples without a known category,
# plus three named categories that are excluded from this experiment.
excluded_categories = [
    'Unknown',
    'Trojan Monitoring Software',
    'Backdoor',
    'Potentially Unwanted Software',
]
df = df[~df['Category'].isin(excluded_categories)]
df.head()

Unnamed: 0,file_name,mov,add,push,pop,inc,call,xor,dec,cmp,...,insb,jae,outsb,popa,jo,ja,name,Category,Category Target,Family Target
0,VirusShare_703bb560ffbbdef898582d51935eed03,0.033368,0.392889,0.045533,0.022447,0.029113,0.007043,0.01288,0.010922,0.020978,...,0.021354,0.00177,0.002335,0.001732,0.006026,0.001017,Beebone,Trojan Downloader,5,8
1,VirusShare_ead649131199ce961265ff8f81bf4e56,0.067794,0.29211,0.059742,0.042351,0.034622,0.010789,0.034783,0.012238,0.006119,...,0.001449,0.000161,0.060386,0.002093,0.000805,0.000644,Obfuscator,Tool,4,4
2,VirusShare_ab921e0669670594c699391875c8f72f,0.045229,0.130716,0.158549,0.046223,0.01839,0.052187,0.041252,0.021372,0.004473,...,0.015905,0.013917,0.011928,0.007952,0.003976,0.000497,OnLineGames,Password Stealer,3,11
3,VirusShare_5d8a0e4f29a9bc1677bbad18156bf3cb,0.117216,0.032492,0.048115,0.040838,0.028375,0.005653,0.032425,0.030511,0.022121,...,0.003338,0.004006,0.003628,0.003405,0.004028,0.004206,Allaple.A,Worm,2,13
4,VirusShare_8d21dc13e0302c61eaa40e3b7d3755f0,0.117504,0.073477,0.194514,0.048658,0.026541,0.070122,0.031469,0.01615,0.033933,...,0.005314,0.004542,0.005938,0.005433,0.003711,0.002672,Enterak.A,Password Stealer,3,10


In [None]:
# Number of samples per encoded category label (shows how imbalanced the classes are).
df['Category Target'].value_counts()  # divide by len(df) to see proportions instead

Category Target
1    8590
2    5497
3    5252
4    4948
5     256
6     227
8      25
7      25
Name: count, dtype: int64

# Features/Targets

In [None]:
# Identifier and label columns; every other column is treated as a model input.
non_feature_columns = ["file_name", "name", "Category", "Category Target", "Family Target"]

# Feature matrix: all remaining columns, cast to float.
features = df.drop(columns=non_feature_columns).astype(float)

# Label vector: the integer-encoded category.
targets = df["Category Target"].astype(int)

In [None]:
# Sanity check: the feature matrix and the target vector should have the same row count.
print(features.shape)
targets.shape

(24820, 40)


(24820,)

# Split Train/Test Data

In [None]:
# Hold out 20% of the samples as the test set. stratify=targets asks for the same
# class proportions in both splits; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    stratify=targets,
    test_size=0.2,
    random_state=0)


# Display the resulting split sizes.
X_train.shape, X_test.shape

((19856, 40), (4964, 40))

# Metrics Functions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred):
    """Print a classification report, per-class one-vs-rest scores, and accuracy.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels (hard predictions, not probabilities).

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    The per-class "AUC-ROC" and "PR AUC" lines are computed from binarized
    hard predictions (``y_pred == label``), not from continuous scores. With
    a binary score the ROC "area" reduces to the mean of the true-positive
    and true-negative rates, so read these as single-operating-point figures
    rather than threshold-free curve areas.
    """
    # Arrays make the element-wise `== label` comparisons below work for
    # lists as well as for pandas/numpy inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Calculate AUC-ROC & Precision-Recall for each class.
    # The classes come from the labels being scored rather than from the
    # global `df`, so the function no longer assumes labels are exactly
    # 1..N and never asks for a one-vs-rest score on a class that has no
    # positive samples in y_true.
    for i in np.unique(y_true):
        truth_is_class = y_true == i
        predicted_class = y_pred == i
        print(f"AUC-ROC for class {i}: {roc_auc_score(truth_is_class, predicted_class)}")
        print(f"PR AUC for class {i}: {average_precision_score(truth_is_class, predicted_class)}")

    # Calculate Accuracy
    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [None]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Build a one-row DataFrame with a model's macro-averaged test scores.

    Note the argument order: predictions come before the true labels.

    Parameters
    ----------
    description : str
        Text stored in the 'Model' column.
    y_pred : array-like
        Predicted class labels.
    y_test : array-like
        Ground-truth class labels.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns 'Model', 'Precision', 'Recall',
        'F1-Score' (all macro averages) and 'Accuracy'.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_avg = report['macro avg']
    overall_accuracy = accuracy_score(y_test, y_pred)

    summary_row = {
        'Model': description,
        'Precision': macro_avg['precision'],
        'Recall': macro_avg['recall'],
        'F1-Score': macro_avg['f1-score'],
        'Accuracy': overall_accuracy,
    }
    return pd.DataFrame(summary_row, index=[0])


# Collects one summary row per evaluated model; re-running this cell empties it.
final_result = []

In [None]:
def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Build one 2x2 one-vs-rest confusion table per class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    labels : iterable, optional
        Classes to report, in the order they should appear. When omitted,
        the sorted set of labels that occur in ``y_true`` or ``y_pred`` is
        used, so the function does not depend on the global ``df`` or on
        labels being numbered 1..N.

    Returns
    -------
    dict
        Maps each label to a DataFrame with rows 'Actual -' / 'Actual +'
        and columns 'Predicted -' / 'Predicted +'.
    """
    if labels is None:
        observed = np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
        # tolist() yields plain Python values to use as dictionary keys.
        labels = np.unique(observed).tolist()
    else:
        # Materialize so the iterable can be both passed on and zipped.
        labels = list(labels)

    cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)

    confusion_matrices = {}
    for cm, label in zip(cms, labels):
        # Each cm is laid out as [[TN, FP], [FN, TP]] for "label vs. rest".
        (tn, fp), (fn, tp) = cm
        confusion_matrices[label] = pd.DataFrame({
            'Predicted -': [tn, fn],
            'Predicted +': [fp, tp]
        }, index=['Actual -', 'Actual +'])
    return confusion_matrices

# Balance Data

## 1. BSMOTE

In [None]:
# Oversample the training split only (X_test / y_test are never resampled) with
# Borderline-SMOTE, k_neighbors=3 and m_neighbors=19. The result is the training
# set used for the SVM model.
# NOTE(review): k/m differ per model and look tuned — confirm how they were chosen.
bsm_3_19 = BorderlineSMOTE(sampling_strategy='auto', random_state=0,k_neighbors=3, m_neighbors=19)
X_train_svm, y_train_svm = bsm_3_19.fit_resample(X_train, y_train)

In [None]:
# Borderline-SMOTE on the training split with k_neighbors=3 and m_neighbors=17.
# The result is the training set used for the KNN model.
bsm_3_17 = BorderlineSMOTE(sampling_strategy='auto', random_state=0,k_neighbors=3, m_neighbors=17)
X_train_knn, y_train_knn= bsm_3_17.fit_resample(X_train, y_train)

## 2. SMOTE

In [None]:
# Plain SMOTE on the training split with k_neighbors=2.
# The result is the training set used for the MLP model.
sm_2 = SMOTE(random_state=0,sampling_strategy='auto', k_neighbors=2)
X_train_mlp, y_train_mlp = sm_2.fit_resample(X_train, y_train)

In [None]:
# Plain SMOTE on the training split with k_neighbors=8.
# The result is the training set used for the Random Forest model.
sm_8 = SMOTE(random_state=0,sampling_strategy='auto', k_neighbors=8)
X_train_rf, y_train_rf = sm_8.fit_resample(X_train, y_train)

# Models

## KNN model

In [None]:
# Fit a 5-nearest-neighbours classifier on the Borderline-SMOTE (k=3, m=17) training set.
# NOTE(review): features are used unscaled; StandardScaler is imported but not applied
# in the visible cells — confirm this is intentional for a distance-based model.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)

In [None]:
# Predict the labels for the (un-resampled) test set
y_pred_knn = knn.predict(X_test)
# One-row macro-average summary; the helper takes (description, y_pred, y_test).
metrics_knn = calculate_MacroAvg_metrics("KNN (BSMOTE k=3, m=17)",y_pred_knn, y_test)
# Not idempotent: every run of this cell appends another row to final_result.
final_result.append(metrics_knn)

print("-----------------------------KNN (BSMOTE k=3, m=17)-----------------------------\n")
# Full per-class report, printed to stdout.
calculate_metrics(y_test, y_pred_knn)

-----------------------------KNN (BSMOTE k=3, m=17)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.93      0.88      0.90      1718
           2       0.98      0.99      0.98      1100
           3       0.87      0.91      0.89      1050
           4       0.81      0.84      0.82       990
           5       0.70      0.84      0.77        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       0.50      0.40      0.44         5

    accuracy                           0.90      4964
   macro avg       0.85      0.86      0.85      4964
weighted avg       0.90      0.90      0.90      4964

AUC-ROC for class 1: 0.9208772756583369
PR AUC for class 1: 0.8574964475761738
AUC-ROC for class 2: 0.9914398644833429
PR AUC for class 2: 0.9701610286014835
AUC-ROC for class 3: 0.9343338929848893
PR AUC for class 3: 0.8058325406955102
AUC-ROC

### Confusion_Matrix_KNN

In [None]:
# One-vs-rest confusion table for every class, printed with a blank line between tables.
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label, frame in confusion_knn.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3131          115
Actual +          211         1507

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3840           24
Actual +           12         1088

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3769          145
Actual +           99          951

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3782          192
Actual +          163          827

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4895           18
Actual +            8           43

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4957          

## RF model

In [None]:
# Train a Random Forest (library defaults, fixed seed) on the SMOTE (k=8) training set
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_rf, y_train_rf)

In [None]:
# Predict the labels for the (un-resampled) test set
y_pred_rf = rf.predict(X_test)
# One-row macro-average summary; the helper takes (description, y_pred, y_test).
metrics_rf = calculate_MacroAvg_metrics("Random Forest (SMOTE k=8)",y_pred_rf, y_test)
# Not idempotent: every run of this cell appends another row to final_result.
final_result.append(metrics_rf)
print("-----------------------------Random Forest (SMOTE k=8)-----------------------------\n")
# Full per-class report, printed to stdout.
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (SMOTE k=8)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.90      0.92      1718
           2       0.99      0.99      0.99      1100
           3       0.93      0.90      0.92      1050
           4       0.81      0.90      0.85       990
           5       0.98      0.82      0.89        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       1.00      0.40      0.57         5

    accuracy                           0.92      4964
   macro avg       0.96      0.86      0.89      4964
weighted avg       0.92      0.92      0.92      4964

AUC-ROC for class 1: 0.936130399947782
PR AUC for class 1: 0.8816195883960979
AUC-ROC for class 2: 0.9933808582721627
PR AUC for class 2: 0.9833936214449182
AUC-ROC for class 3: 0.9432179964474292
PR AUC for class 3: 0.8637243099985108
AUC-R

### Confusion_Matrix_RF

In [None]:
# One-vs-rest confusion table for every class, printed with a blank line between tables.
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label, frame in confusion_rf.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3145          101
Actual +          166         1552

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3855            9
Actual +           12         1088

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3846           68
Actual +          101          949

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3760          214
Actual +          102          888

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4912            1
Actual +            9           42

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## SVM model

In [None]:
# Train an RBF-kernel SVM on the Borderline-SMOTE (k=3, m=19) training set.
# class_weight=None: no per-class weighting is applied on top of the oversampling.
# NOTE(review): features are used unscaled; StandardScaler is imported but not applied
# in the visible cells — confirm this is intentional for an RBF kernel.
svm = SVC(kernel='rbf', random_state=0,class_weight= None, gamma= 'scale')
svm.fit(X_train_svm, y_train_svm)

In [None]:
# Predict the labels for the (un-resampled) test set
y_pred_svm = svm.predict(X_test)
# One-row macro-average summary; the helper takes (description, y_pred, y_test).
metrics_svm = calculate_MacroAvg_metrics("SVM (BSMOTE k=3, m=19)",y_pred_svm, y_test)
# Not idempotent: every run of this cell appends another row to final_result.
final_result.append(metrics_svm)
print("-----------------------------SVM (BSMOTE k=3, m=19)-----------------------------\n")
# Full per-class report, printed to stdout.
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (BSMOTE k=3, m=19)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.67      0.76      1718
           2       0.93      0.88      0.90      1100
           3       0.67      0.79      0.73      1050
           4       0.58      0.67      0.63       990
           5       0.23      0.82      0.36        51
           6       1.00      1.00      1.00        45
           7       0.31      1.00      0.48         5
           8       0.22      0.40      0.29         5

    accuracy                           0.75      4964
   macro avg       0.60      0.78      0.64      4964
weighted avg       0.78      0.75      0.76      4964

AUC-ROC for class 1: 0.8120312131273595
PR AUC for class 1: 0.7073991721319443
AUC-ROC for class 2: 0.9304244306418219
PR AUC for class 2: 0.8440962569657001
AUC-ROC for class 3: 0.8424554103705867
PR AUC for class 3: 0.5749160870585625
AUC-ROC

### Confusion_Matrix_SVM

In [None]:
# One-vs-rest confusion table for every class, printed with a blank line between tables.
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label, frame in confusion_svm.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3097          149
Actual +          567         1151

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3790           74
Actual +          132          968

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3512          402
Actual +          223          827

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3500          474
Actual +          322          668

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4774          139
Actual +            9           42

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4948           11
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4952          

## MLP Model

In [None]:
# Train an MLP with three hidden layers of 200 units on the SMOTE (k=2) training set.
# early_stopping=True with n_iter_no_change=5 stops training once the validation score
# has not improved for 5 consecutive epochs.
# NOTE(review): the early-stopping validation data is carved out of the resampled
# training set, so it presumably contains synthetic samples — confirm this is acceptable.
mlp = MLPClassifier(hidden_layer_sizes=(200,200, 200),early_stopping=True,random_state=0, n_iter_no_change= 5)
mlp.fit(X_train_mlp, y_train_mlp)

In [None]:
# Predict the labels for the (un-resampled) test set
y_pred_mlp = mlp.predict(X_test)
# One-row macro-average summary; the helper takes (description, y_pred, y_test).
metrics_mlp = calculate_MacroAvg_metrics("MLP (SMOTE k=2)",y_pred_mlp, y_test)
# Not idempotent: every run of this cell appends another row to final_result.
final_result.append(metrics_mlp)
print("-----------------------------MLP (SMOTE k=2)-----------------------------\n")
# Full per-class report, printed to stdout.
calculate_metrics(y_test, y_pred_mlp)

-----------------------------MLP (SMOTE k=2)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.90      0.86      0.88      1718
           2       0.99      0.99      0.99      1100
           3       0.83      0.89      0.86      1050
           4       0.79      0.77      0.78       990
           5       0.48      0.82      0.60        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       1.00      0.40      0.57         5

    accuracy                           0.88      4964
   macro avg       0.87      0.84      0.84      4964
weighted avg       0.88      0.88      0.88      4964

AUC-ROC for class 1: 0.907097801754035
PR AUC for class 1: 0.8263928651784638
AUC-ROC for class 2: 0.991499623564841
PR AUC for class 2: 0.9777071350217676
AUC-ROC for class 3: 0.9192240309511642
PR AUC for class 3: 0.7629312601015589
AUC-ROC for clas

### Confusion_Matrix_MLP

In [None]:
# One-vs-rest confusion table for every class, printed with a blank line between tables.
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label, frame in confusion_mlp.items():
    print(f'Confusion Matrix for {label}:', frame, '', sep='\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3085          161
Actual +          234         1484

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3851           13
Actual +           15         1085

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3729          185
Actual +          120          930

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3768          206
Actual +          230          760

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4867           46
Actual +            9           42

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

# Macro Avg Results

In [None]:
# Stack the one-row summaries collected from the model cells into one table.
# Each model cell appends to `final_result` every time it runs, so a re-run
# cell leaves duplicate rows; keep only the most recent row per model so the
# table stays one row per model without restarting the kernel.
df_concat = (
    pd.concat(final_result, axis=0, ignore_index=True)
    .drop_duplicates(subset='Model', keep='last')
    .reset_index(drop=True)
)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,"KNN (BSMOTE k=3, m=17)",0.848964,0.85631,0.851002,0.900081
1,Random Forest (SMOTE k=8),0.955798,0.864597,0.892934,0.92083
2,"SVM (BSMOTE k=3, m=19)",0.604872,0.779483,0.64287,0.746978
3,MLP (SMOTE k=2),0.873549,0.840885,0.835221,0.876914
