In [1]:
# Mount Google Drive (Colab only) at /content/drive; the dataset CSV is read from there below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix, classification_report, confusion_matrix, accuracy_score,average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import (
    SMOTE,
    BorderlineSMOTE,
    ADASYN
)
from imblearn.under_sampling import (
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours
)

# Load Data

In [3]:
# Read the malware-family dataset from the mounted Drive
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareFamily_DATASET_FINAL_Increased_Imbalance.csv'
)

# Discard samples whose family name is 'Unknown_Family'
df = df[df['name'] != 'Unknown_Family']

df.head()

Unnamed: 0,file_name,mov,add,push,pop,inc,call,xor,dec,cmp,...,insb,jae,outsb,popa,jo,ja,name,Category,Category Target,Family Target
0,VirusShare_61414397fe0108e7f3d729d4372850cd,0.176383,0.10218,0.170927,0.014749,0.027229,0.109421,0.007918,0.005528,0.00688,...,0.000797,0.007676,0.001593,0.00659,0.002607,0.000628,Vobfus,Worm,2,1
1,VirusShare_f6409ecf531a795368a0172ee19a1e09,0.035199,0.342921,0.040362,0.027478,0.039454,0.011811,0.012601,0.014605,0.016986,...,0.017364,0.003466,0.00514,0.002688,0.005788,0.001049,Vobfus,Worm,2,1
2,VirusShare_3466fc13ccaffd1202657fcfd72c8692,0.045042,0.30528,0.047044,0.022496,0.03323,0.005754,0.020863,0.017465,0.023421,...,0.013006,0.003432,0.003315,0.002557,0.006192,0.00138,Vobfus,Worm,2,1
3,VirusShare_fca52c87dadb1bf3ea7bf2d526edd12c,0.103277,0.102014,0.214684,0.015998,0.030507,0.137185,0.003077,0.004825,0.001814,...,0.001263,0.00502,0.001781,0.008517,0.003271,0.000745,Vobfus,Worm,2,1
4,VirusShare_93b8eb2c79d46e92ec816844a41fdbaa,0.182714,0.114227,0.179367,0.014457,0.027968,0.102761,0.008104,0.005888,0.010707,...,0.002045,0.004555,0.002092,0.006074,0.003719,0.000527,Vobfus,Worm,2,1


In [4]:
# Number of samples per malware family (absolute counts), to inspect the class imbalance
df['name'].value_counts()

name
Vobfus         4204
Zbot           2353
Diplugem       2269
Obfuscator     2102
Vundo          1877
VBInject       1688
Delf           1679
Beebone        1629
Winwebsec      1625
Enterak.A      1530
OnLineGames     137
Startpage       131
Allaple.A       129
Injector        116
Systex.A        110
Expiro.BK       110
FakeRean        109
Small           105
Toga!rfn         25
Lamechi.B        25
Name: count, dtype: int64

# Features/Targets

In [5]:
# Feature matrix: everything except the identifier and label columns, cast to float
features = df.drop(
    columns=[
        "file_name",
        "name",
        "Category",
        "Category Target",
        "Family Target",
    ]
).astype(float)

# Target vector: the integer-coded malware family
targets = df["Family Target"].astype(int)

In [6]:
# Sanity check: the feature matrix and the target vector should have the same number of rows
print(features.shape)
targets.shape

(21953, 40)


(21953,)

# Split Train/Test Data

In [7]:
# Hold out 20% of the samples for testing; stratify on the family code so
# both parts keep the same family proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    features, targets,
    test_size=0.2,
    stratify=targets,
    random_state=0,
)

X_train.shape, X_test.shape

((17562, 40), (4391, 40))

# Metric Functions

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred, labels=None):
    """Print a per-class evaluation summary for multiclass predictions.

    Prints, in order: the scikit-learn classification report, a one-vs-rest
    AUC-ROC and PR AUC (average precision) line for every class, and the
    overall accuracy.

    Note: both AUC figures are computed from hard 0/1 predictions
    (``y_pred == label``), not from scores or probabilities, so each one
    reflects a single operating point rather than a ranking over thresholds.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    labels : iterable, optional
        Classes to report one-vs-rest metrics for. Defaults to the classes
        present in ``y_true``, so the function no longer depends on the
        notebook-level ``df`` or on class codes being exactly 1..N.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Coerce to arrays so that `== label` is element-wise for lists and Series alike
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.unique(y_true).tolist()

    # Calculate metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Calculate AUC-ROC & Precision-Recall for each class (one-vs-rest)
    for label in labels:
        is_true = y_true == label
        is_pred = y_pred == label
        if is_true.all() or not is_true.any():
            # With only positives or only negatives the one-vs-rest scores are
            # undefined; report that instead of raising or printing a misleading number
            print(f"AUC-ROC for class {label}: undefined (class needs both positive and negative samples)")
            print(f"PR AUC for class {label}: undefined (class needs both positive and negative samples)")
            continue
        print(f"AUC-ROC for class {label}: {roc_auc_score(is_true, is_pred)}")
        print(f"PR AUC for class {label}: {average_precision_score(is_true, is_pred)}")

    # Calculate Accuracy
    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [9]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Summarise one model's macro-averaged metrics as a single-row DataFrame.

    Note the argument order: predictions come *before* the ground truth,
    which is the reverse of scikit-learn's ``(y_true, y_pred)`` convention.

    Parameters
    ----------
    description : str
        Label stored in the 'Model' column.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    y_test : array-like of shape (n_samples,)
        Ground-truth class labels.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns 'Model', 'Precision', 'Recall',
        'F1-Score' (all macro averages) and 'Accuracy'.
    """
    # zero_division=0 yields the same numbers as the default ("warn" also scores 0)
    # but makes the choice explicit and avoids a warning on every call when some
    # class is never predicted. The result is named `report` so it does not
    # shadow the imported `sklearn.metrics` module.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    macro_avg = report['macro avg']

    return pd.DataFrame(
        {
            'Model': description,
            'Precision': macro_avg['precision'],
            'Recall': macro_avg['recall'],
            'F1-Score': macro_avg['f1-score'],
            'Accuracy': accuracy_score(y_test, y_pred),
        },
        index=[0],
    )


# Collects one single-row metrics frame per evaluated model.
# Re-running this cell resets the collection.
final_result = []

In [10]:
from sklearn.metrics import multilabel_confusion_matrix
import pandas as pd

def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Build a one-vs-rest 2x2 confusion table for every class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    labels : iterable, optional
        Classes to build a table for, in output order. Defaults to the sorted
        union of the labels found in ``y_true`` and ``y_pred``, so the function
        does not depend on the notebook-level ``df`` or on class codes being
        exactly 1..N.

    Returns
    -------
    dict
        Maps each label to a DataFrame with rows 'Actual -' / 'Actual +' and
        columns 'Predicted -' / 'Predicted +'.
    """
    if labels is None:
        labels = np.union1d(y_true, y_pred).tolist()
    else:
        labels = list(labels)

    cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)
    confusion_matrices = {}
    for label, cm in zip(labels, cms):
        # Per-class layout returned by scikit-learn: [[TN, FP], [FN, TP]]
        (tn, fp), (fn, tp) = cm
        confusion_matrices[label] = pd.DataFrame(
            {
                'Predicted -': [tn, fn],
                'Predicted +': [fp, tp],
            },
            index=['Actual -', 'Actual +'],
        )
    return confusion_matrices

# def plot_confusion_matrix(y_true, y_pred):
#   labels = range(df['name'].nunique())
#   cm = confusion_matrix(y_true, y_pred, labels=labels)
#   plt.figure(figsize=(10,7))
#   sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
#   plt.xlabel('Predicted')
#   plt.ylabel('True')
#   plt.show()


# Balance Data

## 1. BSMOTE + TL

In [11]:
# Training set for the random forest:
# step 1 - oversample with Borderline-SMOTE (k_neighbors=3, m_neighbors=14)
bsm_3_14 = BorderlineSMOTE(
    sampling_strategy='auto',
    k_neighbors=3,
    m_neighbors=14,
    random_state=0,
)
X_resampled, y_resampled = bsm_3_14.fit_resample(X_train, y_train)

# step 2 - remove Tomek links with sampling_strategy="majority"
tl_maj = TomekLinks(sampling_strategy="majority")
X_train_rf, y_train_rf = tl_maj.fit_resample(X_resampled, y_resampled)

In [12]:
# Training set for KNN:
# step 1 - oversample with Borderline-SMOTE (k_neighbors=1, m_neighbors=12)
bsm_1_12 = BorderlineSMOTE(
    sampling_strategy='auto',
    k_neighbors=1,
    m_neighbors=12,
    random_state=0,
)
X_resampled, y_resampled = bsm_1_12.fit_resample(X_train, y_train)

# step 2 - remove Tomek links with sampling_strategy="majority"
tl_maj = TomekLinks(sampling_strategy="majority")
X_train_knn, y_train_knn = tl_maj.fit_resample(X_resampled, y_resampled)

## 2. ADASYN + TL

In [13]:
# Training set for the MLP:
# step 1 - oversample with ADASYN (n_neighbors=5)
ada_5 = ADASYN(
    sampling_strategy='auto',
    n_neighbors=5,
    random_state=0,
)
X_resampled, y_resampled = ada_5.fit_resample(X_train, y_train)

# step 2 - remove Tomek links with sampling_strategy="auto"
tl_auto = TomekLinks(sampling_strategy="auto")
X_train_mlp, y_train_mlp = tl_auto.fit_resample(X_resampled, y_resampled)

## 3. SMOTE + ENN

In [14]:
# Training set for the SVM:
# step 1 - oversample with SMOTE (k_neighbors=8)
sm_8 = SMOTE(
    sampling_strategy='auto',
    k_neighbors=8,
    random_state=0,
)
X_resampled, y_resampled = sm_8.fit_resample(X_train, y_train)

# step 2 - clean with Edited Nearest Neighbours (sampling_strategy='majority', 1 neighbour, kind_sel='all')
enn1_maj = EditedNearestNeighbours(
    sampling_strategy='majority',
    n_neighbors=1,
    kind_sel='all',
)
X_train_svm, y_train_svm = enn1_maj.fit_resample(X_resampled, y_resampled)

# Models

## KNN model

In [15]:
# Fit a 5-nearest-neighbour classifier on the Borderline-SMOTE + Tomek-links training set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)

In [16]:
# Predict the labels for the test set (the test split is never resampled)
y_pred_knn = knn.predict(X_test)
metrics_knn = calculate_MacroAvg_metrics("KNN (BSMOTE + TL 'majority')",y_pred_knn, y_test)

# Drop any row left by an earlier run of this cell before recording the new one,
# so re-running the cell cannot duplicate this model in the summary table
final_result[:] = [row for row in final_result if row['Model'].iloc[0] != metrics_knn['Model'].iloc[0]]
final_result.append(metrics_knn)

print("-----------------------------KNN (BSMOTE + TL 'majority')-----------------------------\n")
calculate_metrics(y_test, y_pred_knn)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------KNN (BSMOTE + TL 'majority')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.98      0.98      0.98       841
           2       0.84      0.80      0.82       471
           3       1.00      1.00      1.00       454
           4       0.81      0.76      0.78       420
           5       0.92      0.93      0.92       375
           6       0.78      0.80      0.79       338
           7       0.97      0.96      0.97       336
           8       0.96      0.98      0.97       326
           9       0.81      0.87      0.84       325
          10       0.98      0.98      0.98       306
          11       0.76      0.93      0.83        27
          12       1.00      0.81      0.89        26
          13       0.96      1.00      0.98        26
          14       0.74      0.61      0.67        23
          15       0.96      1.00      0.98        22
          16       0.78 

### Confusion_Matrix_KNN

In [17]:
# One-vs-rest confusion table per family label for the KNN predictions
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label, frame in confusion_knn.items():
    # Header line, the table, then one blank separator line
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3535           15
Actual +           18          823

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3847           73
Actual +           92          379

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3937            0
Actual +            1          453

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3896           75
Actual +          102          318

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         3984           32
Actual +           28          347

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         3975           78
Actual +           67          271

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4045           10
Actual +           13          323

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4050          

## RF model

In [18]:
# Train a random forest (fixed seed, otherwise library defaults) on the
# Borderline-SMOTE + Tomek-links training set
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_rf, y_train_rf)

In [19]:
# Predict the labels for the test set (the test split is never resampled)
y_pred_rf = rf.predict(X_test)
metrics_rf = calculate_MacroAvg_metrics("Random Forest (BSMOTE + TL 'majority')",y_pred_rf, y_test)

# Drop any row left by an earlier run of this cell before recording the new one,
# so re-running the cell cannot duplicate this model in the summary table
final_result[:] = [row for row in final_result if row['Model'].iloc[0] != metrics_rf['Model'].iloc[0]]
final_result.append(metrics_rf)

print("-----------------------------Random Forest (BSMOTE + TL 'majority')-----------------------------\n")
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (BSMOTE + TL 'majority')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       841
           2       0.87      0.85      0.86       471
           3       1.00      1.00      1.00       454
           4       0.82      0.85      0.84       420
           5       0.97      0.93      0.95       375
           6       0.79      0.92      0.85       338
           7       0.99      0.98      0.98       336
           8       0.99      0.99      0.99       326
           9       0.90      0.88      0.89       325
          10       1.00      0.98      0.99       306
          11       1.00      0.93      0.96        27
          12       1.00      0.81      0.89        26
          13       1.00      1.00      1.00        26
          14       0.84      0.70      0.76        23
          15       1.00      1.00      1.00        22
          16  

### Confusion_Matrix_RF

In [20]:
# One-vs-rest confusion table per family label for the random-forest predictions
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label, frame in confusion_rf.items():
    # Header line, the table, then one blank separator line
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3546            4
Actual +           10          831

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3860           60
Actual +           69          402

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3937            0
Actual +            1          453

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3891           80
Actual +           61          359

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4006           10
Actual +           26          349

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         3969           84
Actual +           26          312

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4050            5
Actual +            7          329

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4061          

## SVM model

In [21]:
# Train an RBF-kernel SVM on the SMOTE + ENN training set.
# class_weight is left at None; the class balance comes from the resampling step.
svm = SVC(
    kernel='rbf',
    gamma='scale',
    class_weight=None,
    random_state=0,
)
svm.fit(X_train_svm, y_train_svm)

In [22]:
# Predict the labels for the test set (the test split is never resampled)
y_pred_svm = svm.predict(X_test)
metrics_svm = calculate_MacroAvg_metrics("SVM (SMOTE + ENN 'majority')",y_pred_svm, y_test)

# Drop any row left by an earlier run of this cell before recording the new one,
# so re-running the cell cannot duplicate this model in the summary table
final_result[:] = [row for row in final_result if row['Model'].iloc[0] != metrics_svm['Model'].iloc[0]]
final_result.append(metrics_svm)

print("-----------------------------SVM (SMOTE + ENN 'majority')-----------------------------\n")
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (SMOTE + ENN 'majority')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.92      0.94       841
           2       0.73      0.49      0.59       471
           3       1.00      0.99      1.00       454
           4       0.63      0.47      0.54       420
           5       0.82      0.83      0.82       375
           6       0.78      0.66      0.71       338
           7       0.98      0.96      0.97       336
           8       0.83      0.91      0.87       326
           9       0.82      0.69      0.75       325
          10       0.95      0.96      0.95       306
          11       0.68      0.85      0.75        27
          12       1.00      0.81      0.89        26
          13       0.76      1.00      0.87        26
          14       0.19      0.61      0.29        23
          15       0.92      1.00      0.96        22
          16       0.63 

### Confusion_Matrix_SVM

In [23]:
# One-vs-rest confusion table per family label for the SVM predictions
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label, frame in confusion_svm.items():
    # Header line, the table, then one blank separator line
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3515           35
Actual +           66          775

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3836           84
Actual +          239          232

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3937            0
Actual +            3          451

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3857          114
Actual +          224          196

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         3945           71
Actual +           62          313

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         3990           63
Actual +          116          222

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4047            8
Actual +           13          323

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4005          

## MLP Model

In [24]:
# Train a multilayer perceptron with three hidden layers of 200 units on the
# ADASYN + Tomek-links training set; early stopping is enabled with
# n_iter_no_change=5
mlp = MLPClassifier(
    hidden_layer_sizes=(200, 200, 200),
    early_stopping=True,
    n_iter_no_change=5,
    random_state=0,
)
mlp.fit(X_train_mlp, y_train_mlp)

In [25]:
# Predict the labels for the test set (the test split is never resampled)
y_pred_mlp = mlp.predict(X_test)
metrics_mlp = calculate_MacroAvg_metrics("MLP (ADASYN + TL 'auto')",y_pred_mlp, y_test)

# Drop any row left by an earlier run of this cell before recording the new one,
# so re-running the cell cannot duplicate this model in the summary table
final_result[:] = [row for row in final_result if row['Model'].iloc[0] != metrics_mlp['Model'].iloc[0]]
final_result.append(metrics_mlp)

print("-----------------------------MLP (ADASYN + TL 'auto')-----------------------------\n")
calculate_metrics(y_test, y_pred_mlp)

-----------------------------MLP (ADASYN + TL 'auto')-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.99      0.97      0.98       841
           2       0.84      0.63      0.72       471
           3       1.00      1.00      1.00       454
           4       0.66      0.69      0.67       420
           5       0.83      0.87      0.85       375
           6       0.72      0.71      0.72       338
           7       0.97      0.96      0.96       336
           8       0.95      0.98      0.96       326
           9       0.83      0.80      0.81       325
          10       0.92      0.99      0.95       306
          11       0.75      0.89      0.81        27
          12       0.78      0.81      0.79        26
          13       0.93      1.00      0.96        26
          14       0.54      0.61      0.57        23
          15       1.00      1.00      1.00        22
          16       0.77     

### Confusion_Matrix_MLP

In [26]:
# One-vs-rest confusion table per family label for the MLP predictions
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label, frame in confusion_mlp.items():
    # Header line, the table, then one blank separator line
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3539           11
Actual +           24          817

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3863           57
Actual +          173          298

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3937            0
Actual +            1          453

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3822          149
Actual +          131          289

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         3950           66
Actual +           49          326

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         3961           92
Actual +           98          240

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4044           11
Actual +           13          323

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4047          

# Macro Avg Results

In [27]:
# Stack the per-model single-row metric frames into one comparison table
# (rows are stacked vertically and renumbered from 0)
df_concat = pd.concat(final_result, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (BSMOTE + TL 'majority'),0.825208,0.830943,0.825691,0.906172
1,Random Forest (BSMOTE + TL 'majority'),0.952298,0.863201,0.890535,0.93555
2,SVM (SMOTE + ENN 'majority'),0.719241,0.807554,0.724329,0.795719
3,MLP (ADASYN + TL 'auto'),0.761246,0.815844,0.778239,0.862901
