In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read from it.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from collections import Counter
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score,multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

# Load Data

In [None]:
# Read the increased-imbalance malware-category dataset from the mounted Drive.
# Alternative source kept for reference:
# df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv')
df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv')

# Keep only the rows whose category label is known.
df = df.loc[df['Category'].ne('Unknown')]

# Show how many rows remain per category.
df['Category'].value_counts()

Category
Trojan               8590
Worm                 5497
Password Stealer     5252
Tool                 4948
Trojan Downloader     256
Browser Modifier      227
Virus                  25
Trojan Dropper         25
Name: count, dtype: int64

# Drop Categories With Fewer Than 100 Samples

In [None]:
# Remove every row whose 'Category' is one of the excluded labels listed below.
# NOTE(review): none of these labels appear in the value_counts output shown for
# this dataset, and the categories with only 25 rows (Virus, Trojan Dropper) are
# still present afterwards — confirm whether a sample-count threshold was intended.
df = df[~df['Category'].isin([
    'Trojan Monitoring Software',
    'Backdoor',
    'Potentially Unwanted Software',
])]

In [None]:
# Re-check the number of rows per category name after the filter above.
df['Category'].value_counts()

Category
Trojan               8590
Worm                 5497
Password Stealer     5252
Tool                 4948
Trojan Downloader     256
Browser Modifier      227
Virus                  25
Trojan Dropper         25
Name: count, dtype: int64

In [None]:
# Number of rows per 'Category Target' code (the column later used as the model target).
df['Category Target'].value_counts()

Category Target
1    8590
2    5497
3    5252
4    4948
5     256
6     227
8      25
7      25
Name: count, dtype: int64

In [None]:
# Share of the dataset held by each 'Category Target' code: per-code row count
# divided by the total number of rows in df.
df['Category Target'].value_counts()/len(df)

Category Target
1    0.346092
2    0.221475
3    0.211604
4    0.199355
5    0.010314
6    0.009146
8    0.001007
7    0.001007
Name: count, dtype: float64

In [None]:
# Preview the first rows to inspect the available columns.
df.head()

Unnamed: 0,file_name,mov,add,push,pop,inc,call,xor,dec,cmp,...,insb,jae,outsb,popa,jo,ja,name,Category,Category Target,Family Target
0,VirusShare_703bb560ffbbdef898582d51935eed03,0.033368,0.392889,0.045533,0.022447,0.029113,0.007043,0.01288,0.010922,0.020978,...,0.021354,0.00177,0.002335,0.001732,0.006026,0.001017,Beebone,Trojan Downloader,5,8
1,VirusShare_ead649131199ce961265ff8f81bf4e56,0.067794,0.29211,0.059742,0.042351,0.034622,0.010789,0.034783,0.012238,0.006119,...,0.001449,0.000161,0.060386,0.002093,0.000805,0.000644,Obfuscator,Tool,4,4
2,VirusShare_ab921e0669670594c699391875c8f72f,0.045229,0.130716,0.158549,0.046223,0.01839,0.052187,0.041252,0.021372,0.004473,...,0.015905,0.013917,0.011928,0.007952,0.003976,0.000497,OnLineGames,Password Stealer,3,11
3,VirusShare_5d8a0e4f29a9bc1677bbad18156bf3cb,0.117216,0.032492,0.048115,0.040838,0.028375,0.005653,0.032425,0.030511,0.022121,...,0.003338,0.004006,0.003628,0.003405,0.004028,0.004206,Allaple.A,Worm,2,13
4,VirusShare_8d21dc13e0302c61eaa40e3b7d3755f0,0.117504,0.073477,0.194514,0.048658,0.026541,0.070122,0.031469,0.01615,0.033933,...,0.005314,0.004542,0.005938,0.005433,0.003711,0.002672,Enterak.A,Password Stealer,3,10


# Features/Targets

In [None]:
# Feature matrix: drop the identifier and label columns, cast the rest to float.
features = (
    df
    .drop(["file_name", "name", "Category", "Category Target", "Family Target"], axis=1)
    .astype(float)
)

# Feature standardization is currently disabled; the snippet is kept for reference.
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features)
# features_scaled = pd.DataFrame(features_scaled, columns=features.columns)

# Target vector: the integer category code of each row.
targets = df["Category Target"].astype(int)

In [None]:
# (rows, columns) of the feature matrix.
features.shape

(24820, 40)

In [None]:
# Length of the target vector; should match the number of feature rows.
targets.shape

(24820,)

# Train/Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the targets keeps the
# class proportions the same in both partitions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets,
    test_size=0.2,
    stratify=targets,
    random_state=0,
)

# Shapes of the resulting train and test feature matrices.
X_train.shape, X_test.shape

((19856, 40), (4964, 40))

# Metric Functions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred):
    """Print per-class and overall evaluation metrics for hard label predictions.

    Prints, in order: the scikit-learn classification report, a one-vs-rest
    ROC AUC and average-precision value for every class present in
    ``y_true``, and the overall accuracy.

    Args:
        y_true: Array-like of ground-truth class labels.
        y_pred: Array-like of predicted class labels, same length as ``y_true``.

    Returns:
        None. All results are written to stdout.

    Note:
        The ROC AUC / PR AUC values are computed from binarised hard
        predictions (``y_pred == label``), not from scores or probabilities.
    """
    # Work on arrays so `== label` is element-wise even for plain Python lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    print("Classification Report:")
    # zero_division=0 yields the same numbers as the default setting but does
    # not emit a warning for classes that were never predicted.
    print(classification_report(y_true, y_pred, zero_division=0))

    # Calculate AUC-ROC & Precision-Recall for each class. The classes are taken
    # from the labels actually present in y_true instead of a notebook-level
    # DataFrame, so the function does not assume codes run 1..N without gaps and
    # never asks roc_auc_score to score a class with no positive samples.
    for i in np.unique(y_true):
        is_true = y_true == i
        is_pred = y_pred == i
        print(f"AUC-ROC for class {i}: {roc_auc_score(is_true, is_pred)}")
        print(f"PR AUC for class {i}: {average_precision_score(is_true, is_pred)}")

    # Calculate Accuracy
    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [None]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Build a one-row DataFrame of macro-averaged metrics for one model.

    Args:
        description: Label stored in the 'Model' column (e.g. the model name).
        y_pred: Array-like of predicted class labels.
        y_test: Array-like of ground-truth class labels.
            Note the order: predictions come BEFORE the ground truth here.

    Returns:
        pandas.DataFrame with a single row (index 0) and the columns
        'Model', 'Precision', 'Recall', 'F1-Score' and 'Accuracy'. The first
        three metrics are the 'macro avg' entries of the classification report.
    """
    # Named `report` rather than `metrics` so it does not shadow the
    # `sklearn.metrics` module imported at the top of the notebook.
    # zero_division=0 keeps the same values as the default while silencing the
    # warning raised for classes that were never predicted.
    report = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )
    macro_avg = report['macro avg']
    accuracy = accuracy_score(y_test, y_pred)

    # Store the metrics in a DataFrame
    metrics_df = pd.DataFrame({
        'Model': description,
        'Precision': macro_avg['precision'],
        'Recall': macro_avg['recall'],
        'F1-Score': macro_avg['f1-score'],
        'Accuracy': accuracy
    }, index=[0])

    return metrics_df
# Collects one single-row metrics DataFrame per evaluated model; the rows are
# concatenated into the comparison table at the end of the notebook.
# Re-running this cell resets the list.
final_result = []

In [None]:
def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Build one 2x2 one-vs-rest confusion table per class label.

    Args:
        y_true: Array-like of ground-truth class labels.
        y_pred: Array-like of predicted class labels.
        labels: Optional iterable of class labels to report, in output order.
            When omitted, the sorted set of labels occurring in ``y_true`` or
            ``y_pred`` is used, so the function neither reads a notebook-level
            DataFrame nor assumes the codes run 1..N without gaps.

    Returns:
        dict mapping each label to a pandas.DataFrame with rows
        'Actual -' / 'Actual +' and columns 'Predicted -' / 'Predicted +'.
    """
    if labels is None:
        observed = np.concatenate([np.ravel(y_true), np.ravel(y_pred)])
        # .tolist() turns NumPy scalars into plain Python values for the dict keys.
        labels = np.unique(observed).tolist()
    else:
        labels = list(labels)

    cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)

    confusion_matrices = {}
    for cm, label in zip(cms, labels):
        # Each per-label matrix is laid out as [[TN, FP], [FN, TP]].
        (tn, fp), (fn, tp) = cm
        confusion_matrices[label] = pd.DataFrame({
            'Predicted -': [tn, fn],
            'Predicted +': [fp, tp]
        }, index=['Actual -', 'Actual +'])
    return confusion_matrices

# Models

## KNN model

In [None]:
# Fit a k-nearest-neighbours classifier (k=5) on the training split.
# The features are used as-is; no standardization is applied before fitting.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [None]:
# Score the held-out test split with the fitted KNN model.
y_pred_knn = knn.predict(X_test)

# Record the macro-averaged summary row for the final comparison table.
# Note: re-running this cell appends another row to final_result.
metrics_knn = calculate_MacroAvg_metrics(
    description="KNN (Unbalanced)",
    y_pred=y_pred_knn,
    y_test=y_test,
)
final_result.append(metrics_knn)

# Print the detailed per-class report under a banner.
print("-----------------------------KNN (Unbalanced)-----------------------------\n")
calculate_metrics(y_true=y_test, y_pred=y_pred_knn)

-----------------------------KNN (Unbalanced)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.87      0.92      0.90      1718
           2       0.97      0.99      0.98      1100
           3       0.88      0.87      0.88      1050
           4       0.84      0.75      0.79       990
           5       0.84      0.73      0.78        51
           6       1.00      1.00      1.00        45
           7       0.83      1.00      0.91         5
           8       1.00      0.20      0.33         5

    accuracy                           0.89      4964
   macro avg       0.91      0.81      0.82      4964
weighted avg       0.89      0.89      0.89      4964

AUC-ROC for class 1: 0.9256588389973295
PR AUC for class 1: 0.8304828150342536
AUC-ROC for class 2: 0.9904710144927535
PR AUC for class 2: 0.9613885527398695
AUC-ROC for class 3: 0.9215577779399957
PR AUC for class 3: 0.7983167332264834
AUC-ROC for c

### Confusion_Matrix_KNN

In [None]:
# One-vs-rest confusion table per class for the KNN predictions.
confusion_knn = generate_multilabel_confusion_df(y_true=y_test, y_pred=y_pred_knn)
for label in confusion_knn:
    frame = confusion_knn[label]
    # Header line, the table, then one blank separator line.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3009          237
Actual +          130         1588

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3829           35
Actual +           11         1089

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3792          122
Actual +          132          918

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3837          137
Actual +          248          742

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4906            7
Actual +           14           37

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4958            1
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## RF model

In [None]:
# Train a random forest on the training split. Only random_state is set (to pin
# the seed); every other hyperparameter is left at the library default.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [None]:
# Score the held-out test split with the fitted random forest.
y_pred_rf = rf.predict(X_test)

# Record the macro-averaged summary row for the final comparison table.
# Note: re-running this cell appends another row to final_result.
metrics_rf = calculate_MacroAvg_metrics(
    description="Random Forest (Unbalanced)",
    y_pred=y_pred_rf,
    y_test=y_test,
)
final_result.append(metrics_rf)

# Print the detailed per-class report under a banner.
print("-----------------------------Random Forest (Unbalanced)-----------------------------\n")
calculate_metrics(y_true=y_test, y_pred=y_pred_rf)

-----------------------------Random Forest (Unbalanced)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.92      0.92      1718
           2       0.99      0.99      0.99      1100
           3       0.94      0.90      0.92      1050
           4       0.82      0.87      0.84       990
           5       1.00      0.65      0.79        51
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00         5
           8       0.00      0.00      0.00         5

    accuracy                           0.92      4964
   macro avg       0.83      0.79      0.81      4964
weighted avg       0.92      0.92      0.92      4964

AUC-ROC for class 1: 0.9391280895910575
PR AUC for class 1: 0.8731424984428126
AUC-ROC for class 2: 0.9928632599284775
PR AUC for class 2: 0.9798296751556298
AUC-ROC for class 3: 0.941986763024065
PR AUC for class 3: 0.8675162119642378
AUC-

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_RF

In [None]:
# One-vs-rest confusion table per class for the random-forest predictions.
confusion_rf = generate_multilabel_confusion_df(y_true=y_test, y_pred=y_pred_rf)
for label in confusion_rf:
    frame = confusion_rf[label]
    # Header line, the table, then one blank separator line.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3104          142
Actual +          134         1584

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3851           13
Actual +           12         1088

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3855           59
Actual +          106          944

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3785          189
Actual +          128          862

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4913            0
Actual +           18           33

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            0            5

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## SVM model

In [None]:
# Train an RBF-kernel support vector classifier on the training split.
# class_weight=None: no per-class re-weighting is requested despite the class imbalance.
# gamma='scale': kernel width is chosen by the library's 'scale' heuristic.
svm = SVC(kernel='rbf', random_state=0,class_weight= None, gamma= 'scale')
svm.fit(X_train, y_train)

In [None]:
# Score the held-out test split with the fitted SVM.
y_pred_svm = svm.predict(X_test)

# Record the macro-averaged summary row for the final comparison table.
# Note: re-running this cell appends another row to final_result.
metrics_svm = calculate_MacroAvg_metrics(
    description="SVM (Unbalanced)",
    y_pred=y_pred_svm,
    y_test=y_test,
)
final_result.append(metrics_svm)

# Print the detailed per-class report under a banner.
print("-----------------------------SVM (Unbalanced)-----------------------------\n")
calculate_metrics(y_true=y_test, y_pred=y_pred_svm)

-----------------------------SVM (Unbalanced)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.79      0.83      0.81      1718
           2       0.93      0.98      0.95      1100
           3       0.77      0.70      0.73      1050
           4       0.64      0.64      0.64       990
           5       0.57      0.16      0.25        51
           6       1.00      1.00      1.00        45
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         5

    accuracy                           0.79      4964
   macro avg       0.59      0.54      0.55      4964
weighted avg       0.78      0.79      0.78      4964

AUC-ROC for class 1: 0.8565357058064479
PR AUC for class 1: 0.7123658831353932
AUC-ROC for class 2: 0.9773122529644269
PR AUC for class 2: 0.9107770426581514
AUC-ROC for class 3: 0.8202812857386184
PR AUC for class 3: 0.60209529597769
AUC-ROC for cla

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_SVM

In [None]:
# One-vs-rest confusion table per class for the SVM predictions.
confusion_svm = generate_multilabel_confusion_df(y_true=y_test, y_pred=y_pred_svm)
for label in confusion_svm:
    frame = confusion_svm[label]
    # Header line, the table, then one blank separator line.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         2855          391
Actual +          286         1432

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3780           84
Actual +           26         1074

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3700          214
Actual +          320          730

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3624          350
Actual +          360          630

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4907            6
Actual +           43            8

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            5            0

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

## MLP Model

In [None]:
# Train a multi-layer perceptron with three hidden layers of 200 units each.
# early_stopping=True together with n_iter_no_change=5 lets training stop once
# the monitored score has not improved for 5 consecutive iterations.
mlp = MLPClassifier(hidden_layer_sizes=(200,200, 200),early_stopping=True,random_state=0, n_iter_no_change= 5)
mlp.fit(X_train, y_train)

In [None]:
# Score the held-out test split with the fitted MLP.
y_pred_mlp = mlp.predict(X_test)

# Record the macro-averaged summary row for the final comparison table.
# Note: re-running this cell appends another row to final_result.
metrics_mlp = calculate_MacroAvg_metrics(
    description="MLP (Unbalanced)",
    y_pred=y_pred_mlp,
    y_test=y_test,
)
final_result.append(metrics_mlp)

# Print the detailed per-class report under a banner.
print("-----------------------------MLP (Unbalanced)-----------------------------\n")
calculate_metrics(y_true=y_test, y_pred=y_pred_mlp)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------MLP (Unbalanced)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.83      0.90      0.86      1718
           2       0.98      0.98      0.98      1100
           3       0.86      0.78      0.81      1050
           4       0.73      0.72      0.72       990
           5       0.74      0.57      0.64        51
           6       1.00      1.00      1.00        45
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         5

    accuracy                           0.85      4964
   macro avg       0.64      0.62      0.63      4964
weighted avg       0.85      0.85      0.85      4964

AUC-ROC for class 1: 0.9010436055623577
PR AUC for class 1: 0.7832152677804785
AUC-ROC for class 2: 0.9870238095238095
PR AUC for class 2: 0.9639596118392114
AUC-ROC for class 3: 0.8704662140788865
PR AUC for class 3: 0.7111343883526473
AUC-ROC for c

### Confusion_Matrix_MLP

In [None]:
# One-vs-rest confusion table per class for the MLP predictions.
confusion_mlp = generate_multilabel_confusion_df(y_true=y_test, y_pred=y_pred_mlp)
for label in confusion_mlp:
    frame = confusion_mlp[label]
    # Header line, the table, then one blank separator line.
    print(f'Confusion Matrix for {label}:', frame, sep='\n', end='\n\n')

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         2938          308
Actual +          177         1541

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         3841           23
Actual +           22         1078

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         3776          138
Actual +          235          815

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         3707          267
Actual +          280          710

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         4903           10
Actual +           22           29

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         4919            0
Actual +            0           45

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         4959            0
Actual +            5            0

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         4959          

# Macro Avg Results

In [None]:
# Stack the per-model summary rows into one comparison table; ignore_index
# renumbers the rows 0..n-1 instead of repeating each frame's index 0.
df_concat = pd.concat(final_result, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (Unbalanced),0.905009,0.80795,0.821198,0.891418
1,Random Forest (Unbalanced),0.833409,0.790988,0.807316,0.918815
2,SVM (Unbalanced),0.587571,0.537294,0.547255,0.789484
3,MLP (Unbalanced),0.642254,0.61737,0.627965,0.849718
