In [None]:
# Mount Google Drive so the dataset CSVs under /content/drive/MyDrive can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf


from collections import Counter
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,average_precision_score

# Load Real Data

In [None]:
real_df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/Rescaled_DATASET.csv')
# Drop samples whose category is 'Unknown' or one of the categories excluded from this experiment
real_df = real_df[~real_df['Category'].isin(
    ['Unknown', 'Trojan Monitoring Software', 'Backdoor', 'Potentially Unwanted Software']
)]
# Remove identifier and textual label columns; the numeric 'Category Target' column stays as the label
real_df = real_df.drop(["file_name", "name", "Category", "Family Target"], axis=1)
real_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.995171,-0.995198,-0.988461,-0.999296,-0.998518,-0.993056,-0.999842,-0.999793,-0.999889,-0.999902,...,-0.999996,-0.99992,-0.999967,-0.999951,-0.999805,-0.999934,-0.999635,-0.999864,-0.999974,2
1,-0.996971,-0.9993,-0.993004,-0.997937,-0.995422,-0.99975,-0.99537,-0.994865,-0.998209,-0.999882,...,-0.999988,-0.999298,-1.0,-0.999255,-0.999344,-0.999281,-0.999322,-0.999387,-0.999373,3
4,-0.987096,-0.995503,-0.994812,-0.996726,-0.997999,-0.995715,-0.998633,-0.999423,-0.999211,-0.999774,...,-0.999993,-0.999714,-0.999994,-0.999868,-0.999842,-0.999679,-0.999824,-0.999905,-0.999935,1
8,-0.99189,-0.998705,-0.989185,-0.998874,-0.999152,-0.994189,-0.999159,-0.999824,-0.997573,-0.999702,...,-1.0,-1.0,-1.0,-1.0,-0.99979,-1.0,-0.999993,-1.0,-0.999946,1
10,-0.995647,-0.994348,-0.997736,-0.998155,-0.998579,-0.999525,-0.997752,-0.99878,-0.998818,-0.998609,...,-0.999736,-0.999845,-0.99971,-0.999824,-0.999777,-0.999829,-0.999805,-0.999807,-0.999887,3


In [None]:
# Label vector: the integer-coded class of each sample
targets = real_df["Category Target"].astype(int)
# Feature matrix: every remaining column, cast to float
features = real_df.drop("Category Target", axis=1).astype(float)

# Split Train/Test Data

In [None]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, stratify=targets, random_state=0
)

X_train.shape, X_test.shape

((25828, 40), (6457, 40))

# Load Fake Data

In [None]:
# Load the synthetic samples stored on Drive.
# NOTE(review): presumably WGAN-GP output for the minority classes, judging by the
# file name and the model labels used further down — confirm against the generator notebook.
fake_df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/Fake_samples_20_percent_minority.csv')
fake_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.999919,-0.999991,-0.999812,-0.987941,-0.999717,-0.996263,-0.999336,-0.996359,-0.998097,-0.999416,...,-0.999537,-0.999875,-0.99078,-0.999073,-0.999832,-0.999655,-0.998752,-0.999744,-0.999834,2
1,-0.998529,-0.99921,-0.996625,-0.995348,-0.998636,-0.999635,-0.999365,-0.996392,-0.998008,-0.994739,...,-0.999374,-0.998241,-0.995622,-0.995949,-0.999908,-0.997161,-0.999239,-0.999927,-0.999462,2
2,-0.998967,-0.997969,-0.996306,-0.990246,-0.983596,-0.994683,-0.999213,-0.998867,-0.999625,-0.998377,...,-0.997972,-0.998413,-0.988392,-0.99891,-0.993769,-0.999562,-0.975699,-0.999176,-0.999587,2
3,-0.99632,-0.999803,-0.984721,-0.99724,-0.996267,-0.998956,-0.997333,-0.994125,-0.931219,-0.999351,...,-0.999303,-0.99764,-0.971503,-0.997992,-0.999621,-0.994554,-0.989555,-0.996913,-0.999865,2
4,-0.999927,-0.996486,-0.999233,-0.999016,-0.995594,-0.999782,-0.99958,-0.996745,-0.999129,-0.996367,...,-0.998633,-0.999986,-0.999715,-0.999243,-0.997769,-0.999952,-0.994553,-0.999853,-0.999259,2


# Make Training Data Frame

In [None]:
# Re-attach the labels to the training features so synthetic rows can be appended row-wise.
# y_train is a Series named 'Category Target', so the concatenated frame already carries a
# correctly named label column; the former rename of a column called 0 matched nothing
# and has been removed.
train_df = pd.concat([X_train, y_train], axis=1)

In [None]:
# Preview the training frame (feature columns followed by 'Category Target')
train_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
62050,-0.994659,-0.99483,-0.996598,-0.998392,-0.999368,-0.997515,-0.999309,-0.99964,-0.998942,-0.999823,...,-0.999974,-0.999725,-0.999993,-0.999859,-0.999777,-0.999682,-0.999768,-0.999859,-0.999951,3
22262,-0.995171,-0.995191,-0.988454,-0.999288,-0.998535,-0.993056,-0.999845,-0.999785,-0.999889,-0.999904,...,-0.999996,-0.999921,-0.999967,-0.999944,-0.999803,-0.999931,-0.999641,-0.999864,-0.999976,2
18218,-0.995295,-0.994733,-0.988697,-0.999358,-0.998475,-0.992971,-0.999822,-0.999775,-0.99988,-0.999824,...,-0.999984,-0.999891,-0.999988,-0.999919,-0.999925,-0.999922,-0.999735,-0.999957,-0.99997,2
47673,-0.99189,-0.998705,-0.989185,-0.998874,-0.999152,-0.994189,-0.999159,-0.999824,-0.997573,-0.999702,...,-1.0,-1.0,-1.0,-1.0,-0.99979,-1.0,-0.999993,-1.0,-0.999946,1
56956,-0.997221,-0.98274,-0.997831,-0.998653,-0.997831,-0.999678,-0.998848,-0.998407,-0.996119,-0.999449,...,-0.999915,-0.999864,-0.999966,-0.999475,-0.999856,-0.999822,-0.999881,-0.999424,-0.999856,4


In [None]:
# Number of rows currently in the training frame
train_df['Category Target'].size

25828

# Combine fake and real samples for Training

In [None]:
# Stack the synthetic rows under the training rows, then shuffle with a fixed seed and
# renumber the index. Column order is unchanged.
# NOTE(review): this cell overwrites train_df, so running it twice appends fake_df twice.
train_df = (
    pd.concat([train_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

In [None]:
# Preview the training frame after the synthetic rows were appended and shuffled
train_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.973909,-0.998533,-0.995467,-0.998646,-0.999718,-0.997423,-0.999643,-0.999831,-0.999417,-0.999737,...,-0.999981,-1.0,-0.999981,-0.999981,-0.999944,-1.0,-0.999172,-1.0,-0.999944,4
1,-0.998174,-0.982711,-0.998022,-0.998536,-0.998036,-0.999454,-0.999371,-0.999346,-0.999138,-0.99699,...,-0.999863,-0.999832,-0.999939,-0.999104,-0.999831,-0.999752,-0.999861,-0.999698,-0.999932,2
2,-0.998082,-0.998132,-0.997797,-0.999001,-0.969352,-0.999874,-0.998941,-0.996819,-0.993883,-0.99876,...,-0.998536,-0.997547,-0.9983,-0.999236,-0.99839,-0.996136,-0.999685,-0.998359,-0.999816,6
3,-0.994951,-0.997927,-0.99737,-0.997877,-0.998082,-0.999558,-0.998771,-0.99854,-0.998907,-0.998439,...,-0.999677,-0.999714,-0.999706,-0.999849,-0.999847,-0.999832,-0.999821,-0.999791,-0.999842,3
4,-0.993016,-0.999218,-0.986346,-0.996381,-0.999863,-0.992586,-0.999354,-0.999902,-0.99912,-1.0,...,-1.0,-1.0,-1.0,-1.0,-0.99998,-1.0,-1.0,-1.0,-1.0,5


In [None]:
# Per-class row counts of the training frame, to inspect the class balance
train_df['Category Target'].value_counts()

1    6872
2    5277
3    5042
4    4749
5    2455
6    2178
7    1993
8    1051
Name: Category Target, dtype: int64

In [None]:
# Split train_df back into the label vector and the feature matrix used for fitting
y_train = train_df['Category Target']
X_train = train_df.drop(columns=['Category Target'])

In [None]:
# Sanity check: training rows changed, test rows and feature count must not
X_train.shape, X_test.shape

((29617, 40), (6457, 40))

# Metrics Functions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred, labels=None):
    """Print a per-class evaluation summary for hard class predictions.

    Prints, in order: the scikit-learn classification report, one "AUC-ROC"
    line and one "PR AUC" line per class (one-vs-rest), and the overall accuracy.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels (labels, not scores or probabilities).
    labels : iterable, optional
        Classes to report one-vs-rest scores for. Defaults to the sorted
        distinct values found in ``y_true``, so the function does not depend
        on any notebook-level DataFrame.

    Notes
    -----
    The one-vs-rest scores are computed from the boolean indicator
    ``y_pred == label`` rather than from continuous scores, so each value
    describes a single operating point, not a sweep over thresholds.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.unique(y_true).tolist()

    print("Classification Report:")
    # zero_division=0 reports 0.0 for classes that are never predicted, which is what
    # the default does as well, but without emitting a warning per affected metric.
    print(classification_report(y_true, y_pred, zero_division=0))

    # One-vs-rest AUC-ROC & Precision-Recall score for each class
    for label in labels:
        is_true = y_true == label
        is_pred = y_pred == label
        if is_true.all() or not is_true.any():
            # Both scores need positives and negatives in y_true; report instead of raising.
            print(f"AUC-ROC for class {label}: undefined (y_true lacks positives or negatives)")
            print(f"PR AUC for class {label}: undefined (y_true lacks positives or negatives)")
            continue
        print(f"AUC-ROC for class {label}: {roc_auc_score(is_true, is_pred)}")
        print(f"PR AUC for class {label}: {average_precision_score(is_true, is_pred)}")

    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [None]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):
    """Build a one-row summary of macro-averaged scores for one model.

    Note the argument order: predictions come BEFORE the ground truth here,
    which is the reverse of ``calculate_metrics(y_true, y_pred)``.

    Parameters
    ----------
    description : str
        Label stored in the 'Model' column.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    y_test : array-like of shape (n_samples,)
        Ground-truth class labels.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns 'Model', 'Precision', 'Recall',
        'F1-Score' (all macro averages) and 'Accuracy'.
    """
    # Named `report` so the local does not shadow the imported `sklearn.metrics` module.
    # zero_division=0 yields the same 0.0 as the default for never-predicted classes,
    # but without raising a warning for each of them.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    macro_avg = report['macro avg']

    return pd.DataFrame(
        {
            'Model': description,
            'Precision': macro_avg['precision'],
            'Recall': macro_avg['recall'],
            'F1-Score': macro_avg['f1-score'],
            'Accuracy': accuracy_score(y_test, y_pred),
        },
        index=[0],
    )


# Collects one summary row per evaluated model; re-running this cell empties it.
final_result = []

In [None]:
def generate_multilabel_confusion_df(y_true, y_pred, labels=None):
    """Return one 2x2 one-vs-rest confusion table per class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    labels : iterable, optional
        Classes to build a table for. Defaults to the sorted distinct labels
        found in ``y_true`` or ``y_pred``, so the function does not depend on
        any notebook-level DataFrame.

    Returns
    -------
    dict
        Maps each label to a DataFrame with rows 'Actual -' / 'Actual +' and
        columns 'Predicted -' / 'Predicted +'.
    """
    if labels is None:
        labels = np.union1d(y_true, y_pred).tolist()
    else:
        labels = list(labels)

    per_class = multilabel_confusion_matrix(y_true, y_pred, labels=labels)

    confusion_matrices = {}
    for label, matrix in zip(labels, per_class):
        # Each matrix is laid out as [[TN, FP], [FN, TP]]
        (tn, fp), (fn, tp) = matrix
        confusion_matrices[label] = pd.DataFrame(
            {'Predicted -': [tn, fn], 'Predicted +': [fp, tp]},
            index=['Actual -', 'Actual +'],
        )
    return confusion_matrices

# Models

## KNN

In [None]:
# k-nearest-neighbours classifier (k=5) fitted on the current X_train / y_train
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [None]:
# Predict the held-out test set with the fitted KNN model
y_pred_knn = knn.predict(X_test)
# Store the macro-averaged summary row for the final comparison table
# (re-running this cell appends a second, duplicate row)
metrics_knn = calculate_MacroAvg_metrics("KNN (WGANGP 20% Upsampled Minority Classes)",y_pred_knn, y_test)
final_result.append(metrics_knn)

# Print the detailed per-class report under a banner
print("-----------------------------KNN (WGANGP 20% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_knn)

-----------------------------KNN (WGANGP 20% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.91      0.89      1718
           2       0.95      0.99      0.97      1099
           3       0.86      0.89      0.87      1050
           4       0.81      0.73      0.77       990
           5       0.90      0.85      0.88       511
           6       1.00      0.99      1.00       454
           7       0.99      0.98      0.99       416
           8       0.96      0.96      0.96       219

    accuracy                           0.90      6457
   macro avg       0.92      0.91      0.92      6457
weighted avg       0.90      0.90      0.90      6457

AUC-ROC for class 1: 0.9318632868568125
PR AUC for class 1: 0.8214996222929064
AUC-ROC for class 2: 0.9878564652585524
PR AUC for class 2: 0.9393999503184541
AUC-ROC for class 3: 0.9288012893339321
PR AUC for class 3: 0.779

### Confusion_Matrix_KNN

In [None]:
# One-vs-rest confusion table for every class, printed in label order
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label in confusion_knn:
    print(f'Confusion Matrix for {label}:')
    print(confusion_knn[label])
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         4518          221
Actual +          154         1564

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5301           57
Actual +           15         1084

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5255          152
Actual +          120          930

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5303          164
Actual +          269          721

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5900           46
Actual +           75          436

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6038            3
Actual +            7          409

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6230          

## RF

In [None]:
# Train the model
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [None]:
# Predict the labels for the held-out test set with the fitted random forest
y_pred_rf = rf.predict(X_test)
# Store the macro-averaged summary row for the final comparison table
# (re-running this cell appends a second, duplicate row)
metrics_rf = calculate_MacroAvg_metrics("Random Forest (WGANGP 20% Upsampled Minority Classes)",y_pred_rf, y_test)
final_result.append(metrics_rf)
# Print the detailed per-class report under a banner
print("-----------------------------Random Forest (WGANGP 20% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (WGANGP 20% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.92      0.92      1718
           2       0.99      0.99      0.99      1099
           3       0.93      0.91      0.92      1050
           4       0.80      0.86      0.83       990
           5       0.98      0.89      0.93       511
           6       1.00      0.99      1.00       454
           7       1.00      0.99      1.00       416
           8       0.98      0.99      0.98       219

    accuracy                           0.93      6457
   macro avg       0.95      0.94      0.95      6457
weighted avg       0.93      0.93      0.93      6457

AUC-ROC for class 1: 0.9428177034446046
PR AUC for class 1: 0.8622452096853367
AUC-ROC for class 2: 0.9947038622440367
PR AUC for class 2: 0.9852232663655647
AUC-ROC for class 3: 0.9482101684764899
PR AUC for clas

### Confusion_Matrix_RF

In [None]:
# One-vs-rest confusion table for every class, printed in label order
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label in confusion_rf:
    print(f'Confusion Matrix for {label}:')
    print(confusion_rf[label])
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         4597          142
Actual +          145         1573

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5350            8
Actual +           10         1089

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5331           76
Actual +           94          956

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5257          210
Actual +          135          855

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5938            8
Actual +           58          453

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6040            1
Actual +            3          413

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6233          

## SVM

In [None]:
# RBF-kernel support vector classifier without class re-weighting.
# NOTE(review): the features are fed in unstandardised; the recorded run never predicts
# class 8, so scaling the inputs may be worth trying — confirm by experiment.
svm = SVC(kernel='rbf', gamma='scale', class_weight=None, random_state=0)
svm.fit(X_train, y_train)

In [None]:
# Predict the held-out test set with the fitted SVM
y_pred_svm = svm.predict(X_test)
# Store the macro-averaged summary row for the final comparison table
# (re-running this cell appends a second, duplicate row)
metrics_svm = calculate_MacroAvg_metrics("SVM (WGANGP 20% Upsampled Minority Classes)",y_pred_svm, y_test)
final_result.append(metrics_svm)
# Print the detailed per-class report under a banner
print("-----------------------------SVM (WGANGP 20% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (WGANGP 20% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.89      0.66      1718
           2       0.81      0.77      0.79      1099
           3       0.61      0.66      0.64      1050
           4       0.63      0.17      0.27       990
           5       0.70      0.64      0.67       511
           6       0.97      0.99      0.98       454
           7       0.97      0.46      0.62       416
           8       0.00      0.00      0.00       219

    accuracy                           0.65      6457
   macro avg       0.65      0.57      0.58      6457
weighted avg       0.66      0.65      0.62      6457

AUC-ROC for class 1: 0.8018424383800632
PR AUC for class 1: 0.5015855014243592
AUC-ROC for class 2: 0.868506474208288
PR AUC for class 2: 0.6653951827335881
AUC-ROC for class 3: 0.7902782107849613
PR AUC for class 3: 0.4591

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_SVM

In [None]:
# One-vs-rest confusion table for every class, printed in label order
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label in confusion_svm:
    print(f'Confusion Matrix for {label}:')
    print(confusion_svm[label])
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3396         1343
Actual +          194         1524

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5158          200
Actual +          248          851

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         4962          445
Actual +          354          696

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5368           99
Actual +          818          172

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5809          137
Actual +          184          327

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         5987           16
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6035            6
Actual +          226          190

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6238          

## MLP

In [None]:
# Gradient-based MLP training is sensitive to feature scale. The features here sit in a
# very narrow band near -1, and the recorded unscaled run predicted class 1 for every
# test sample. Standardise with statistics learned from the training data only (no
# test-set leakage) and apply the same transform to the test set.
# NOTE(review): expected to remove the degenerate result — re-run to confirm.
mlp_scaler = StandardScaler().fit(X_train)

# Three hidden layers of 200 units; early stopping ends training after 5 epochs
# without improvement on the internal validation split.
model = MLPClassifier(hidden_layer_sizes=(200, 200, 200), early_stopping=True,
                      n_iter_no_change=5, random_state=0)

model.fit(mlp_scaler.transform(X_train), y_train)
y_pred_mlp = model.predict(mlp_scaler.transform(X_test))

# Store the macro-averaged summary row (the label no longer carries a stray second ')')
metrics_mlp = calculate_MacroAvg_metrics("MLP (WGANGP 20% Upsampled Minority Classes)", y_pred_mlp, y_test)
final_result.append(metrics_mlp)

# Print the detailed per-class report under a banner
print("-----------------------------MLP (WGANGP 20% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_mlp)

-----------------------------MLP (WGANGP 20% Upsampled Minority Classes))-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.27      1.00      0.42      1718
           2       0.00      0.00      0.00      1099
           3       0.00      0.00      0.00      1050
           4       0.00      0.00      0.00       990
           5       0.00      0.00      0.00       511
           6       0.00      0.00      0.00       454
           7       0.00      0.00      0.00       416
           8       0.00      0.00      0.00       219

    accuracy                           0.27      6457
   macro avg       0.03      0.12      0.05      6457
weighted avg       0.07      0.27      0.11      6457

AUC-ROC for class 1: 0.5
PR AUC for class 1: 0.26606783335914513
AUC-ROC for class 2: 0.5
PR AUC for class 2: 0.17020288059470343
AUC-ROC for class 3: 0.5
PR AUC for class 3: 0.16261421712869753
AUC-ROC for class 4: 0.5
PR

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_MLP

In [None]:
# One-vs-rest confusion table for every class, printed in label order
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label in confusion_mlp:
    print(f'Confusion Matrix for {label}:')
    print(confusion_mlp[label])
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -            0         4739
Actual +            0         1718

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5358            0
Actual +         1099            0

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5407            0
Actual +         1050            0

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5467            0
Actual +          990            0

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5946            0
Actual +          511            0

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +          454            0

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6041            0
Actual +          416            0

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6238          

# Macro Avg Results

In [None]:
# Stack the per-model summary rows into one comparison table, in the order they were appended
df_concat = pd.concat(final_result, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (WGANGP 20% Upsampled Minority Classes),0.92015,0.912997,0.916065,0.899179
1,Random Forest (WGANGP 20% Upsampled Minority C...,0.949599,0.94302,0.945845,0.930308
2,SVM (WGANGP 20% Upsampled Minority Classes),0.653227,0.573507,0.579448,0.65216
3,MLP (WGANGP 20% Upsampled Minority Classes)),0.033258,0.125,0.052538,0.266068
