In [1]:
# load data from Google MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf


from collections import Counter
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score,f1_score, auc, precision_recall_curve, accuracy_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,average_precision_score

# Load Real Data

In [3]:
real_df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/Rescaled_DATASET.csv')
# Delete rows where 'Category' is equal to 'Unknown'
real_df = real_df[(real_df['Category'] != 'Unknown')]
# Delete rows where 'column1' is equal to 'value1' and 'column2' is equal to 'value2'
real_df = real_df[(real_df['Category'] != 'Trojan Monitoring Software') & (real_df['Category'] != 'Backdoor') & (real_df['Category'] != 'Potentially Unwanted Software')]
real_df = real_df.drop(columns=["file_name","name", "Category", "Family Target"])
real_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.995171,-0.995198,-0.988461,-0.999296,-0.998518,-0.993056,-0.999842,-0.999793,-0.999889,-0.999902,...,-0.999996,-0.99992,-0.999967,-0.999951,-0.999805,-0.999934,-0.999635,-0.999864,-0.999974,2
1,-0.996971,-0.9993,-0.993004,-0.997937,-0.995422,-0.99975,-0.99537,-0.994865,-0.998209,-0.999882,...,-0.999988,-0.999298,-1.0,-0.999255,-0.999344,-0.999281,-0.999322,-0.999387,-0.999373,3
4,-0.987096,-0.995503,-0.994812,-0.996726,-0.997999,-0.995715,-0.998633,-0.999423,-0.999211,-0.999774,...,-0.999993,-0.999714,-0.999994,-0.999868,-0.999842,-0.999679,-0.999824,-0.999905,-0.999935,1
8,-0.99189,-0.998705,-0.989185,-0.998874,-0.999152,-0.994189,-0.999159,-0.999824,-0.997573,-0.999702,...,-1.0,-1.0,-1.0,-1.0,-0.99979,-1.0,-0.999993,-1.0,-0.999946,1
10,-0.995647,-0.994348,-0.997736,-0.998155,-0.998579,-0.999525,-0.997752,-0.99878,-0.998818,-0.998609,...,-0.999736,-0.999845,-0.99971,-0.999824,-0.999777,-0.999829,-0.999805,-0.999807,-0.999887,3


In [4]:
features = real_df.drop(columns=["Category Target"]).astype(float)
# Prepare the target
targets = real_df["Category Target"].astype(int)

# Split Train/Test Data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    stratify=targets,
    test_size=0.2,
    random_state=0)


X_train.shape, X_test.shape

((25828, 40), (6457, 40))

# Load Fake Data

In [6]:
fake_df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/Fake_samples_10_percent_minority.csv')
fake_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.999882,-0.981843,-0.997741,-0.999613,-0.997631,-0.999455,-0.999288,-0.996132,-0.999357,-0.998638,...,-0.999447,-0.999385,-0.995763,-0.99967,-0.997669,-0.999769,-0.999753,-0.999848,-0.9994,2
1,-0.987571,-0.996491,-0.998418,-0.99663,-0.999618,-0.995487,-0.998958,-0.998004,-0.998656,-0.999352,...,-0.999143,-0.999815,-0.997895,-0.999396,-0.999807,-0.999662,-0.998541,-0.999906,-0.99929,2
2,-0.999789,-0.99945,-0.999426,-0.999662,-0.995099,-0.999394,-0.999584,-0.995672,-0.998494,-0.99983,...,-0.999384,-0.998731,-0.999208,-0.998502,-0.999055,-0.999798,-0.999921,-0.999825,-0.999841,2
3,-0.99654,-0.998391,-0.977303,-0.998372,-0.99918,-0.990419,-0.996191,-0.99511,-0.998054,-0.998477,...,-0.999676,-0.998699,-0.995243,-0.948564,-0.995498,-0.998104,-0.999631,-0.997666,-0.999828,2
4,-0.998966,-0.999283,-0.999043,-0.998683,-0.999295,-0.992937,-0.999695,-0.996351,-0.996924,-0.999772,...,-0.999934,-0.999752,-0.998445,-0.999757,-0.99982,-0.999914,-0.999904,-0.99989,-0.999941,2


# Make Trainig Data Frame

In [7]:
train_df = pd.concat([X_train, y_train], axis=1)
# Rename the column containing labels to 'label'
train_df.rename(columns={0: 'Category Target'}, inplace=True)

In [8]:
train_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
62050,-0.994659,-0.99483,-0.996598,-0.998392,-0.999368,-0.997515,-0.999309,-0.99964,-0.998942,-0.999823,...,-0.999974,-0.999725,-0.999993,-0.999859,-0.999777,-0.999682,-0.999768,-0.999859,-0.999951,3
22262,-0.995171,-0.995191,-0.988454,-0.999288,-0.998535,-0.993056,-0.999845,-0.999785,-0.999889,-0.999904,...,-0.999996,-0.999921,-0.999967,-0.999944,-0.999803,-0.999931,-0.999641,-0.999864,-0.999976,2
18218,-0.995295,-0.994733,-0.988697,-0.999358,-0.998475,-0.992971,-0.999822,-0.999775,-0.99988,-0.999824,...,-0.999984,-0.999891,-0.999988,-0.999919,-0.999925,-0.999922,-0.999735,-0.999957,-0.99997,2
47673,-0.99189,-0.998705,-0.989185,-0.998874,-0.999152,-0.994189,-0.999159,-0.999824,-0.997573,-0.999702,...,-1.0,-1.0,-1.0,-1.0,-0.99979,-1.0,-0.999993,-1.0,-0.999946,1
56956,-0.997221,-0.98274,-0.997831,-0.998653,-0.997831,-0.999678,-0.998848,-0.998407,-0.996119,-0.999449,...,-0.999915,-0.999864,-0.999966,-0.999475,-0.999856,-0.999822,-0.999881,-0.999424,-0.999856,4


In [9]:
train_df['Category Target'].size

25828

# Combine fake and real samples for Training

In [10]:
# Append new_data to train_df
train_df = pd.concat([train_df, fake_df], ignore_index=True)
# Shuffle the rows while keeping the column order constant
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)

In [11]:
train_df.head()

Unnamed: 0,mov,add,push,pop,inc,call,xor,dec,cmp,or,...,scas,outsl,lret,insb,jae,outsb,popa,jo,ja,Category Target
0,-0.993942,-0.993203,-0.992749,-0.998888,-0.998472,-0.996348,-0.999529,-0.999433,-0.999028,-0.999846,...,-0.999951,-0.999488,-0.999919,-0.999473,-0.999581,-0.999235,-0.999465,-0.999473,-0.999945,1
1,-0.999105,-0.97899,-0.998247,-0.998622,-0.99786,-0.999731,-0.999265,-0.999262,-0.998957,-0.997152,...,-0.999876,-0.999758,-0.99997,-0.99945,-0.99984,-0.999834,-0.999867,-0.99977,-0.999946,5
2,-0.998345,-0.982996,-0.99765,-0.998563,-0.9984,-0.999385,-0.999436,-0.999266,-0.999424,-0.998817,...,-0.999948,-0.999813,-0.999877,-0.99917,-0.99973,-0.999865,-0.999849,-0.999722,-0.999917,7
3,-0.993008,-0.999217,-0.986369,-0.996377,-0.999863,-0.992597,-0.999354,-0.999902,-0.999119,-1.0,...,-1.0,-1.0,-1.0,-1.0,-0.99998,-1.0,-1.0,-1.0,-1.0,5
4,-0.986498,-0.995816,-0.994822,-0.996687,-0.998441,-0.995774,-0.998719,-0.999356,-0.999026,-0.999727,...,-0.999992,-0.999788,-0.999996,-0.999863,-0.999851,-0.999742,-0.999828,-0.999903,-0.99994,1


In [12]:
train_df['Category Target'].value_counts()

1    6872
2    4837
3    4622
4    4353
5    2250
6    1996
7    1827
8     963
Name: Category Target, dtype: int64

In [13]:
# Separate the features (X_train) and labels (y_train) again
X_train = train_df.drop('Category Target', axis=1)
y_train = train_df['Category Target']

In [14]:
X_train.shape, X_test.shape

((27720, 40), (6457, 40))

# Metrics Functions

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def calculate_metrics(y_true, y_pred):
    # Calculate metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Calculate AUC-ROC & Precision-Recall for each class
    for i in range(1, (real_df['Category Target'].nunique() + 1)):
        print(f"AUC-ROC for class {i}: {roc_auc_score(y_true == i, y_pred == i)}")
        print(f"PR AUC for class {i}: {average_precision_score(y_true == i, y_pred == i)}")

    # Calculate Accuracy
    print("Accuracy:")
    print(accuracy_score(y_true, y_pred))

In [16]:
def calculate_MacroAvg_metrics(description, y_pred, y_test):

    # Calculate each metric
    metrics = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the metrics in a DataFrame
    metrics_df = pd.DataFrame({
        'Model': description,
        'Precision': metrics['macro avg']['precision'],
        'Recall': metrics['macro avg']['recall'],
        'F1-Score': metrics['macro avg']['f1-score'],
        'Accuracy': accuracy
    },index=[0])

    return metrics_df
final_result = []

In [17]:
def generate_multilabel_confusion_df(y_true, y_pred):
  labels = range(1, (real_df['Category Target'].nunique() + 1))
  cms = multilabel_confusion_matrix(y_true, y_pred, labels=labels)
  confusion_matrices = {}
  for cm, label in zip(cms, labels):
      confusion_df = pd.DataFrame({
          'Predicted -': [cm[0][0], cm[1][0]],
          'Predicted +': [cm[0][1], cm[1][1]]
      }, index=['Actual -', 'Actual +'])
      confusion_matrices[label] = confusion_df
  return confusion_matrices

# Models

## KNN

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [19]:
y_pred_knn = knn.predict(X_test)
metrics_knn = calculate_MacroAvg_metrics("KNN (WGANGP 10% Upsampled Minority Classes)",y_pred_knn, y_test)
final_result.append(metrics_knn)

print("-----------------------------KNN (WGANGP 10% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_knn)

-----------------------------KNN (WGANGP 10% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.91      0.89      1718
           2       0.95      0.99      0.97      1099
           3       0.86      0.89      0.87      1050
           4       0.81      0.73      0.77       990
           5       0.91      0.85      0.88       511
           6       1.00      0.99      1.00       454
           7       0.99      0.98      0.99       416
           8       0.96      0.96      0.96       219

    accuracy                           0.90      6457
   macro avg       0.92      0.91      0.92      6457
weighted avg       0.90      0.90      0.90      6457

AUC-ROC for class 1: 0.9318632868568125
PR AUC for class 1: 0.8214996222929064
AUC-ROC for class 2: 0.9877631468561634
PR AUC for class 2: 0.9385793926213668
AUC-ROC for class 3: 0.9288937620544796
PR AUC for class 3: 0.780

### Confusion_Matrix_KNN

In [20]:
confusion_knn = generate_multilabel_confusion_df(y_test, y_pred_knn)
for label, frame in confusion_knn.items():
    print(f'Confusion Matrix for {label}:')
    print(frame)
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         4518          221
Actual +          154         1564

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5300           58
Actual +           15         1084

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5256          151
Actual +          120          930

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5303          164
Actual +          268          722

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5901           45
Actual +           75          436

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6038            3
Actual +            7          409

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6230          

## RF

In [21]:
# Train the model
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [22]:
# Predict the labels for the test set
y_pred_rf = rf.predict(X_test)
metrics_rf = calculate_MacroAvg_metrics("Random Forest (WGANGP 10% Upsampled Minority Classes)",y_pred_rf, y_test)
final_result.append(metrics_rf)
print("-----------------------------Random Forest (WGANGP 10% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_rf)

-----------------------------Random Forest (WGANGP 10% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.91      0.92      0.92      1718
           2       0.99      0.99      0.99      1099
           3       0.93      0.91      0.92      1050
           4       0.81      0.87      0.84       990
           5       0.99      0.89      0.93       511
           6       1.00      0.99      1.00       454
           7       1.00      0.99      1.00       416
           8       0.98      0.99      0.98       219

    accuracy                           0.93      6457
   macro avg       0.95      0.94      0.95      6457
weighted avg       0.93      0.93      0.93      6457

AUC-ROC for class 1: 0.9432978300830721
PR AUC for class 1: 0.8589454995624833
AUC-ROC for class 2: 0.9938872625390555
PR AUC for class 2: 0.9846107591714982
AUC-ROC for class 3: 0.9487788316732279
PR AUC for clas

### Confusion_Matrix_RF

In [23]:
confusion_rf = generate_multilabel_confusion_df(y_test, y_pred_rf)
for label, frame in confusion_rf.items():
    print(f'Confusion Matrix for {label}:')
    print(frame)
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         4585          154
Actual +          139         1579

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5351            7
Actual +           12         1087

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5332           75
Actual +           93          957

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5272          195
Actual +          133          857

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5940            6
Actual +           58          453

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6040            1
Actual +            3          413

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6233          

## SVM

In [24]:
svm = SVC(kernel='rbf', random_state=0,class_weight= None, gamma= 'scale')
svm.fit(X_train,y_train)

In [25]:
y_pred_svm = svm.predict(X_test)
metrics_svm = calculate_MacroAvg_metrics("SVM (WGANGP 10% Upsampled Minority Classes)",y_pred_svm, y_test)
final_result.append(metrics_svm)
print("-----------------------------SVM (WGANGP 10% Upsampled Minority Classes)-----------------------------\n")
calculate_metrics(y_test, y_pred_svm)

-----------------------------SVM (WGANGP 10% Upsampled Minority Classes)-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.81      0.68      1718
           2       0.83      0.90      0.86      1099
           3       0.69      0.62      0.65      1050
           4       0.50      0.41      0.45       990
           5       0.75      0.62      0.68       511
           6       0.99      0.99      0.99       454
           7       0.87      0.46      0.60       416
           8       0.00      0.00      0.00       219

    accuracy                           0.68      6457
   macro avg       0.65      0.60      0.61      6457
weighted avg       0.67      0.68      0.66      6457

AUC-ROC for class 1: 0.7981886120200914
PR AUC for class 1: 0.5197807587817252
AUC-ROC for class 2: 0.9312675577003221
PR AUC for class 2: 0.7607837230243965
AUC-ROC for class 3: 0.783765401111434
PR AUC for class 3: 0.4903

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_SVM

In [26]:
confusion_svm = generate_multilabel_confusion_df(y_test, y_pred_svm)
for label, frame in confusion_svm.items():
    print(f'Confusion Matrix for {label}:')
    print(frame)
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -         3731         1008
Actual +          328         1390

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5148          210
Actual +          108          991

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5113          294
Actual +          397          653

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5057          410
Actual +          586          404

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5839          107
Actual +          195          316

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         5998            5
Actual +            3          451

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6013           28
Actual +          226          190

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6238          

## MLP

In [27]:
model = MLPClassifier(hidden_layer_sizes=(200,200, 200),early_stopping=True,random_state=0, n_iter_no_change= 5)

model.fit(X_train, y_train)
y_pred_mlp = model.predict(X_test)

metrics_mlp = calculate_MacroAvg_metrics("MLP (WGANGP 10% Upsampled Minority Classes))",y_pred_mlp, y_test)
final_result.append(metrics_mlp)

print("-----------------------------MLP (WGANGP 10% Upsampled Minority Classes))-----------------------------\n")
calculate_metrics(y_test, y_pred_mlp)

-----------------------------MLP (WGANGP 10% Upsampled Minority Classes))-----------------------------

Classification Report:
              precision    recall  f1-score   support

           1       0.27      1.00      0.42      1718
           2       0.00      0.00      0.00      1099
           3       0.00      0.00      0.00      1050
           4       0.00      0.00      0.00       990
           5       0.00      0.00      0.00       511
           6       0.00      0.00      0.00       454
           7       0.00      0.00      0.00       416
           8       0.00      0.00      0.00       219

    accuracy                           0.27      6457
   macro avg       0.03      0.12      0.05      6457
weighted avg       0.07      0.27      0.11      6457

AUC-ROC for class 1: 0.5
PR AUC for class 1: 0.26606783335914513
AUC-ROC for class 2: 0.5
PR AUC for class 2: 0.17020288059470343
AUC-ROC for class 3: 0.5
PR AUC for class 3: 0.16261421712869753
AUC-ROC for class 4: 0.5
PR

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Confusion_Matrix_MLP

In [28]:
confusion_mlp = generate_multilabel_confusion_df(y_test, y_pred_mlp)
for label, frame in confusion_mlp.items():
    print(f'Confusion Matrix for {label}:')
    print(frame)
    print()

Confusion Matrix for 1:
          Predicted -  Predicted +
Actual -            0         4739
Actual +            0         1718

Confusion Matrix for 2:
          Predicted -  Predicted +
Actual -         5358            0
Actual +         1099            0

Confusion Matrix for 3:
          Predicted -  Predicted +
Actual -         5407            0
Actual +         1050            0

Confusion Matrix for 4:
          Predicted -  Predicted +
Actual -         5467            0
Actual +          990            0

Confusion Matrix for 5:
          Predicted -  Predicted +
Actual -         5946            0
Actual +          511            0

Confusion Matrix for 6:
          Predicted -  Predicted +
Actual -         6003            0
Actual +          454            0

Confusion Matrix for 7:
          Predicted -  Predicted +
Actual -         6041            0
Actual +          416            0

Confusion Matrix for 8:
          Predicted -  Predicted +
Actual -         6238          

# Macro Avg Results

In [29]:
df_concat = pd.concat(final_result, axis=0, ignore_index=True)
df_concat

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,KNN (WGANGP 10% Upsampled Minority Classes),0.920406,0.913123,0.916255,0.899334
1,Random Forest (WGANGP 10% Upsampled Minority C...,0.951087,0.9436,0.946891,0.931392
2,SVM (WGANGP 10% Upsampled Minority Classes),0.649787,0.601164,0.613283,0.680657
3,MLP (WGANGP 10% Upsampled Minority Classes)),0.033258,0.125,0.052538,0.266068
