I recreated https://doi.org/10.1007/s11416-022-00424-3 with a smaller data set to test out the methods for my research. Specifically, I combined Word2Vec with a Convolutional Neural Network, a Random Forest Classifier, a Support Vector Machine, and k-Nearest Neighbors.

For efficiency, I chose to recreate only the models that use Word2Vec. I did not recreate the combination of Word2Vec and HMM2Vec here: no library exists for HMM2Vec, so I decided to dedicate a separate notebook to implementing it.

In [None]:
# Mount Google Drive at /content/drive; the CSV loaded later in this notebook
# is read from a path underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import sklearn.metrics as skm
from sklearn.utils import shuffle
import tensorflow as tf

from gensim.models import Word2Vec

import keras.layers
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Import preprocessed data

In [None]:
# Path of the preprocessed CSV on the mounted Drive.
csv_path = '/content/drive/MyDrive/23-24/summer 24/malware_2vec.csv'
df = pd.read_csv(csv_path)

In [None]:
# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# produces the same row order every time. The seed matches the random_state
# already used by the train/test splits in this notebook.
df = shuffle(df, random_state=7)

# Tests combining Word2Vec w/ KNN, SVM, RFC, and CNN

In [None]:
# Length of every Word2Vec vector. It is now passed to Word2Vec explicitly;
# previously the constant (150) was never handed to the model, so vectors were
# trained at gensim's default size of 100 while the zero-vector fallback for
# samples without known tokens was built with 150 entries. 100 keeps the
# trained vectors as they were and agrees with the 100-column embedding
# matrix built for the CNN.
DIMENSIONS = 100

# Whitespace-tokenise every sample; str() guards against non-string cells.
texts = [str(text).split() for text in df['text']]

# NOTE(review): max_vocab_size prunes infrequent tokens while the vocabulary is
# being scanned. If the intent is "keep the 31 most frequent tokens",
# max_final_vocab may be the argument that was meant -- confirm.
w2v = Word2Vec(texts, vector_size=DIMENSIONS, max_vocab_size=31)

In [None]:
# Token -> vector lookup of the trained model; the cells below index it by token
# and test token membership with `in`.
keyed_vecs = w2v.wv

In [None]:
# Represent every sample by the mean of the Word2Vec vectors of its tokens.
X_w2v = []
for sample in df['text']:
  # Keep only the tokens that have a vector in the trained model.
  token_vecs = [keyed_vecs[token] for token in str(sample).split() if token in keyed_vecs]
  if token_vecs:
    X_w2v.append(np.mean(token_vecs, axis=0))     # add averaged vector to X
  else:
    # No token of this sample is in the vocabulary: add a neutral zero vector.
    # Its length comes from the model itself so it always matches the averaged
    # vectors (a differently sized fallback would make X ragged).
    X_w2v.append(np.zeros(keyed_vecs.vector_size))

# split data set
X_trainw2v,X_testw2v,y_trainw2v,y_testw2v=train_test_split(X_w2v, df['label'], test_size=0.2, random_state=7)

## knn

In [None]:
# k-nearest-neighbours with default settings, trained on the averaged vectors.
w2v_knn = KNeighborsClassifier().fit(X_trainw2v, y_trainw2v)
# Predicted labels for the held-out samples.
y_pred_w2v_knn = w2v_knn.predict(X_testw2v)

## svm

In [None]:
# Support vector classifier with default settings, trained on the averaged vectors.
w2v_svm = SVC().fit(X_trainw2v, y_trainw2v)
# Predicted labels for the held-out samples.
y_pred_w2v_svm = w2v_svm.predict(X_testw2v)

## rfc

In [None]:
# Random forest on the averaged vectors. The forest is randomised (bootstrap
# samples and feature sub-sampling), so it is seeded to make the reported
# numbers reproducible; the seed matches the one used for the data splits.
w2v_rfc = RandomForestClassifier(random_state=7)
w2v_rfc.fit(X_trainw2v, y_trainw2v)
y_pred_w2v_rfc = w2v_rfc.predict(X_testw2v)

## cnn

In [None]:
# tokenize
# Word2Vec was trained on str(text).split() tokens, so the Keras tokenizer is
# configured to split on whitespace only (no lower-casing, no punctuation
# stripping). With the default settings a token containing upper-case letters
# or punctuation would be altered, would not be found in keyed_vecs, and its
# embedding row would silently stay all-zero.
# astype(str) mirrors the str() guard used when the Word2Vec corpus was built.
cnn_texts = df['text'].astype(str)
tokenizer = Tokenizer(filters='\t\n\r', lower=False)
tokenizer.fit_on_texts(cnn_texts)
sequences = tokenizer.texts_to_sequences(cnn_texts)
word_index = tokenizer.word_index

# Pad all integer sequences to the length of the longest one.
data = pad_sequences(sequences)

# make word2vec embeddings
# Row i holds the Word2Vec vector of the token with tokenizer index i; row 0
# (the padding index) and rows of tokens without a vector stay zero.
# The width is read from the model instead of being hard-coded.
embedding_dim = keyed_vecs.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in keyed_vecs:
        embedding_matrix[i] = keyed_vecs[word]

In [None]:
# Hold out 20% of the padded sequences; the remaining 80% train the CNN.
(X_train_w2vcnn, X_test_w2vcnn,
 y_train_w2vcnn, y_test_w2vcnn) = train_test_split(
    data,
    df['label'],
    test_size=0.2,
    random_state=7,
)

In [None]:
# Cut the held-out portion in half: one half for validation during training,
# the other half kept as the final CNN test set.
(X_valid_w2vcnn, X_test_w2vcnn,
 y_valid_w2vcnn, y_test_w2vcnn) = train_test_split(
    X_test_w2vcnn,
    y_test_w2vcnn,
    test_size=0.5,
    random_state=7,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the training, validation and test labels.
label_encoder = LabelEncoder().fit(y_train_w2vcnn)
y_train_w2vcnn, y_valid_w2vcnn, y_test_w2vcnn = [
    label_encoder.transform(labels)
    for labels in (y_train_w2vcnn, y_valid_w2vcnn, y_test_w2vcnn)
]

In [None]:
# NOTE(review): `weights` is not used anywhere in this cell (the embedding layer
# is initialised from embedding_matrix). It is kept only so the notebook
# namespace is unchanged -- consider deleting it.
weights = keyed_vecs.vectors

# One softmax unit per class known to the label encoder, instead of a
# hard-coded 3, so the model follows the data set.
num_classes = len(label_encoder.classes_)

w2v_cnn = tf.keras.Sequential([
    # Embedding rows start as the Word2Vec vectors collected in embedding_matrix.
    # The Constant initializer is the documented way to pre-load them; the
    # legacy `weights=` constructor argument is not accepted by every Keras version.
    tf.keras.layers.Embedding(
        input_dim = len(word_index) + 1,
        output_dim = embedding_dim,
        embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
        input_length = data.shape[1],
    ),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Conv1D(filters = 32, kernel_size = 3, activation = 'relu'),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(num_classes, activation = 'softmax')
])

# The labels are integer-encoded, hence the sparse categorical loss.
w2v_cnn.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# Upper bound on training epochs; early stopping normally ends training sooner.
NUM_EPOCHS = 100

# Stop once val_loss has not improved for 3 consecutive epochs. When that
# happens, restore_best_weights rolls the model back to the epoch with the
# lowest val_loss, so the evaluated model is the best one seen rather than the
# one from 3 epochs later.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 3,
                                                  restore_best_weights = True)

# validation_steps is intentionally not passed: validation_steps=10 capped
# validation at 10 batches per epoch, so val_loss (the early-stopping signal)
# was computed on only part of the validation split.
fitted = w2v_cnn.fit(X_train_w2vcnn, y_train_w2vcnn, epochs = NUM_EPOCHS,
                     validation_data=(X_valid_w2vcnn, y_valid_w2vcnn),
                     verbose=2, callbacks = [early_stopping])

In [None]:
# The CNN outputs one score per class; keep the index of the highest score for
# each test sample as its predicted (integer-encoded) label.
y_pred_w2v_cnn = np.argmax(w2v_cnn.predict(X_test_w2vcnn), axis=1)

# Evaluate

In [None]:
# Empty results table; evaluate() appends one row per model.
evaluation_df = pd.DataFrame(
    columns=['Model', 'Accuracy', 'Confusion Matrix'],
)

In [None]:
def evaluate(df, name, y_pred, y_test):
  """Append one model's test metrics to a results table.

  Parameters:
    df: results DataFrame with three columns (model name, accuracy,
      confusion matrix); it is modified in place.
    name: label stored in the first column of the new row.
    y_pred: predicted labels.
    y_test: true labels, aligned with y_pred.

  Returns:
    The same DataFrame object, with one row added at label len(df.index).
  """
  # Only accuracy and the confusion matrix are recorded; precision, recall and
  # F1 were sketched in an earlier version of this function but never enabled.
  new_row = [
      name,
      skm.accuracy_score(y_test, y_pred),
      str(skm.confusion_matrix(y_test, y_pred)),  # stored as its printed text
  ]
  df.loc[len(df.index)] = new_row
  return df

## w2v

In [None]:
# (row label, predictions, ground truth) for every Word2Vec-based model.
# The CNN has its own ground truth because it was split separately.
w2v_runs = [
    ("w2v_knn", y_pred_w2v_knn, y_testw2v),
    ("w2v_svm", y_pred_w2v_svm, y_testw2v),
    ("w2v_rfc", y_pred_w2v_rfc, y_testw2v),
    ("w2v_cnn", y_pred_w2v_cnn, y_test_w2vcnn),
]
for run_name, run_pred, run_truth in w2v_runs:
  evaluate(evaluation_df, run_name, run_pred, run_truth)

# Show the table as the cell output.
evaluation_df

Unnamed: 0,Model,Accuracy,Confusion Matrix
0,w2v_knn,0.995516,[[898 2 0]\n [ 3 415 0]\n [ 2 0 241]]
1,w2v_svm,0.980782,[[893 7 0]\n [ 15 397 6]\n [ 2 0 241]]
2,w2v_rfc,0.995516,[[898 2 0]\n [ 2 416 0]\n [ 1 2 240]]
3,w2v_cnn,0.99872,[[410 0 0]\n [ 1 228 0]\n [ 0 0 142]]


In [None]:
# Display the accumulated results table (one row per evaluate() call so far).
evaluation_df

Unnamed: 0,Model,Accuracy,Confusion Matrix
0,w2v_knn,0.993594,[[868 4 0]\n [ 4 416 0]\n [ 2 0 267]]
1,w2v_svm,0.983344,[[866 3 3]\n [ 13 402 5]\n [ 2 0 267]]
2,w2v_rfc,0.997438,[[872 0 0]\n [ 2 418 0]\n [ 1 1 267]]
