# General File Classifier

A 1D convolutional neural network (CNN) feeding a stack of dense layers that predicts a file's label from the first 512 bytes of the file.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from ml_utils import grab_labels, feature_from_file, translate_bytes
import csv

# Encoder that maps the string labels to integer class ids; it is fitted below
# and reused by later cells.
le = LabelEncoder()
naivetruth_path = "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/automated_training_results/balanced_cdiac_subset.csv"

# Load the label and path columns, then drop the header entry from each.
labels, file_paths = grab_labels(naivetruth_path)
del labels[0]
del file_paths[0]

# One feature per file, in the same order as file_paths.
x = [feature_from_file(file_path) for file_path in file_paths]

# NOTE(review): translate_bytes presumably yields byte values in 0..255, so the
# division scales the inputs to [0, 1] -- confirm against ml_utils.
x = translate_bytes(x) / 255

# One-hot encode the integer class ids into 6 columns (must match the width of
# the model's softmax output layer).
y = to_categorical(le.fit_transform(labels), 6)

# Default split proportions; no random_state is given, so the split differs
# from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y)

print(x_train)

In [None]:
from collections import Counter

# Show the class order chosen by the fitted encoder.
print(le.classes_)

# Tally every label in a single pass instead of rescanning the whole list
# twice per class with labels.count().
label_counts = Counter(labels)
total_labels = len(labels)

# One line per distinct label: its percentage share and its absolute count.
# (Lines appear in first-seen order; the original set() order was arbitrary.)
for unique, count in label_counts.items():
    print("{} is {} and there are {} files".format(unique, ((count / total_labels) * 100), count))

### Model

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, Dense, MaxPooling1D, GlobalMaxPooling1D, Reshape, Flatten


# Each sample is a flat vector; the Reshape layer appends a channel axis of
# size 1 so the 1D convolutions can consume it.
input_length = len(x[0])

# Convolutional feature extractor: two 50-filter, kernel-size-32 convolutions
# with a pooling step between them, collapsed by a global max pool.
classifier_model = Sequential([
    Reshape((input_length, 1), input_shape=(input_length,)),
    Conv1D(50, 32, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(50, 32, activation='relu'),
    GlobalMaxPooling1D(),
])

# Dense head: progressively narrower ReLU layers ...
for units in (50, 32, 28, 24, 16, 8):
    classifier_model.add(Dense(units, activation='relu'))

# ... ending in a 6-way softmax, one output per class.
classifier_model.add(Dense(6, activation='softmax'))

classifier_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
classifier_model.summary()

# Train for 20 epochs, evaluating on the held-out split after each epoch.
history = classifier_model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=16,
    shuffle=True,
    validation_data=(x_test, y_test),
)

In [None]:
%%capture --no-display 

import matplotlib.pyplot as plt

# Keras < 2.3 stores the 'accuracy' metric under 'acc' / 'val_acc', while
# newer releases store it under 'accuracy' / 'val_accuracy'. Use whichever key
# is present so this cell does not raise KeyError on either version.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
val_acc_key = 'val_' + acc_key

# set up figure: two side-by-side panels (accuracy on the left, loss on the right)
f = plt.figure(figsize=(12,6))
f.add_subplot(1,2, 1)

# plot accuracy as a function of epoch
plt.plot(history.history[acc_key])
plt.plot(history.history[val_acc_key])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['training', 'validation'], loc='best')

# plot loss as a function of epoch
f.add_subplot(1,2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['training', 'validation'], loc='best')
plt.show(block=True)

## Model Validation

In [None]:
import csv
from ml_utils import grab_labels, feature_from_file, translate_bytes, features_from_list

# Separate labelled data set used only to validate the trained model.
validation_set = "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/automated_training_results/nist_subset.csv"

# Load the label and path columns, then drop the header entry from each.
validation_labels, validation_paths = grab_labels(validation_set)
del validation_labels[0]
del validation_paths[0]

# Build the features for every validation file and apply the same
# translate_bytes / 255 preprocessing used for the training data.
validation_features = features_from_list(validation_paths)
validation_encoded = translate_bytes(validation_features) / 255

# Encode with the already-fitted encoder so class ids match the training run,
# then one-hot encode into the same 6 columns as the training targets.
validation_class_ids = le.transform(validation_labels)
validation_labels = to_categorical(validation_class_ids, 6)

validation_predictions = classifier_model.predict(validation_encoded, verbose=True)

In [None]:
from ml_utils import convert_to_index, plot_confusion_matrix
from pycm import ConfusionMatrix
import numpy as np

# Convert the one-hot validation targets and the model's predictions with the
# same helper.
# NOTE(review): convert_to_index presumably reduces each row to a class index
# -- confirm against ml_utils.
y_test_ind = convert_to_index(validation_labels)
y_pred_test_ind = convert_to_index(validation_predictions)

# Confusion matrix of actual vs. predicted classes.
cm_test = ConfusionMatrix(actual_vector=y_test_ind, predict_vector=y_pred_test_ind)
np.set_printoptions(precision=2)

# Draw the matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(cm_test, title='confusion matrix')

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Fraction of validation files whose predicted class matches the true class.
validation_accuracy = accuracy_score(y_test_ind, y_pred_test_ind)

# Micro-averaged recall pools true positives and false negatives over all classes.
# NOTE(review): if every sample carries exactly one class index, this value
# equals the accuracy above; average='macro' would give a per-class view.
validation_recall = recall_score(y_test_ind, y_pred_test_ind, average='micro')

print(validation_accuracy)
# The recall was previously computed but never reported.
print(validation_recall)