In [28]:
#Importing necessary packages
import numpy as np
import matplotlib.pyplot as plt
from keras import models
from keras.applications.vgg16 import VGG16
from keras.models import Sequential, Model
from keras.layers import Dense, Activation ,Flatten, Conv2D, Input
from sklearn.model_selection import train_test_split
import tensorflow as tf
from PIL import Image


In [1]:
#Importing google drive since colab disk does not have required space to load the datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Malimg Dataset**

Due to colab limits, images were resized to 112x112

In [None]:
#generating our dataset
from keras.preprocessing.image import ImageDataGenerator
batches = ImageDataGenerator().flow_from_directory(directory='/content/drive/MyDrive/malimg_paper_dataset_imgs',target_size=(112,112), batch_size=10000)

Found 9339 images belonging to 25 classes.


In [None]:
#Generating images with labels
imgs, labels = next(batches)

In [None]:
#Seeing shape of images and labels
imgs.shape, labels.shape

((9339, 112, 112, 3), (9339, 25))

In [None]:
%cd /content/drive/MyDrive/malimg_paper_dataset_imgs

/content/drive/MyDrive/malimg_paper_dataset_imgs


In [None]:
#Saving the np array for future use
np.save("imgs",imgs)

In [None]:
#Saving np array for future use
np.save("labels",labels)

In [9]:
#Code for loading saved variables
#imgs=np.load('/content/drive/MyDrive/Malimg/malimg_paper_dataset_imgs/imgs.npy')
#labels=np.load('/content/drive/MyDrive/Malimg/malimg_paper_dataset_imgs/labels.npy')

In [10]:
#Generating train and test data
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imgs/255.,labels, test_size=0.1)
X_train.shape, X_test.shape , y_train.shape, y_test.shape

((8405, 112, 112, 3), (934, 112, 112, 3), (8405, 25), (934, 25))

In [None]:
model_vgg16_conv = VGG16(weights='imagenet', include_top=False) #taking vgg-16 model and removing last 3 fc and softmax layers
 
#Creating our own input format
keras_input = Input(shape=(112,112,3), name = 'image_input') 
    
#Use the generated model 
output_vgg16_conv = model_vgg16_conv(keras_input)
    
#Add the fully-connected layers 
x = Flatten(name='flatten')(output_vgg16_conv)
x = Dense(4096, activation='relu', name='fc1')(x)
x = Dense(4096, activation='relu', name='fc2')(x)
x = Dense(25, activation='softmax', name='predictions')(x)
    
#final model creation 
pretrained_model = Model(inputs=keras_input, outputs=x)
pretrained_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 112, 112, 3)]     0         
_________________________________________________________________
vgg16 (Functional)           (None, None, None, 512)   14714688  
_________________________________________________________________
flatten (Flatten)            (None, 4608)              0         
_________________________________________________________________
fc1 (Dense)                  (None, 4096)              18878464  
_________________________________________________________________
fc2 (Dense)                  (None, 4096)              16781312  
_________________________________________________________________
predictions (Dense)          (None, 25)                102425    
Total params: 50,476,889
Trainable params: 50,476,889
Non-trainable params: 0
_______________________________________________

In [13]:
#defining learning rate scheduler as learning rate is reduced by factor of 10 after 20th epoch
def scheduler(epoch, lr):
  if epoch == 21:
    return lr/10
  else:
    return lr

In [14]:
#adding regularization to pretrained vgg-16 convolutional layers
import os
import tempfile

def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0005)):

    #error checking if regularizer object is not passed
    if not isinstance(regularizer, tf.keras.regularizers.Regularizer): 
      print("Regularizer must be a subclass of tf.keras.regularizers.Regularizer")
      return model
    #if layer has a regularizer attribute set it to required value
    for layer in model.layers:
        for attr in ['kernel_regularizer']:
            if hasattr(layer, attr):
              setattr(layer, attr, regularizer)

    # When we change the layers attributes, the change only happens in the model config file
    #Therefore to achieve the desired effect we need to save and reload the model
    model_json = model.to_json()

    #The problem with this is that the weights get lost when saving and reloading.
    # Save the weights before reloading the model.
    tmp_weights_path = os.path.join(tempfile.gettempdir(), 'tmp_weights.h5')
    model.save_weights(tmp_weights_path)

    # load the model from the config
    model = tf.keras.models.model_from_json(model_json)
    
    # Reload the model weights
    model.load_weights(tmp_weights_path, by_name=True)
    return model

In [None]:
#Adding regularization
pretrined_model=add_regularization(pretrained_model)


In [15]:
#Setting learning rate and momentum
opt = tf.keras.optimizers.SGD(learning_rate=0.001,momentum=0.9)

In [None]:
#Compiling the model
pretrined_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
#Training the model. All the variables are initialized as described in research paper
batch_size = 6
epochs  = 25
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
history = pretrined_model.fit(X_train, y_train,
                    epochs=epochs, callbacks=[callback], batch_size=batch_size)


Colab got stuck when ran the code second time for saving the model so trained the model on different colab ID and now loading it here.

In [30]:
pretrined_model=models.load_model('/content/drive/MyDrive/Malimg/')

In [31]:
#Testing on test set
print("starting evaluating")
pretrined_model.evaluate(X_test, y_test, verbose=0)

starting evaluating


[1.8986002206802368, 0.9957173466682434]

# **Microsoft Malware Classification Dataset**

In [None]:
#Reading the train labels file
import pandas as pd
df=pd.read_csv('/content/drive/MyDrive/malware-classification/trainLabels.csv')
df.head()

Unnamed: 0,Id,Class
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1


In [None]:
#Seeing number of training instances
df.shape

(10868, 2)

In [None]:
#Changing directory to Microsoft Malware Dataset
%cd /content/drive/MyDrive/malware-classification/train

/content/drive/MyDrive/malware-classification/train


In [None]:
#Changing dataframe to numpy array so that we can utilize the labels name and 
# save the images in label folders after creating them as we can see in the 
#next step
records = df.to_records(index=False)
print(records.shape)

(10868,)


## Process of creating images from .bytes file is as follows


1.   Trainlabels file contains name of every train file along with its label.
2.   We take a record from trainlabels and add .bytes so that it matches name of the file and then we open that file.
3.   After opening a file, we already know that every line of .bytes file contains some line offset at the start followed by 16 hexadecimal representation form of malware code. A number of these lines make complete malware code.
4.   We ignore the first offset part and convert hexadecimal representation to decimal representation and store all the decimal numbers in an array.
5.   Then we calculate file size as width of image depends on file size.
6.   Then we create a matrix with specified width and height as total decimal numbers divided by width.
7.   At last we convert the matrix to an image format and save it.





In [None]:
#importing os for calculating file size as we have to keep width of the image according to the
#file size
import os 
#directory where we have to keep malware images
dir='/content/drive/MyDrive/malware-classification/train/train_imgs/'
#Taking each tuple(name,label) from records
for t in records:                                                    
    FileToOpen=t[0]
    #Adding .bytes so that it matches name of the file      
    FileToOpen+='.bytes'
    #Opening .bytes file                                                   
    with open(FileToOpen) as f:
      #Declaring array which will store decimal values of 8 bits in a contiguous manner                                            
      array=[]
      #Taking each line in .bytes file                                                              
      for line in f:
        #Splitting it into offset part + 16 hexadecimal numbers                                                       
        xx=line.split() 
        #Checking for some errors if line is not of above format                                                   
        if len(xx)!=17:                                                    
          continue
        #xx[1:] contains only hexadecimal numbers, there were some ?? which 
        #were replaced by zeroes otherwise store corresponding decimal representation  
        array.append([int(i,16) if i!='??' else 0 for i in xx[1:] ])       
      #Converting it to a numpy array so that we can reshape it                                                                     
      array=np.array(array)
      #For calculating file size                                                
      f.seek(0, os.SEEK_END)
      #Converting file size to kilobytes                                               
      kbyt=f.tell()/1024
      #Code for initializing width of image based on file size                                                   
      if kbyt < 10:
        width=32
      elif kbyt < 30:
        width=64
      elif kbyt < 60:
        width=128
      elif kbyt < 100:
        width=256
      elif kbyt < 200:
        width=384
      elif kbyt < 500:
        width=512
      elif kbyt < 1000:
        width=768
      else:
        width=1024
      #Calculating height of image(total decimal numbers/width)  
      height=int((array.shape[0]*16)/width)
      #Creating matrix by taking only width*height numbers of decimal numbers
      #rest of the numbers are not taken as almost complete malware code is included
      #in this fashion and we reject few numbers at the end otherwise it will not form an
      #image and then we would have to pad it which would alter malware code
      Matrix=array[:width*height//16,:]
      #Reshaping it to (height x width)
      decMat = np.reshape(Matrix,(height,width))
      #Generating image from matrix
      im = Image.fromarray(np.uint8(decMat))
      #Saving the image in corresponding labels folder
      im.save(dir+str(t[1])+'/'+t[0]+'.png', "PNG")
#Prints end of pre-processing
print("Preprocessing is  done")

Preprocessing is  done


## Model Creation and Training Process

In [None]:
#Generating dataset
from keras.preprocessing.image import ImageDataGenerator
batches = ImageDataGenerator().flow_from_directory(directory='/content/drive/MyDrive/malware-classification/train/train_imgs',target_size=(112,112), batch_size=11000)

Found 10868 images belonging to 9 classes.


In [None]:
#Generating images with labels
imgs, labels = next(batches)

In [None]:
#Visualizing shape of images and labels
imgs.shape, labels.shape

((10868, 112, 112, 3), (10868, 9))

In [None]:
#Code for loading saved variables
#imgs=np.load('/content/drive/MyDrive/malware-classification/imgsmicro.npy')
#labels=np.load('/content/drive/MyDrive/malware-classification/labelsmicro.npy')

In [None]:
#Splitting into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imgs/255.,labels, test_size=0.1)
X_train.shape, X_test.shape , y_train.shape, y_test.shape

((9781, 112, 112, 3), (1087, 112, 112, 3), (9781, 9), (1087, 9))

In [None]:
#Imporing vgg model and removing last 3 fc and softmax layers
model_vgg16_conv2 = VGG16(weights='imagenet', include_top=False)
model_vgg16_conv2.summary()
    
#Creating our own input format
keras_input2 = Input(shape=(112,112,3), name = 'image_input2')
    
#Using the generated model 
output_vgg16_conv2 = model_vgg16_conv2(keras_input2)
    
#Add the fully-connected layers 
x2 = Flatten(name='flatten2')(output_vgg16_conv2)
x2 = Dense(4096, activation='relu', name='fc12')(x2)
x2 = Dense(4096, activation='relu', name='fc22')(x2)
x2 = Dense(9, activation='softmax', name='predictions2')(x2)
    
#Final model creation 
pretrained_model2 = Model(inputs=keras_input2, outputs=x2)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)  

In [None]:
#Adding regularization
pretrined_model2=add_regularization(pretrained_model2)

In [None]:
#Compiling our model
pretrined_model2.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
#Training of our model
batch_size = 8
epochs  = 25
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
history2 = pretrined_model2.fit(X_train, y_train,
                    epochs=epochs, callbacks=[callback], batch_size=batch_size)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
#Evaluating our model on test set
pretrined_model2.evaluate(X_test, y_test)



[2.606424331665039, 0.9816007614135742]

In [None]:
#Saving our model for future use
pretrined_model2.save('saved_model/my_model') 

INFO:tensorflow:Assets written to: saved_model/my_model/assets
