In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
# The generator yields paths in the same order the nested loops would.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# In this section we use a labeled malware dataset. 
# The malware dataset contains features extracted from the following:
# 41,323 Windows binaries (executables .exe and .dlls), as legitimate files.
# 96,724 malware files downloaded from the VirusShare website. So, the dataset
# contains 138,048 lines, in total.

# Read the pipe-delimited dataset, then split it purely by row position:
# rows [0, 41323) become `Legit`, every later row becomes `Malware`.
# NOTE(review): this assumes the file lists all legitimate samples first - confirm.
# The 'legitimate' label column is removed from both halves.
MalwareDataset = pd.read_csv('../input/malware/malware.csv', sep='|')
Legit = MalwareDataset.iloc[:41323].drop(columns=['legitimate'])
Malware = MalwareDataset.iloc[41323:].drop(columns=['legitimate'])

In [None]:
# To make sure that the dataset has loaded properly, let's print the number of important features:

# Column count of the `Legit` frame (the label column has already been dropped from it).
print('The Number of important features is %i \n' % (len(Legit.columns),))

In [None]:
# Preview the first five rows of the raw dataset.
MalwareDataset.head(n=5)

In [None]:
# Preview the last five rows of the raw dataset.
MalwareDataset.tail(n=5)

In [None]:
# Let's try to glean more information:
# Per-column dtypes, non-null counts and memory footprint, followed by the row count.
MalwareDataset.info()
print("The total number of rows is : ", MalwareDataset.shape[0])

#### The current memory usage of the dataset is : 60.0+ MB
#### All numeric variables are either float or integers and use 64 bits. This is not a problem for a small dataset such as this one, however,
#### for larger datasets reducing the number of bits used for each entry will significantly improve performance and reduce the amount
#### of memory used in the analysis process. Let's fix this :

In [None]:
import math

# To avoid scientific notations. This change is global, therefore must be reverted later on
# Render every float with five fixed decimals instead of scientific notation.
# This is a process-wide pandas display setting.
pd.options.display.float_format = lambda number: '%.5f' % number

# Number of bits needed in order to store a given value. Should be used either for integers or
# floating point numbers with a small number of decimals
def calc_num_bits(value):
    """Return the base-2 logarithm of ``value``.

    The result approximates how many bits are needed to store a magnitude of
    that size. It is meant for integers or floats with few decimals.

    Parameters
    ----------
    value : int or float
        Non-negative magnitude (callers pass ``max(abs(max), abs(min))``).

    Returns
    -------
    float
        ``log2(value)``, or ``0.0`` when ``value`` is 0.

    Raises
    ------
    ValueError
        If ``value`` is negative.
    """
    # log2 is undefined at 0; a column holding only zeros needs no magnitude bits.
    if value == 0:
        return 0.0
    # math.log2 is exact for powers of two, unlike log(value) / log(2).
    return math.log2(value)

def number_of_decimals(float_value):
    """Count the digits after the decimal point in ``str(float_value)``.

    The text is parsed as a decimal number, so scientific notation is handled:
    ``1e-05`` has 5 decimals and ``1.5e-07`` has 8. Counting characters after
    the last ``'.'`` would give -1 and 5 for those two values.

    Parameters
    ----------
    float_value : number
        Value whose textual representation is inspected.

    Returns
    -------
    int
        Number of fractional digits. ``-1`` when the text has no fractional
        digits and no decimal point at all (e.g. an integer or ``nan``); this
        is the ``str.find`` "not found" value callers already rely on.
    """
    from decimal import Decimal, InvalidOperation

    text = str(float_value)
    try:
        exponent = Decimal(text).as_tuple().exponent
    except InvalidOperation:
        # Not parseable as a number: fall back to the position of the last dot.
        return text[::-1].find('.')

    # A negative exponent is exactly the number of fractional digits.
    if isinstance(exponent, int) and exponent < 0:
        return -exponent
    # No fractional digits: integers, NaN/inf (symbolic exponent), large exponents.
    return 0 if '.' in text else -1

# Should it use 8, 16, 32 or 64 bits ?
def closest_to(value):
    """Pick the storage width (8, 16, 32 or 64) able to hold ``value`` bits.

    The smallest width that is greater than or equal to ``value`` is returned.
    Choosing the numerically nearest width instead would round 11 bits down to
    8 and truncate the data.

    Parameters
    ----------
    value : int or float
        Number of bits required.

    Returns
    -------
    str
        ``'8'``, ``'16'``, ``'32'`` or ``'64'``, ready to be appended to a dtype
        prefix such as ``'int'``. Anything above 32 (or NaN) yields ``'64'``.
    """
    for width in (8, 16, 32):
        if value <= width:
            return str(width)
    return '64'
    
# Signed integer dtypes to try for integer columns, narrowest first.
candidate_int_dtypes = ('int8', 'int16', 'int32', 'int64')

# 'Name' and 'md5' are the two leading text columns; everything after is numeric.
new_dtypes = {'Name': 'object', 'md5': 'object'}
summary_rows = []

for i in range(2, len(MalwareDataset.columns)):
    column = MalwareDataset.iloc[:, i]
    maximum = column.max()
    minimum = column.min()
    mean = column.mean()
    value = max(abs(maximum), abs(minimum))
    # Estimated magnitude bits, shown in the summary table.
    # log2 is undefined at 0, so an all-zero column is reported as 0 bits.
    size = calc_num_bits(value) if value > 0 else 0.0

    # Decide from the column dtype, not from the Python type of the reduced
    # scalar: pandas reductions typically return NumPy scalars, which are not
    # the builtin `int`.
    if pd.api.types.is_integer_dtype(column):
        # Narrowest signed type whose range really contains [minimum, maximum];
        # keep the current dtype if none does (e.g. very large unsigned values).
        dtype = next(
            (name for name in candidate_int_dtypes
             if np.iinfo(name).min <= minimum and maximum <= np.iinfo(name).max),
            str(column.dtype),
        )
    # Educated guess: a float column whose extreme value has few decimals is
    # stored as float32. Only the extreme value is inspected, so this is a
    # heuristic, not a losslessness guarantee. Narrower widths are not used:
    # 'float8' is not a NumPy dtype and float16 keeps only about three
    # significant decimal digits.
    elif number_of_decimals(value) < 4 and value <= np.finfo('float32').max:
        dtype = 'float32'
    else:
        dtype = 'float64'
        size = 64

    summary_rows.append([maximum, minimum, mean, dtype, size])
    new_dtypes[column.name] = dtype

# Build the summary in one go, indexed by feature name. dtype=object keeps each
# cell as computed (integers stay integers) instead of coercing whole columns.
averiguation = pd.DataFrame(
    summary_rows,
    columns=['max', 'min', 'mean', 'dtype', 'bits'],
    index=MalwareDataset.columns[2:],
    dtype=object,
)
averiguation

In [None]:
# Let's use our new_dtypes and use them to make the necessary changes:
# Reload the CSV, this time parsing each column directly into the dtype chosen
# in `new_dtypes`, then show the resulting schema and memory usage.
newMalwareDataset = pd.read_csv(
    '../input/malware/malware.csv',
    sep='|',
    dtype=new_dtypes,
)
newMalwareDataset.info()

### With these changes we have reduced the memory usage from 60 MB to 27 MB, a reduction of 55%.

In [None]:
# As shown above, there are no null values in any of the 57 features. However, it might be possible for a numeric
# column to be comprised of mostly 0's. Let's check:

n_rows = newMalwareDataset.shape[0]

def calc_perc(num_0s):
    """Express ``num_0s`` as a percentage of the module-level ``n_rows``."""
    scaled = num_0s * 100
    return scaled / n_rows
    
# Count the zeros in every column after the two leading text columns.
# Positions are taken from newMalwareDataset itself, so the cell still works
# when that frame has a different number of columns than MalwareDataset.
numeric_columns = newMalwareDataset.iloc[:, 2:]
num_0s_per_column = (numeric_columns == 0).sum()

# Built in one step; the index keeps each feature's column position (2, 3, ...).
zero_check = pd.DataFrame(
    {
        'feature': list(numeric_columns.columns),
        'num_0s': num_0s_per_column.to_numpy(),
        'perc_0s': calc_perc(num_0s_per_column).to_numpy(),
    },
    index=range(2, len(newMalwareDataset.columns)),
)

zero_check

In [None]:
# Two of the features - 'SizeOfHeapReserve' and 'LoaderFlags' - have more than 99% of their entries comprised of 0's.
# In this case they can be considered noise and safely removed from our dataset. Other columns that have mostly 0's,
# such as - 'ExportNb' ; 'MinorImageVersion' ; 'SizeOfUninitializedData' ; 'MinorLinkerVersion' - should be analyzed
# further as it still might be possible to glean useful information from them.

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
newMalwareDataset = newMalwareDataset.drop(
    columns=['SizeOfHeapReserve', 'LoaderFlags'],
    errors='ignore',
)
newMalwareDataset.head()

In [None]:
# To improve the estimators' accuracy scores, we are going to use the
# sklearn.feature_selection module. This module is used in feature selection or
# dimensionality reduction in the dataset.

# To compute the features' importance, in our case, we are going to use tree-based feature
# selection. Load the sklearn.feature_selection module:

import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [None]:
# Feature matrix: every column except the two identifiers and the label.
Data = newMalwareDataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
Target = newMalwareDataset['legitimate'].values
# Extra-trees forests are randomised; a fixed seed keeps the importances, and
# therefore the selected feature set, identical across runs.
FeatSelect = ExtraTreesClassifier(random_state=42).fit(Data, Target)
# prefit=True: reuse the forest fitted above instead of refitting inside the selector.
Model = SelectFromModel(FeatSelect, prefit=True)
Data_new = Model.transform(Data)
print (Data.shape)
print (Data_new.shape)
print ('Therefore, we have reduced the number of features from ',Data.shape[1],' to ',Data_new.shape[1],'features')

In [None]:
# Feature importance - So, the algorithm has selected X important features for us. To print them out, use the
# following commands:

import sklearn.ensemble as ske

# Column names of `Data`, in the same order as its columns. They must come from
# newMalwareDataset, because `Data` was built from that frame after columns had
# been dropped from it; positions in MalwareDataset no longer line up.
feature_names = newMalwareDataset.drop(['Name', 'md5', 'legitimate'], axis=1).columns

Features = Data_new.shape[1]
# Rank with the forest that actually drove the selection (`FeatSelect`).
# A freshly fitted, differently randomised forest could rank features differently.
index = np.argsort(FeatSelect.feature_importances_)[::-1][:Features]
for feat in range(Features):
        print(feature_names[index[feat]])

In [None]:
# Now, it is time to train our model with a random forest classifier.

# Legit_Train, Legit_Test, Malware_Train, Malware_Test = cross_validate.train_test_split(Data_new, 
# Target ,test_size=0.2)
# Hold out 20% of the samples for evaluation.
# Despite their names, Legit_Train / Legit_Test hold the feature rows and
# Malware_Train / Malware_Test hold the matching labels (train_test_split
# returns X_train, X_test, y_train, y_test). The names are kept because later
# cells use them.
# Fixed seeds make the split and the forest, and so the score, reproducible.
Legit_Train, Legit_Test, Malware_Train, Malware_Test = train_test_split(
    Data_new, Target, test_size=0.2, random_state=42)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(Legit_Train, Malware_Train)
score = clf.score(Legit_Test, Malware_Test)

In [None]:
# `score` is a fraction in [0, 1]; show it as a percentage.
print("The score of Random Forest is", 100 * score)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix rows are the true labels, columns the predicted labels.
# Each rate is the off-diagonal count of a row divided by that row's total.
Result = clf.predict(Legit_Test)
CM = confusion_matrix(Malware_Test, Result)
print("False positive rate : %f %%" % ((CM[0, 1] / float(CM[0].sum())) * 100))
print('False negative rate : %f %%' % ((CM[1, 0] / float(CM[1].sum())) * 100))

In [None]:
# To train the model with another classifier, redo the previous steps, but instead of choosing the 
# random forest classifier, select a machine learning algorithm such as gradient-boosting:

In [None]:
# Gradient boosting on the same train/test split as the random forest.
# Fixed seed so the fitted model is the same on every run.
Clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=50, random_state=42)
Clf.fit(Legit_Train, Malware_Train)
Score = Clf.score(Legit_Test, Malware_Test)

In [None]:
# `Score` is a fraction in [0, 1]; show it as a percentage.
print("The score of Gradient Boosting is", 100 * Score)

In [None]:
# This has a 98.8% detection rate

In [None]:
# The following is the score using the AdaBoost classifier

In [None]:
# Candidate models, all trained and scored on the same split.
# Seeds are fixed so each run reports the same numbers for a given split.
Classifiers = {
    "RandomForest": ske.RandomForestClassifier(n_estimators=50, random_state=42),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50, random_state=42),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100, random_state=42),
}

# Accuracy (fraction in [0, 1]) of every classifier, keyed by its name.
Scores = {}
for Classif in Classifiers:
    clf = Classifiers[Classif]
    # fit/score must be inside the loop body; otherwise only the last entry of
    # the dict is ever trained and evaluated.
    clf.fit(Legit_Train, Malware_Train)
    score = clf.score(Legit_Test, Malware_Test)
    Scores[Classif] = score
    print("%s : %f %%" % (Classif, score*100))
# After the loop, Classif / clf / score refer to the last entry ("AdaBoost").

In [None]:
# Report the most recently scored classifier by name, as a percentage.
print("%s : %f %%" % (Classif, 100 * score))