In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#In this section we use a labeled malware dataset. 
#The malware dataset contains features extracted from the following:
#41,323 Windows binaries (executables .exe and .dlls), as legitimate files.
#96,724 malware files downloaded from the VirusShare website. So, the dataset
# contains 138,048 lines, in total.

# Load the pipe-separated dataset; the 'legitimate' column is the class label
# (1 = legitimate binary, 0 = malware).
malware_dataset = pd.read_csv('../input/t1dataset/malware.csv', sep='|')

# Split by label value instead of by hard-coded row positions (0:41323 / 41323:),
# so the subsets stay correct even if the file is re-ordered or grows.
is_legitimate = malware_dataset['legitimate'] == 1
legit_subset = malware_dataset[is_legitimate].drop(['legitimate'], axis=1)
malware_subset = malware_dataset[~is_legitimate].drop(['legitimate'], axis=1)

In [None]:
# Sanity check that the dataset has loaded properly by printing a column count.
# NOTE(review): this is the number of columns of legit_subset, i.e. every column
# of the file except 'legitimate' (identifier columns such as 'Name' and 'md5'
# are still included) - it is not the size of a selected feature subset.

print('The Number of important features is %i \n' % len(legit_subset.columns))

In [None]:
# Preview the first five rows of the loaded dataset.
malware_dataset.head(n=5)

In [None]:
# Preview the last five rows of the loaded dataset.
malware_dataset.tail(n=5)

In [None]:
# Added by Luiz

# Count missing values per column and express them as a percentage of the rows.
# The row count comes from the DataFrame itself (not a hard-coded 138047) so the
# percentages stay correct if the dataset changes.
missing_counts = malware_dataset.isnull().sum()
missing_data = pd.DataFrame({
    'total_missing': missing_counts,
    'perc_missing': missing_counts / len(malware_dataset) * 100,
}).sort_values('total_missing', ascending=False)
# Sorted by missing count (largest first): without the sort, head(5) would only
# show the first five columns of the file and could hide gaps in later columns.
missing_data.head(5)

# If the top rows of the table below are all zero, there are no missing data
# points anywhere in the dataset, so we will not need to remove rows or look
# further to address any related issues.

In [None]:
# To improve the estimators' accuracy scores, we are going to use the
# sklearn.feature_selection module. This module is used in feature selection or
# dimensionality reduction in the dataset.

# To compute the features' importance, in our case, we are going to use tree-based feature
# selection. Load the sklearn.feature_selection module:

import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

import matplotlib.pyplot as plt

In [None]:
# Feature matrix: every column except the two identifier columns and the label.
data = malware_dataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
# Label vector (1 = legitimate, 0 = malware).
target = malware_dataset['legitimate'].values

# Tree-based feature selection. random_state is fixed so that the fitted
# importances - and therefore the set of selected features used by every later
# cell - are the same on each "Restart & Run All".
featselect = sklearn.ensemble.ExtraTreesClassifier(random_state=42).fit(data, target)
# prefit=True: reuse the already fitted estimator instead of refitting it.
model = SelectFromModel(featselect, prefit=True)
data_new = model.transform(data)

# Shape before and after selection: (rows, all features) vs (rows, kept features).
print(data.shape)
print(data_new.shape)

In [None]:
# Feature importance - So, the algorithm has selected the important features for us (fifteen in the run
# described here; the exact number is the second dimension of data_new). To print them out, use the
# following commands:

import sklearn.ensemble as ske

# Number of features kept by SelectFromModel.
features = data_new.shape[1]

# Rank features with the SAME fitted estimator that SelectFromModel used
# (featselect). Fitting a second, independently randomised forest here could
# produce a different ranking, so the names printed below (and `index`, which
# later cells use to pick columns) might not match the columns in data_new.
feature_names = malware_dataset.drop(['Name', 'md5', 'legitimate'], axis=1).columns
# Positions (within the feature matrix) of the top `features` importances,
# most important first.
index = np.argsort(featselect.feature_importances_)[::-1][:features]
for feat in range(features):
    # Look names up in the dropped frame's columns, so no hard-coded column
    # offset into the raw dataset is needed.
    print(feature_names[index[feat]])

In [None]:
# Added by Luiz 

# Build a DataFrame holding only the selected feature columns: drop the
# identifier/label columns, then pick columns by the positions stored in `index`
# (ordered from most to least important).
data_new_features = (
    malware_dataset
    .drop(columns=['Name', 'md5', 'legitimate'])
    .iloc[:, index]
)
print(data_new_features)

# From here we can proceed with preprocessing using the features selected by ExtraTreesClassifier

In [None]:
# Added by Luiz

# Summary statistics (count, mean, std, min, quartiles, max) of the selected features.
# For many of our features the standard deviation is very large, so the next step is
# to normalize the data to better understand the dataset's dynamics.
data_new_features.describe()

In [None]:
# Added by Luiz

# Dataset normalization: rescale every selected feature with MinMaxScaler.
min_max_scaler = preprocessing.MinMaxScaler()
x = data_new_features.to_numpy()  # plain numpy array of the feature values
x_scaled = min_max_scaler.fit_transform(x)
# Wrap the scaled array back into a DataFrame, keeping the original column names.
# Note: this rebinds data_new_features to the scaled data.
data_new_features = pd.DataFrame(data=x_scaled, columns=data_new_features.columns)
# Summary statistics after normalization
data_new_features.describe()

In [None]:
# Added by Luiz

# Outliers: one box per feature column, drawn on a single wide figure.
num_cols = data_new_features.columns
fig, ax = plt.subplots(figsize=(27, 9))
data_new_features[num_cols].boxplot(ax=ax)
ax.set_title("Numerical variables in dataset", fontsize=20)
plt.show()

In [None]:
# Added by Luiz

from scipy import stats

# Removing outliers based on the zscore
# Attach the label so that it travels with the rows that survive the filter.
data_new_features['legitimate'] = malware_dataset['legitimate'].values

# Compute z-scores on the FEATURE columns only. The label must not take part in
# the outlier test: on a strongly imbalanced dataset the minority label value
# itself would exceed the threshold and the whole class would be discarded.
feature_zscores = np.abs(np.asarray(
    stats.zscore(data_new_features.drop(columns=['legitimate']))
))
# A constant column has zero standard deviation and yields NaN z-scores; NaN < 3
# is False, which would drop every row. Treat NaN as 0 (no evidence of an outlier).
feature_zscores = np.nan_to_num(feature_zscores)

# Keep the rows whose every feature lies within 3 standard deviations of its mean.
df2 = data_new_features[(feature_zscores < 3).all(axis=1)]

In [None]:
# Added by Luiz

# Class balance after outlier removal (label 1 = legitimate, label 0 = malware).
labels = df2['legitimate']
legitimate_count = int((labels == 1).sum())
malware_count = int((labels == 0).sum())
labelled_total = legitimate_count + malware_count

legit_perc = legitimate_count / labelled_total * 100
malware_perc = malware_count / labelled_total * 100

print(f'In the dataset {legit_perc:.2f}% is legitimate, {malware_perc:.2f}% is malware.')

In [None]:
# Added by Luiz

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Using undersampling to balance the dataset

# Note: `data` and `target` are rebound here to the outlier-filtered rows of df2.
data = df2.drop(['legitimate'], axis=1).values
target = df2['legitimate'].values

# random_state is fixed so the same rows are sampled on every run; otherwise the
# balanced dataset (and every score computed from it) changes between runs.
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(data, target)

In [None]:
# Added by Luiz

# Rebuild a labelled DataFrame from the resampled arrays, reusing df2's feature
# column names, and report how many rows of each class it contains.
feature_columns = df2.columns.drop('legitimate')
df3 = pd.DataFrame(X_res, columns=feature_columns)
df3['legitimate'] = y_res
print(f'Number of legitimate flows: {(df3.legitimate == 1).sum()}')
print(f'Number of malware flows: {(df3.legitimate == 0).sum()}')

# Final feature matrix and label vector used for model training.
Data_new = df3.drop(columns=['legitimate']).to_numpy()
Target = df3['legitimate'].to_numpy()

In [None]:
# Now, it is time to train our model with a random forest classifier.

# NOTE: despite their names, these four variables hold (in order) the training
# features, the test features, the training labels and the test labels returned
# by train_test_split. The names are kept because later cells refer to them.
# random_state is fixed for both the split and the forest so that the reported
# score is reproducible across runs.
Legit_Train, Legit_Test, Malware_Train, Malware_Test = sklearn.model_selection.train_test_split(
    Data_new, Target, test_size=0.2, random_state=42)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(Legit_Train, Malware_Train)
# Mean accuracy on the held-out 20 % of the data.
score = clf.score(Legit_Test, Malware_Test)

In [None]:
# Random forest test accuracy as a percentage.
print(100 * score)

In [None]:
# Same accuracy as above, with a descriptive label.
print("The score of Random Forest is", 100 * score)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predict the held-out set and derive error rates from the confusion matrix.
# Rows of CM are true labels and columns are predicted labels, ordered 0 then 1.
# Because label 1 means "legitimate", a "false positive" here is a malware sample
# (label 0) predicted as legitimate, and a "false negative" is a legitimate file
# predicted as malware.
Result = clf.predict(Legit_Test)
CM = confusion_matrix(Malware_Test, Result)
true_neg, false_pos, false_neg, true_pos = CM.ravel()
print("False positive rate : %f %%" % ((false_pos / float(true_neg + false_pos)) * 100))
print('False negative rate : %f %%' % ((false_neg / float(false_neg + true_pos)) * 100))

In [None]:
# To train the model with another classifier, redo the previous steps, but instead of choosing the 
# random forest classifier, select a machine learning algorithm such as gradient-boosting:

In [None]:
# Train a gradient-boosting classifier on the same split and measure its
# mean accuracy on the held-out data (fit() returns the fitted estimator).
Clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=50).fit(
    Legit_Train, Malware_Train
)
Score = Clf.score(Legit_Test, Malware_Test)

In [None]:
# Gradient boosting test accuracy as a percentage.
print(100 * Score)

In [None]:
# Same accuracy as above, with a descriptive label.
print("The score of Gradient Boosting is", 100 * Score)

In [None]:
# The following is the score using the AdaBoost classifier

In [None]:
# Candidate classifiers to compare, keyed by display name.
Classifiers = {
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
}

# Train and score EVERY classifier. Previously fit/score sat outside the loop
# body, so only the last entry (AdaBoost) was ever trained or evaluated.
for Classif, clf in Classifiers.items():
    clf.fit(Legit_Train, Malware_Train)
    score = clf.score(Legit_Test, Malware_Test)
    print("%s : %f %%" % (Classif, score*100))

# After the loop, Classif / clf / score still refer to the last classifier
# (AdaBoost), exactly as before, so any following cell that reads them still works.

In [None]:
# Report the name and accuracy (in percent) currently held in Classif / score.
print("%s : %f %%" % (Classif, 100 * score))