In [71]:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics

from IPython.core.interactiveshell import InteractiveShell

from IPython.display import display
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, auc, roc_curve, f1_score
from sklearn.decomposition import PCA
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Candidate classifiers as (label, estimator) pairs. The KNN<k> labels encode the
# neighbour count; 'KNN5' relies on the estimator's default instead of passing it.
models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('KNN9', KNeighborsClassifier(n_neighbors=9)),
    ('KNN7', KNeighborsClassifier(n_neighbors=7)),
    ('KNN5', KNeighborsClassifier()),
    ('KNN3', KNeighborsClassifier(n_neighbors=3)),
    ('KNN1', KNeighborsClassifier(n_neighbors=1)),
    ('RF', RandomForestClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('MLP', MLPClassifier()),
]

# Shared notebook state: `features` is rebound by trainTest, and
# `trained_models` is filled by crossValidationScore and read by compareMLAs.
features = []
trained_models = []

def getColumnsZeroStd():
    """Build the list of columns to exclude from the feature matrix.

    Reads ./dataset_benign.csv and ./dataset_malware.csv, removes the 'Name'
    and 'Malware' columns from each, and flags every remaining column whose
    standard deviation is exactly 0 in at least one of the two files.

    The resulting list and its length are printed before returning.

    Returns:
        list: 'Name' and 'Malware', followed by the flagged columns of the
        benign file in column order, followed by the flagged columns of the
        malware file that are not already listed.
    """
    benign = pd.read_csv("./dataset_benign.csv")
    malware = pd.read_csv("./dataset_malware.csv")

    non_features = ['Name', 'Malware']
    per_class_frames = (
        benign.drop(non_features, axis=1),
        malware.drop(non_features, axis=1),
    )

    drop = list(non_features)
    for frame in per_class_frames:
        for column in frame.columns:
            # A column that is constant within one class is flagged even if
            # it varies in the other; each name is listed only once.
            if frame[column].std() == 0 and column not in drop:
                drop.append(column)

    print(drop)
    print(len(drop))

    return drop

def buildDataset(resample="", drop=['Name', 'Malware']):
    """Load both CSV files and return the feature matrix and the labels.

    The benign and malware files are concatenated (benign rows first, index
    renumbered). The label vector is the 'Malware' column; the feature matrix
    is everything except the columns named in `drop`.

    NOTE(review): resampling is applied to the whole data set here, i.e.
    before trainTest performs the train/test split. Confirm this is intended.

    Args:
        resample: "-o" applies SMOTE oversampling (random_state=42), "-u"
            applies NearMiss version 1 undersampling; any other value leaves
            the data as loaded.
        drop: column labels removed from the feature matrix.

    Returns:
        tuple: (X, y) after the optional resampling. The total number of
        samples is printed before returning.
    """
    sources = ("./dataset_benign.csv", "./dataset_malware.csv")
    data = pd.concat([pd.read_csv(path) for path in sources],
                     ignore_index=True)

    X = data.drop(drop, axis=1)
    y = data['Malware']

    if resample == "-o":
        print("Aplicando oversampling...")
        X, y = SMOTE(random_state=42).fit_resample(X, y)
    elif resample == "-u":
        print("\n\n\nAplicando undersampling...")
        X, y = NearMiss(version=1).fit_resample(X, y)

    print("Número de muestras totales:", len(X), "\n\n\n")
    return X, y


def trainTest(X, y):
    """Split the data 80/20 and standardise both parts.

    The split uses test_size=0.2 and random_state=101. The StandardScaler is
    fitted on the training part only and then applied to both parts.

    Args:
        X: feature matrix, either a pandas DataFrame or a 2-D array without
            column labels (for example the output of a feature selector).
        y: labels aligned with the rows of X.

    Returns:
        tuple: (X_train, X_test, y_train, y_test), where the two X parts are
        the outputs of StandardScaler.transform.

    Side effects:
        Rebinds the module-level `features` to the column labels of X and
        prints the number of features used.
    """
    global features
    if hasattr(X, "columns"):
        features = X.columns
    else:
        # Plain arrays carry no labels; previously this case raised
        # AttributeError. Fall back to positional labels 0..n-1.
        features = pd.RangeIndex(np.shape(X)[1])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=101)

    # Fit on the training part only so test statistics do not leak into it.
    sc = StandardScaler().fit(X_train)

    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)

    print(f'Número de características usadas: {X_train.shape[1]} \n\n\n')

    return X_train, X_test, y_train, y_test


def crossValidationScore(X_train, y_train):
    """Fit every model in `models` and print its 10-fold CV accuracy.

    For each (name, model) pair the model is fitted on the full training
    data, scored with `cross_val_score` (scoring='accuracy', 10-fold KFold),
    and a line "<name>: <mean> (<std>)" is printed.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Side effects:
        Replaces the contents of the module-level `trained_models` list with
        the (name, fitted model) pairs of this run.
    """
    # Reset the registry in place. Without this, every call appended another
    # copy of all pairs, so re-running the evaluation cells made compareMLAs
    # report each model several times.
    trained_models.clear()

    scoring = 'accuracy'
    # The splitter does not depend on the model, so build it once.
    kfold = model_selection.KFold(n_splits=10)

    print("COMPARACIÓNN DE ALGORITMOS MEDIANTE CROSS-VALIDATION")
    for name, model in models:
        # cross_val_score works on clones of the estimator, so the fit on the
        # full training data made here is what compareMLAs later uses.
        trained_model = model.fit(X_train, y_train)
        cv_results = model_selection.cross_val_score(
            trained_model, X_train, y_train, cv=kfold, scoring=scoring)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        trained_models.append((name, trained_model))

    print("\n\n\n")


def compareMLAs(X_train, X_test, y_train, y_test):
    """Print a table comparing every model stored in `trained_models`.

    Each row holds the model label, train and test accuracy (rounded to four
    decimals), and precision, recall and F1 on the test data. Rows are sorted
    by test accuracy, best first.

    Args:
        X_train: training feature matrix, used for the train accuracy.
        X_test: test feature matrix.
        y_train: training labels.
        y_test: test labels.
    """
    columns = ['MLA used', 'Train Accuracy', 'Test Accuracy',
               'Precision', 'Recall', 'F1-Score']

    # Collect one record per model and build the frame once, rather than
    # growing it cell by cell through .loc.
    rows = []
    for name, model in trained_models:
        predicted = model.predict(X_test)
        rows.append({
            'MLA used': name,
            'Train Accuracy': round(model.score(X_train, y_train), 4),
            # Reuse the predictions computed above instead of predicting
            # the test data a second time through model.score.
            'Test Accuracy': round(accuracy_score(y_test, predicted), 4),
            'Precision': precision_score(y_test, predicted),
            'Recall': recall_score(y_test, predicted),
            'F1-Score': f1_score(y_test, predicted),
        })

    # Explicit columns keep the sort below valid when no model is registered.
    MLA_compare = pd.DataFrame(rows, columns=columns)
    MLA_compare = MLA_compare.sort_values(by=['Test Accuracy'],
                                          ascending=False)
    print(MLA_compare)

In [72]:
resample="-u" #"-u" -> apply undersampling    "-o" -> apply oversampling    "" -> no resampling is applied

# Load the data with all features (only the default columns are dropped), then split and scale it.
X,y = buildDataset(resample)
X_train, X_test, y_train, y_test = trainTest(X, y)




Aplicando undersampling...
Número de muestras totales: 8228 



Número de características usadas: 76 





In [None]:
# Cross-validate every classifier on the current training split, then print the train/test metric table.
crossValidationScore(X_train, y_train)
compareMLAs(X_train, X_test, y_train, y_test)

In [73]:
# Correlation filter: for every pair of features whose absolute correlation
# exceeds 0.8, mark the one that appears later in the column order.
corrMatrix = X.corr()
column_labels = corrMatrix.columns
correlated_features = {
    column_labels[row]
    for row in range(len(column_labels))
    for col in range(row)  # entries below the diagonal only
    if abs(corrMatrix.iloc[row, col]) > 0.8
}

# Show how many features are removed, then drop them and redo the split.
len(correlated_features)
X.drop(labels=correlated_features, axis=1, inplace=True)
# Optional heat map of the correlation matrix:
# print(corrMatrix)
# f, ax = plt.subplots(figsize =(20, 18))
# sns.heatmap(corrMatrix, ax = ax, cmap ="YlGnBu", linewidths = 0.1)
X_train, X_test, y_train, y_test = trainTest(X, y)

29

Número de características usadas: 47 





In [None]:
# Re-evaluate all classifiers on the split produced after the correlation filter.
crossValidationScore(X_train, y_train)
compareMLAs(X_train, X_test, y_train, y_test)

In [75]:
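# NOTE: left disabled. Judging by the AttributeError recorded for this cell,
# `sel.fit_transform(X)` returned a NumPy array and `trainTest` then failed
# when it read `X.columns`.
# sel = VarianceThreshold(threshold=(0.1))
# X = sel.fit_transform(X)
# # print(X)
# X_train, X_test, y_train, y_test = trainTest(X, y)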

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
# Evaluate all classifiers on whatever split is currently held in X_train / X_test.
crossValidationScore(X_train, y_train)
compareMLAs(X_train, X_test, y_train, y_test)

In [58]:
# Columns to exclude: 'Name', 'Malware' and every feature with zero standard deviation in either CSV file.
drop = getColumnsZeroStd()

['Name', 'Malware', 'e_magic', 'e_cblp', 'e_cp', 'e_crlc', 'e_cparhdr', 'e_minalloc', 'e_maxalloc', 'e_ss', 'e_sp', 'e_csum', 'e_ip', 'e_cs', 'e_lfarlc', 'e_ovno', 'PointerToSymbolTable', 'NumberOfSymbols', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SuspiciousNameSection', 'SectionMaxEntropy', 'SectionMaxRawsize', 'SectionMaxVirtualsize', 'SectionMinPhysical', 'SectionMinVirtual', 'SectionMinPointerData', 'SectionMainChar']
28


In [76]:
resample="-u" #"-u" -> apply undersampling    "-o" -> apply oversampling    "" -> no resampling is applied

# Rebuild the data without the columns listed in `drop`, then split and scale it.
X,y = buildDataset(resample, drop)
X_train, X_test, y_train, y_test = trainTest(X, y)
# `features` is the global that trainTest sets to the column labels of X.
print(features)




Aplicando undersampling...
Número de muestras totales: 8228 



Número de características usadas: 50 



Index(['e_oemid', 'e_oeminfo', 'e_lfanew', 'Machine', 'NumberOfSections',
       'TimeDateStamp', 'SizeOfOptionalHeader', 'Characteristics', 'Magic',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'ImageBase', 'SectionAlignment',
       'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfHeaders',
       'CheckSum', 'SizeOfImage', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'SectionsLength', 'SectionMinEntropy',
       'SectionMinRawsize', 'SectionMinVirtualsize', 'SectionMaxPhysical',
       'SectionMaxVirtual', 'SectionMaxPointerData'

In [78]:
# Correlation filter: for every pair of features whose absolute correlation
# exceeds 0.8, mark the one that appears later in the column order.
corrMatrix = X.corr()
column_labels = corrMatrix.columns
correlated_features = {
    column_labels[row]
    for row in range(len(column_labels))
    for col in range(row)  # entries below the diagonal only
    if abs(corrMatrix.iloc[row, col]) > 0.8
}

# Show how many features are removed, then drop them and redo the split.
len(correlated_features)
X.drop(labels=correlated_features, axis=1, inplace=True)
# Optional heat map of the correlation matrix:
# print(corrMatrix)
# f, ax = plt.subplots(figsize =(20, 18))
# sns.heatmap(corrMatrix, ax = ax, cmap ="YlGnBu", linewidths = 0.1)

X_train, X_test, y_train, y_test = trainTest(X, y)
print(features)

0

Número de características usadas: 28 



Index(['e_oemid', 'e_oeminfo', 'e_lfanew', 'Machine', 'NumberOfSections',
       'TimeDateStamp', 'Characteristics', 'MajorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData', 'BaseOfCode',
       'ImageBase', 'FileAlignment', 'MinorOperatingSystemVersion',
       'MajorImageVersion', 'MinorSubsystemVersion', 'Subsystem',
       'DllCharacteristics', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'SectionMinEntropy', 'SectionMinRawsize',
       'SectionMaxChar', 'DirectoryEntryImport', 'DirectoryEntryImportSize',
       'DirectoryEntryExport'],
      dtype='object')


In [61]:
# Cross-validate every classifier on the current training split, then print the train/test metric table.
crossValidationScore(X_train, y_train)
compareMLAs(X_train, X_test, y_train, y_test)

COMPARACIÓNN DE ALGORITMOS MEDIANTE CROSS-VALIDATION
LR: 0.999240 (0.001019)
KNN9: 0.997721 (0.002654)
KNN7: 0.998025 (0.002548)
KNN5: 0.998481 (0.001359)
KNN3: 0.998481 (0.001359)
KNN1: 0.998937 (0.000973)
RF: 0.999696 (0.000608)
CART: 0.999696 (0.000608)
NB: 0.998481 (0.001177)
SVM: 0.997417 (0.001528)
MLP: 0.999089 (0.001215)




   MLA used  Train Accuracy  Test Accuracy  Precision    Recall  F1-Score
6        RF          1.0000         0.9988   1.000000  0.997599  0.998798
7      CART          1.0000         0.9988   0.998800  0.998800  0.998800
0        LR          0.9995         0.9982   0.998798  0.997599  0.998198
5      KNN1          1.0000         0.9982   0.997602  0.998800  0.998200
9       SVM          0.9995         0.9982   1.000000  0.996399  0.998196
10      MLP          0.9995         0.9976   0.997599  0.997599  0.997599
1      KNN9          0.9986         0.9970   0.996403  0.997599  0.997001
2      KNN7          0.9985         0.9970   0.996403  0.997599  0.997001