In [1]:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import numpy as np

from IPython.display import display
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, auc, roc_curve, f1_score
from sklearn.decomposition import PCA
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Registry of (label, estimator) pairs evaluated by the notebook.
# Keep the order stable: the fitted copies are stored in `trained_models`
# in this same order, and downstream code may address them by position.
# Estimators with internal randomness get a fixed random_state so that a
# fresh "Restart & Run All" reproduces the same scores.
models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('KNN9', KNeighborsClassifier(n_neighbors=9)),
    ('KNN7', KNeighborsClassifier(n_neighbors=7)),
    ('KNN5', KNeighborsClassifier()),
    ('KNN3', KNeighborsClassifier(n_neighbors=3)),
    ('KNN1', KNeighborsClassifier(n_neighbors=1)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('MLP', MLPClassifier(random_state=42)),
]

names = []
# Filled by crossValidationScore with (label, fitted estimator) pairs.
trained_models = []


def buildDataset(resample="", drop=None):
    """Load the benign and malware CSV files, merge them and optionally resample.

    Parameters
    ----------
    resample : str, optional
        "-o" applies SMOTE oversampling, "-u" applies NearMiss (version 1)
        undersampling. Any other value (default "") leaves the data as loaded.
    drop : list of str, optional
        Columns removed from the loaded data to form the feature matrix.
        Defaults to ['Name', 'Malware'].

    Returns
    -------
    X, y
        The feature matrix (all columns except those in `drop`) and the
        'Malware' column used as target, after the optional resampling.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if drop is None:
        drop = ['Name', 'Malware']

    benign = pd.read_csv("./dataset_benign.csv")
    malware = pd.read_csv("./dataset_malware.csv")

    data = pd.concat([benign, malware], ignore_index=True)

    X = data.drop(drop, axis=1)
    y = data['Malware']

    # NOTE(review): resampling is applied to the whole dataset, i.e. before
    # any train/test split the caller performs afterwards, so the test split
    # is drawn from resampled data -- confirm this is intended.
    if resample == "-o":
        print("Aplicando oversampling...")
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
    elif resample == "-u":
        print("\n\n\nAplicando undersampling...")
        nearmiss = NearMiss(version=1)
        X, y = nearmiss.fit_resample(X, y)

    print("Número de muestras totales:", len(X), "\n\n\n")
    return X, y


def trainTest(X, y):
    """Split the data 80/20 and standardise the features.

    The scaler is fitted on the training part only and then applied to both
    parts. Returns (X_train, X_test, y_train, y_test) with the two feature
    sets already scaled.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=101)

    # Fit on the training features only, then reuse the same scaler for both.
    scaler = StandardScaler()
    scaler.fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    # Optional 2-component PCA projection, currently disabled:
    # skpca = PCA(n_components=2)
    # scaled_train = skpca.fit_transform(scaled_train)
    # scaled_test = skpca.transform(scaled_test)

    print(f'Número de características usadas: {scaled_train.shape[1]} \n\n\n')

    return scaled_train, scaled_test, target_train, target_test


def crossValidationScore(X_train, y_train):
    """Fit every registered model and report its 10-fold CV accuracy.

    For each (name, model) pair in the module-level `models` list the model
    is fitted on the full training data, scored with 10-fold
    cross-validation (accuracy), and the mean and standard deviation of the
    fold scores are printed. The fitted models are stored as
    (name, model) pairs in the module-level `trained_models` list.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to `fit` and `cross_val_score`.

    Returns
    -------
    None
    """
    scoring = 'accuracy'
    # The splitter does not depend on the model, so build it once.
    kfold = model_selection.KFold(n_splits=10)

    # Reset the registry in place: `model.fit` returns the same estimator
    # object, so entries left over from a previous call would only be
    # duplicates of the models refitted below (and would show up as
    # duplicate rows in any later comparison).
    trained_models.clear()

    print("COMPARACIÓNN DE ALGORITMOS MEDIANTE CROSS-VALIDATION")
    for name, model in models:
        trained_model = model.fit(X_train, y_train)
        cv_results = model_selection.cross_val_score(
            trained_model, X_train, y_train, cv=kfold, scoring=scoring)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
        trained_models.append((name, trained_model))

    print("\n\n\n")


def compareMLAs(X_train, X_test, y_train, y_test):
    """Print a metric table for every model in `trained_models`.

    For each fitted (name, model) pair the table holds the train and test
    accuracy (rounded to 4 decimals) plus precision, recall and F1 computed
    on the test predictions. Rows are sorted by test accuracy, best first,
    and the table is printed.

    Parameters
    ----------
    X_train, y_train
        Training data, used for the 'Train Accuracy' column.
    X_test, y_test
        Held-out data, used for every other metric.

    Returns
    -------
    None
    """
    columns = ['MLA used', 'Train Accuracy', 'Test Accuracy',
               'Precision', 'Recall', 'F1-Score']

    # Collect one record per model and build the frame once, instead of
    # growing it cell by cell with `.loc`.
    rows = []
    for name, model in trained_models:
        predicted = model.predict(X_test)
        rows.append({
            'MLA used': name,
            'Train Accuracy': round(model.score(X_train, y_train), 4),
            'Test Accuracy': round(model.score(X_test, y_test), 4),
            'Precision': precision_score(y_test, predicted),
            'Recall': recall_score(y_test, predicted),
            'F1-Score': f1_score(y_test, predicted),
        })
        # No AUC column is reported, so no ROC curve is computed here.

    # Passing `columns` explicitly keeps the column order fixed and lets an
    # empty `trained_models` produce an empty table instead of a KeyError
    # when sorting.
    MLA_compare = pd.DataFrame(rows, columns=columns).sort_values(
        by=['Test Accuracy'], ascending=False)
    print(MLA_compare)

In [2]:
# Resampling mode forwarded to buildDataset:
#   "-u" -> apply undersampling, "-o" -> apply oversampling, "" -> no resampling
resample="-u"

# Load (and optionally resample) the data, then split it and scale the features.
X, y = buildDataset(resample)
X_train, X_test, y_train, y_test = trainTest(X, y)
# Optional variance-threshold feature selection, currently disabled:
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# X_train = sel.fit_transform(X_train)
# X_test = sel.transform(X_test)

# Fit and cross-validate every model on the training split, then print the
# train/test metric table.
crossValidationScore(X_train, y_train)
compareMLAs(X_train, X_test, y_train, y_test)




Aplicando undersampling...
Número de muestras totales: 8228 



Número de características usadas: 76 



COMPARACIÓNN DE ALGORITMOS MEDIANTE CROSS-VALIDATION
LR: 0.998785 (0.001324)
KNN9: 0.998025 (0.001368)
KNN7: 0.998481 (0.001359)
KNN5: 0.998481 (0.001359)
KNN3: 0.998633 (0.001064)
KNN1: 0.998633 (0.001064)
RF: 0.999392 (0.001008)
CART: 0.999696 (0.000912)
NB: 0.998329 (0.001262)
SVM: 0.996809 (0.002089)
MLP: 0.998785 (0.001324)




   MLA used  Train Accuracy  Test Accuracy  Precision    Recall  F1-Score
6        RF          1.0000         1.0000   1.000000  1.000000  1.000000
7      CART          1.0000         0.9994   0.998801  1.000000  0.999400
4      KNN3          0.9989         0.9988   0.997605  1.000000  0.998801
5      KNN1          1.0000         0.9988   0.997605  1.000000  0.998801
0        LR          0.9995         0.9982   0.997602  0.998800  0.998200
3      KNN5          0.9986         0.9982   0.997602  0.998800  0.998200
10      MLP          0.9997         0.9

In [105]:
# Show importances with 10 decimals so small values are not rounded to zero.
pd.options.display.float_format = '{:.10f}'.format

# Pick the random forest by its label instead of by list position, so the
# cell keeps working if the model registry is reordered or extended.
rf_model = dict(trained_models)['RF']

# Pair each feature column of X with its importance, most important first.
# NOTE(review): assumes the columns of X are in the same order as the
# features the model was trained on -- confirm if feature selection is
# enabled between loading and training.
df = pd.DataFrame(
    zip(X.columns, abs(rf_model.feature_importances_)),
    columns=["feature", "importance"],
).sort_values("importance", ascending=False).reset_index(drop=True)

df

Unnamed: 0,feature,importance
0,Magic,0.1488267019
1,Machine,0.1303571594
2,SizeOfOptionalHeader,0.121564441
3,MajorLinkerVersion,0.0822701908
4,MajorOperatingSystemVersion,0.077300395
5,SizeOfStackReserve,0.0700904742
6,ImageDirectoryEntryException,0.0648536332
7,MajorImageVersion,0.0579217626
8,MinorLinkerVersion,0.053108391
9,MajorSubsystemVersion,0.0436819482
