In [None]:
import pandas as pd  
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics
import sklearn
import time

from IPython.core.interactiveshell import InteractiveShell

from IPython.display import display
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, auc, roc_curve, f1_score
from sklearn.decomposition import PCA
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Echo the value of every top-level expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Seed shared by every estimator below that draws random numbers, so that a
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# (label, estimator) pairs consumed by crossValidationScore. The estimators
# are fitted in place there, so this list also holds the fitted objects.
models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('KNN9', KNeighborsClassifier(n_neighbors=9)),
    ('KNN7', KNeighborsClassifier(n_neighbors=7)),
    ('KNN5', KNeighborsClassifier()),  # relies on the library default for n_neighbors
    ('KNN3', KNeighborsClassifier(n_neighbors=3)),
    ('KNN1', KNeighborsClassifier(n_neighbors=1)),
    ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('CART', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('MLP', MLPClassifier(random_state=RANDOM_STATE)),
]

# Overwritten by trainTest with the column labels of the last feature matrix.
features = []
# Filled by crossValidationScore with (label, fitted estimator) pairs and
# read back by compareMLAs.
trained_models = []

# Raw datasets, one CSV per class (benign / malware, going by the file names).
# Both are expected to carry 'Name' and 'Malware' columns, which the helper
# functions below drop or use as the label.
pd_b = pd.read_csv("./dataset_benign.csv")
pd_m = pd.read_csv("./dataset_malware.csv")

def getColumnsZeroStd():
    """Build the list of columns to leave out of the feature matrix.

    The list always starts with 'Name' and 'Malware'. It is followed by every
    other column whose standard deviation is exactly 0 in the benign dataset
    (module-level ``pd_b``), and then by the columns that are constant in the
    malware dataset (module-level ``pd_m``) and not already listed.

    Side effects:
        Sets the pandas option ``display.max_columns`` to 10 and prints the
        resulting list followed by its length.

    Returns:
        list: column labels, in the order described above.
    """
    pd.set_option('display.max_columns', 10)

    excluded = ['Name', 'Malware']

    # Benign first, then malware: the order of the result depends on it.
    for dataset in (pd_b, pd_m):
        candidates = dataset.drop(['Name', 'Malware'], axis=1)
        for column in candidates.columns:
            if candidates[column].std() == 0 and column not in excluded:
                excluded.append(column)

    print(excluded)
    print(len(excluded))

    return excluded

def buildDataset(resample="", drop=['Name', 'Malware']):
    """Stack the benign and malware datasets and split features from labels.

    Args:
        resample: "-o" rebalances with SMOTE, "-u" rebalances with NearMiss
            (version 1); any other value leaves the data untouched.
        drop: column labels removed from the feature matrix. The default
            list is only read, never modified.

    Side effects:
        Sets the pandas option ``display.max_columns`` to None, displays the
        full feature matrix and prints the total number of samples.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the remaining feature columns and
        ``y`` is the 'Malware' column, both after any resampling.
    """
    # Benign rows first, then malware rows, with a fresh 0..n-1 index.
    combined = pd.concat([pd_b, pd_m], ignore_index=True)

    X = combined.drop(drop, axis=1)
    y = combined['Malware']

    if resample == "-o":
        print("Aplicando oversampling...")
        X, y = SMOTE(random_state=42).fit_resample(X, y)
    elif resample == "-u":
        print("\n\n\nAplicando undersampling...")
        X, y = NearMiss(version=1).fit_resample(X, y)

    pd.set_option('display.max_columns', None)
    display(X)
    print("Número de muestras totales:", len(X), "\n\n\n")
    return X, y


def trainTest(X, y):
    """Split the data 80/20 and standardise the features.

    The scaler is fitted on the training partition only and then applied to
    both partitions, so no statistic of the test partition is used for
    scaling.

    Args:
        X: feature matrix; its ``columns`` attribute is read.
        y: labels aligned with ``X``.

    Side effects:
        Stores ``X.columns`` in the module-level ``features`` and prints the
        number of feature columns used.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``; the two feature
        partitions are the scaler's transformed output.
    """
    global features
    features = X.columns

    raw_train, raw_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=101)

    scaler = StandardScaler()
    scaler.fit(raw_train)
    X_train, X_test = scaler.transform(raw_train), scaler.transform(raw_test)

    print(f'Número de características usadas: {X_train.shape[1]} \n\n\n')

    return X_train, X_test, y_train, y_test


def crossValidationScore(X_train, y_train):
    """Fit every registered model and report its 5-fold CV accuracy.

    For each ``(name, estimator)`` pair in the module-level ``models`` list
    the estimator is fitted on the whole training partition (this fit is
    what gets timed) and then evaluated with ``cross_val_score`` on the same
    data. One line per model is printed: mean accuracy, standard deviation
    across folds, and fit time in seconds.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Side effects:
        Refills the module-level ``trained_models`` list with
        ``(name, fitted estimator)`` pairs. The list is emptied first, so
        running this function again does not leave duplicate entries for
        compareMLAs to report.
    """
    scoring = 'accuracy'
    # The splitter does not depend on the model, so build it once.
    kfold = model_selection.KFold(n_splits=5)

    # Cleared in place so the module-level name keeps pointing at this list.
    trained_models.clear()

    print("VALIDACIÓN CRUZADA")
    for name, model in models:
        inicio = time.perf_counter()
        trained_model = model.fit(X_train, y_train)
        fit_seconds = time.perf_counter() - inicio

        # Per the scikit-learn docs, cross_val_score fits clones of the
        # estimator, so the fit above is not altered by the evaluation.
        cv_results = model_selection.cross_val_score(
            trained_model, X_train, y_train, cv=kfold, scoring=scoring)

        msg = "%s: %f (%f) (%fs)" % (name, cv_results.mean(), cv_results.std(), fit_seconds)
        print(msg)
        trained_models.append((name, trained_model))

    print("\n\n\n")


def compareMLAs(X_train, X_test, y_train, y_test):
    """Display a comparison table for every model in ``trained_models``.

    For each ``(name, fitted estimator)`` pair the test partition is
    predicted once (that prediction is what gets timed) and the following
    are recorded: train accuracy, test accuracy, precision, recall, F1 and
    the four cells of the binary confusion matrix. Scores are rounded to
    four decimals. The table is sorted by test accuracy, best first, and
    shown with ``display``; nothing is returned.

    Args:
        X_train: training feature matrix (used for the train accuracy).
        X_test: test feature matrix.
        y_train: training labels.
        y_test: test labels; the four-way unpacking of the confusion matrix
            requires it to be a two-class problem.
    """
    columns = [
        'MLA used', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall',
        'F1-Score', 'Tiempo de ejecución', 'True negatives',
        'False positives', 'False negatives', 'True positives',
    ]

    # Collect one record per model and build the frame once: cell-by-cell
    # enlargement with .loc is slow and can upcast the integer counts.
    rows = []
    for name, model in trained_models:
        inicio = time.perf_counter()
        predicted = model.predict(X_test)
        predict_seconds = time.perf_counter() - inicio

        tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()

        rows.append({
            'MLA used': name,
            'Train Accuracy': round(model.score(X_train, y_train), 4),
            # Same figure a classifier's score(X_test, y_test) reports,
            # without predicting the test partition a second time.
            'Test Accuracy': round(accuracy_score(y_test, predicted), 4),
            'Precision': round(precision_score(y_test, predicted), 4),
            'Recall': round(recall_score(y_test, predicted), 4),
            'F1-Score': round(f1_score(y_test, predicted), 4),
            'Tiempo de ejecución': predict_seconds,
            'True negatives': int(tn),
            'False positives': int(fp),
            'False negatives': int(fn),
            'True positives': int(tp),
        })

    # Explicit columns keep the sort valid when no model has been trained
    # yet; an empty table is displayed instead of raising KeyError.
    MLA_compare = pd.DataFrame(rows, columns=columns).sort_values(
        by=['Test Accuracy'], ascending=False)
    display(MLA_compare)

In [None]:
# Summary statistics of each raw dataset, benign first.
for title, dataset in (("BENIGN", pd_b), ("MALWARE", pd_m)):
    print(title)
    display(dataset.describe())

In [None]:
# Columns to leave out of the feature matrix: 'Name', 'Malware' and every
# column that is constant within the benign or the malware dataset.
drop = getColumnsZeroStd()

In [None]:
# Undersampled dataset; the correlation analysis in the next cell reads X.
X, y = buildDataset(resample="-u", drop=drop)
X_train, X_test, y_train, y_test = trainTest(X=X, y=y)

In [None]:
corrMatrix = X.corr()

# A column is flagged when its absolute correlation with any column that
# comes before it exceeds 0.8; only the strict lower triangle is inspected,
# so the first member of each correlated group is kept.
strong_pairs = np.tril(corrMatrix.abs().to_numpy() > 0.8, k=-1)
correlated_features = set(corrMatrix.columns[strong_pairs.any(axis=1)])

len(correlated_features)
f, ax = plt.subplots(figsize=(13, 11))
sns.heatmap(corrMatrix, ax=ax, cmap="YlGnBu", linewidths=0.1)

In [None]:
# Resampling mode: "-u" applies undersampling, "-o" applies oversampling,
# "" applies no resampling.
resample = ""

X, y = buildDataset(resample, drop)
# Remove the highly correlated columns found in the correlation cell.
X = X.drop(columns=list(correlated_features))
X_train, X_test, y_train, y_test = trainTest(X, y)
display(pd.DataFrame(features, columns=["Características"]))

In [None]:
# Fit every model in `models` on the training partition, print its 5-fold
# cross-validation accuracy and record the fitted models in `trained_models`.
crossValidationScore(X_train, y_train)

In [None]:
# Display the per-model metric table for the models recorded in
# `trained_models`, sorted by test accuracy.
compareMLAs(X_train, X_test, y_train, y_test)