In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths read later
# in this notebook live under this mount point.
drive.mount(mountpoint='/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from imblearn.under_sampling import TomekLinks, OneSidedSelection,EditedNearestNeighbours
from imblearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, f1_score

# Load Data (Category)

In [None]:
# Read the malware-category CSV from the mounted Drive.
# Alternative dataset path, left disabled:
# df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv')
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv'
)

# Keep only the rows whose 'Category' label is something other than 'Unknown'
df = df.loc[df['Category'] != 'Unknown']

# Per-category sample counts after filtering (shown as the cell output)
df['Category'].value_counts()

# Load Data (Family)

In [None]:
# Read the malware-family CSV from the mounted Drive.
# NOTE: this rebinds `df`, replacing whatever the category-loading cell put there.
# Alternative dataset path, left disabled:
# df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv')
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareFamily_DATASET_FINAL_Increased_Imbalance.csv'
)

# Keep only the rows whose 'name' label is something other than 'Unknown_Family'
df = df.loc[df['name'] != 'Unknown_Family']

# Per-family sample counts after filtering (shown as the cell output)
df['name'].value_counts()

# Features/Targets

In [None]:
# Columns excluded from the model inputs (file identifier and label columns,
# judging by their names).
non_feature_columns = ["file_name", "name", "Category", "Category Target", "Family Target"]

# Feature matrix: every remaining column, cast to float
features = df.drop(columns=non_feature_columns).astype(float)

# Target vector: the category label, cast to int
targets = df["Category Target"].astype(int)

# To classify by family instead, switch to this target:
# targets = df["Family Target"].astype(int)

# Train/Test Split

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# targets and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets,
    test_size=0.2,
    stratify=targets,
    random_state=0,
)

# Shapes of the training and test feature partitions (shown as the cell output)
(X_train.shape, X_test.shape)

# GridSearchCV

In [None]:
# Candidate classifiers, keyed by the display name printed in the grid-search
# cells. Entries are inserted in the order in which they are evaluated.
classifiers = {}
classifiers['Random Forest'] = RandomForestClassifier(random_state=0)
classifiers['MLP'] = MLPClassifier(
    hidden_layer_sizes=(200, 200, 200),
    early_stopping=True,
    n_iter_no_change=5,
    random_state=0,
)
classifiers['KNN'] = KNeighborsClassifier(n_neighbors=5)
classifiers['SVM'] = SVC(kernel='rbf', gamma='scale', class_weight=None, random_state=0)

## 1. TomekLinks

In [None]:
# Undersampler under evaluation
tomek_links = TomekLinks()

# Macro-averaged F1 is the metric used to rank grid-search candidates
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Candidate sampling strategies for the TomekLinks step. The 'tomeklinks__'
# prefix addresses the pipeline step created by make_pipeline below.
param_grid = {
    'tomeklinks__sampling_strategy': ['auto', 'majority']
}

# One summary dict per classifier
results = []

for name, clf in classifiers.items():
    print(f"Testing {name} with TomekLinks")

    # imblearn pipeline: undersampler first, classifier last
    pipeline = make_pipeline(tomek_links, clf)

    # 5-fold cross-validated search over the grid, fitted on the training split
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=f1_macro_scorer,
        cv=5,
        verbose=1,
    ).fit(X_train, y_train)

    # Record the winning parameters and their score
    # (the fitted estimator itself is not kept)
    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_,
    })

# Report one line per classifier
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## 2. EditedNearestNeighbours

In [None]:
# Undersampler under evaluation
enn = EditedNearestNeighbours()

# Macro-averaged F1 is the metric used to rank grid-search candidates
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Search space for the ENN step: 2 sampling strategies x 10 neighbour counts
# (1 through 10). The 'editednearestneighbours__' prefix addresses the
# pipeline step created by make_pipeline below.
param_grid = {
    'editednearestneighbours__sampling_strategy': ['auto', 'majority'],
    'editednearestneighbours__n_neighbors': list(range(1, 11)),
}

# One summary dict per classifier
results = []

for name, clf in classifiers.items():
    print(f"Testing {name} with ENN")

    # imblearn pipeline: undersampler first, classifier last
    pipeline = make_pipeline(enn, clf)

    # 5-fold cross-validated search over the grid, fitted on the training split
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=f1_macro_scorer,
        cv=5,
        verbose=1,
    ).fit(X_train, y_train)

    # Record the winning parameters and their score
    # (the fitted estimator itself is not kept)
    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_,
    })

# Report one line per classifier
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## 3. OneSidedSelection

In [None]:
# Undersampler under evaluation, seeded for repeatability
oss = OneSidedSelection(random_state=0)

# Macro-averaged F1 is the metric used to rank grid-search candidates
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Search space for the OSS step: n_neighbors from 1 to 10. The
# 'onesidedselection__' prefix addresses the pipeline step created by
# make_pipeline below.
param_grid = {
    'onesidedselection__n_neighbors': list(range(1, 11))
}

# One summary dict per classifier
results = []

for name, clf in classifiers.items():
    print(f"Testing {name} with OSS")

    # imblearn pipeline: undersampler first, classifier last
    pipeline = make_pipeline(oss, clf)

    # 5-fold cross-validated search over the grid, fitted on the training split
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=f1_macro_scorer,
        cv=5,
        verbose=1,
    ).fit(X_train, y_train)

    # Record the winning parameters and their score
    # (the fitted estimator itself is not kept)
    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_,
    })

# Report one line per classifier
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")