In [None]:
# Colab-only: expose Google Drive under /content/drive so the dataset CSVs
# referenced later in the notebook can be read from it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, OneSidedSelection,EditedNearestNeighbours
from imblearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, f1_score

# Load Data (Category)

In [None]:
# Alternative (disabled) input: the shuffled proportions-and-targets dataset.
# df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv')

# Read the malware-category dataset from the mounted Drive.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareCategory_DATASET_Increased_Imbalance.csv'
)

# Keep only the rows whose 'Category' is something other than 'Unknown'.
known_category = df['Category'].ne('Unknown')
df = df.loc[known_category]

# Display the remaining class distribution.
df['Category'].value_counts()

# Load Data (Family)

In [None]:
# Alternative (disabled) input: the shuffled proportions-and-targets dataset.
# df = pd.read_csv('/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/DATASET_SHUFFLED_VirusShare_proportions_and_targets.csv')

# Read the malware-family dataset from the mounted Drive.
# NOTE: this rebinds `df`, replacing whatever the category cell loaded.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/VirusShare_Opcodes/csvFiles/MalwareFamily_DATASET_FINAL_Increased_Imbalance.csv'
)

# Keep only the rows whose 'name' is something other than 'Unknown_Family'.
known_family = df['name'].ne('Unknown_Family')
df = df.loc[known_family]

# Display the remaining class distribution.
df['name'].value_counts()

# Features/Targets

In [None]:
# Feature matrix: every column except the file identifier and the label
# columns, cast to float.
features = df.drop(
    ["file_name", "name", "Category", "Category Target", "Family Target"],
    axis=1,
).astype(float)

# Target vector: the integer-encoded category label.
targets = df["Category Target"].astype(int)

# To predict the family instead, use the family label as the target:
# targets = df["Family Target"].astype(int)

# Train/Test Split

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# targets so both partitions keep the class proportions, and seeded so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=0, stratify=targets
)

# Display the shapes of the two feature partitions.
(X_train.shape, X_test.shape)

# GridSearchCV

In [None]:
# Candidate classifiers, keyed by the display name used in the log output of
# the grid-search cells. Seeds are fixed wherever the estimator accepts one.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=0),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(200, 200, 200),
        early_stopping=True,
        n_iter_no_change=5,
        random_state=0,
    ),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(
        kernel='rbf',
        gamma='scale',
        class_weight=None,
        random_state=0,
    ),
}

## SMOTE + TL

In [None]:
# Resamplers for this experiment: SMOTE over-sampling followed by Tomek Links
# cleaning. TomekLinks takes no `random_state` argument in current
# imbalanced-learn releases (passing one raises TypeError), so it is built
# with its defaults.
smote = SMOTE(random_state=0)
tomek_links = TomekLinks()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with SMOTE and Tomek Links")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('smote', 'tomeklinks', ...). The resamplers run only while
    # fitting, so the validation part of each CV split is never resampled.
    pipeline = make_pipeline(smote, tomek_links, clf)

    param_grid = {
        'smote__k_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'tomeklinks__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## SMOTE + ENN

In [None]:
# Resamplers for this experiment: SMOTE over-sampling followed by Edited
# Nearest Neighbours cleaning. EditedNearestNeighbours takes no
# `random_state` argument in current imbalanced-learn releases (passing one
# raises TypeError), so it is built with its defaults.
smote = SMOTE(random_state=0)
enn = EditedNearestNeighbours()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with SMOTE and ENN")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('smote', 'editednearestneighbours', ...). The resamplers run only
    # while fitting, so the validation part of each CV split is never
    # resampled.
    pipeline = make_pipeline(smote, enn, clf)

    param_grid = {
        'smote__k_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'editednearestneighbours__n_neighbors': list(range(1, 11)),  # 1..10 inclusive
        'editednearestneighbours__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## SMOTE + OSS

In [None]:
# Resamplers for this experiment: SMOTE over-sampling followed by One-Sided
# Selection cleaning, both seeded.
smote = SMOTE(random_state=0)
oss = OneSidedSelection(random_state=0)

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with SMOTE and OSS")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('smote', 'onesidedselection', ...). The resamplers run only while
    # fitting, so the validation part of each CV split is never resampled.
    pipeline = make_pipeline(smote, oss, clf)

    param_grid = {
        # range's upper bound is exclusive: 21 is needed to include k=20,
        # matching the other SMOTE experiments in this notebook.
        'smote__k_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'onesidedselection__n_neighbors': list(range(1, 11)),  # 1..10 inclusive
        'onesidedselection__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## ADASYN + TL

In [None]:
# Resamplers for this experiment: ADASYN over-sampling followed by Tomek
# Links cleaning. TomekLinks takes no `random_state` argument in current
# imbalanced-learn releases (passing one raises TypeError), so it is built
# with its defaults.
adasyn = ADASYN(random_state=0)
tomek_links = TomekLinks()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with ADASYN and Tomek Links")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('adasyn', 'tomeklinks', ...). The resamplers run only while
    # fitting, so the validation part of each CV split is never resampled.
    pipeline = make_pipeline(adasyn, tomek_links, clf)

    param_grid = {
        'adasyn__n_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'tomeklinks__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## ADASYN + ENN

In [None]:
# Resamplers for this experiment: ADASYN over-sampling followed by Edited
# Nearest Neighbours cleaning. EditedNearestNeighbours takes no
# `random_state` argument in current imbalanced-learn releases (passing one
# raises TypeError), so it is built with its defaults.
adasyn = ADASYN(random_state=0)
enn = EditedNearestNeighbours()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with ADASYN and ENN")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('adasyn', 'editednearestneighbours', ...). The resamplers run
    # only while fitting, so the validation part of each CV split is never
    # resampled.
    pipeline = make_pipeline(adasyn, enn, clf)

    param_grid = {
        'adasyn__n_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'editednearestneighbours__n_neighbors': list(range(1, 11)),  # 1..10 inclusive
        'editednearestneighbours__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## ADASYN + OSS

In [None]:
# Resamplers for this experiment: ADASYN over-sampling followed by One-Sided
# Selection cleaning, both seeded.
adasyn = ADASYN(random_state=0)
oss = OneSidedSelection(random_state=0)

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with ADASYN and OSS")

    # The cleaning step must be `oss`: the grid below addresses the
    # 'onesidedselection' step, which only exists when a OneSidedSelection
    # instance is in the pipeline. imblearn's make_pipeline names each step
    # after its lowercased class name, and the resamplers run only while
    # fitting, so the validation part of each CV split is never resampled.
    pipeline = make_pipeline(adasyn, oss, clf)

    param_grid = {
        'adasyn__n_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'onesidedselection__n_neighbors': list(range(1, 11)),  # 1..10 inclusive
        'onesidedselection__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## BSMOTE + TL

In [None]:
# Resamplers for this experiment: Borderline-SMOTE over-sampling followed by
# Tomek Links cleaning. TomekLinks takes no `random_state` argument in
# current imbalanced-learn releases (passing one raises TypeError), so it is
# built with its defaults.
borderline_smote = BorderlineSMOTE(random_state=0)
tomek_links = TomekLinks()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with BorderlineSMOTE and Tomek Links")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('borderlinesmote', 'tomeklinks', ...). The resamplers run only
    # while fitting, so the validation part of each CV split is never
    # resampled.
    pipeline = make_pipeline(borderline_smote, tomek_links, clf)

    param_grid = {
        'borderlinesmote__k_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'borderlinesmote__m_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'tomeklinks__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## BSMOTE + ENN

In [None]:
# Resamplers for this experiment: Borderline-SMOTE over-sampling followed by
# Edited Nearest Neighbours cleaning. EditedNearestNeighbours takes no
# `random_state` argument in current imbalanced-learn releases (passing one
# raises TypeError), so it is built with its defaults.
borderline_smote = BorderlineSMOTE(random_state=0)
enn = EditedNearestNeighbours()

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# One record per classifier: best parameters and best mean CV score.
results = []

# Tune the resampling hyper-parameters separately for each classifier.
for name, clf in classifiers.items():
    print(f"Testing {name} with BorderlineSMOTE and ENN")

    # imblearn's make_pipeline names each step after its lowercased class
    # name ('borderlinesmote', 'editednearestneighbours', ...). The
    # resamplers run only while fitting, so the validation part of each CV
    # split is never resampled.
    pipeline = make_pipeline(borderline_smote, enn, clf)

    param_grid = {
        'borderlinesmote__k_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'borderlinesmote__m_neighbors': list(range(1, 21)),  # 1..20 inclusive
        'editednearestneighbours__n_neighbors': list(range(1, 11)),  # 1..10 inclusive
        'editednearestneighbours__sampling_strategy': ['auto', 'majority']
    }

    # 5-fold cross-validated exhaustive search, scored with macro F1.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=f1_macro_scorer, verbose=1)
    grid_search.fit(X_train, y_train)

    results.append({
        'Classifier': name,
        'Best_Params': grid_search.best_params_,
        'Best_Score': grid_search.best_score_
    })

# Summarise the best configuration found for each classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")


## BSMOTE + OSS

In [None]:
# Borderline-SMOTE over-sampler and One-Sided Selection cleaner, both seeded.
borderline_smote = BorderlineSMOTE(random_state=0)
oss = OneSidedSelection(random_state=0)

# Alternative (disabled): resample the training set once, up front, with a
# fixed BorderlineSMOTE configuration instead of searching inside the pipeline.
# bsm_3_18 = BorderlineSMOTE(sampling_strategy='auto', random_state=0,k_neighbors=3, m_neighbors=18)
# X_resampled, y_resampled = bsm_3_18.fit_resample(X_train, y_train)

# Scorer computing the macro-averaged F1.
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Search space, shared by every classifier. Keys address pipeline steps by
# the lowercased class names that make_pipeline assigns.
param_grid = dict(
    borderlinesmote__k_neighbors=[*range(1, 21)],    # k_neighbors: 1..20
    borderlinesmote__m_neighbors=[*range(1, 21)],    # m_neighbors: 1..20
    onesidedselection__n_neighbors=[*range(1, 11)],  # n_neighbors: 1..10
    onesidedselection__sampling_strategy=['auto', 'majority'],
)

# Collected per-classifier outcomes of the search.
results = []

for name in classifiers:
    clf = classifiers[name]
    print(f"Testing {name} with BorderlineSMOTE and OSS")

    # Over-sample, clean, then classify.
    pipeline = make_pipeline(borderline_smote, oss, clf)

    # Exhaustive 5-fold search scored by macro F1, fitted on the training split.
    # (Disabled alternative: grid_search.fit(X_resampled, y_resampled))
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring=f1_macro_scorer,
        cv=5,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Record the winning parameters and their mean cross-validated score.
    results.append(dict(
        Classifier=name,
        Best_Params=grid_search.best_params_,
        Best_Score=grid_search.best_score_,
    ))

# Report one line per classifier.
for result in results:
    print(f"Classifier: {result['Classifier']}, Best Score: {result['Best_Score']}, Params: {result['Best_Params']}")
