### Imports

In [None]:
import glob
import json
import pandas as pd
import numpy as np
import os
import re

## Define the objective

1. Mnemonics groups
2. Strings per BERT
3. Section entropy
4. Functions
5. Generic information
6. All per BERT

In [None]:
# Notebook configuration.
# `size` selects which `strings_<size>.feature` files the data-preparation
# cell reads; `typefeature` names the output directory and every artefact in it.
size = 256
typefeature = "entropy"

# Create the artefact directory up front so later cells can write into it.
os.makedirs(f"data-vault/{typefeature}/", exist_ok=True)

# CSV with the feature matrix (written by the preparation cell, read by pandas).
data = "/home/sergio/Documents/TFM/repos/BinaryIntelligence/Models/DATA/entropy.csv"

# JSON-lines log of every evaluated configuration, plus the pickled models.
output_history = "data-vault/{0}/scores_{0}.json".format(typefeature)
model_pickle = "data-vault/{0}/model_{0}.pkl".format(typefeature)
all_models_pickle = "data-vault/{0}/all_models_{0}.pkl".format(typefeature)

# Absolute path of the comparison chart saved by the plotting cell.
output_image = os.path.abspath("data-vault/{0}/{0}-results.png".format(typefeature))

### Prepare data to csv

In [None]:
# Build the feature CSV: one row per `strings_<size>.feature` file found under
# `location`, holding the values of its "STRINGS" list followed by a 0/1 label.
# NOTE(review): this cell reads *strings* features but writes to `data`, which
# points at entropy.csv -- confirm that the source features are the intended ones.
location = "/home/sergio/Documents/TFM/data/dataset-static"
target = f"strings_{size}.feature"
csv = []
for file in glob.glob(f"**/{target}", recursive=True, root_dir=location):
    with open(f"{location}/{file}", "r") as f:
        current_data = json.load(f)

    # The label must be derived per file. Initialising it once before the loop
    # would leave it stuck at 1 for every sample after the first malware path.
    malware = 1 if "malware" in file else 0

    row = [",".join([str(x) for x in current_data["STRINGS"]]) + f",{malware}"]
    csv.append(row)

if not csv:
    # Fail with a clear message instead of an IndexError on csv[0] below.
    raise FileNotFoundError(f"No '{target}' files found under '{location}'")

# One STRING_<i> column per feature value of the first row, then the label.
feature_count = len(csv[0][0].split(",")) - 1
header = ",".join([f"STRING_{i}" for i in range(feature_count)] + ["Malware"])

csv.insert(0, [header])

expected_columns = len(header.split(","))
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open(data, "w") as output_file:
    for element in csv:
        # Stop at the first row whose column count differs from the header
        # (e.g. a value that itself contains a comma); rows written so far stay.
        if len(element[0].split(",")) != expected_columns:
            print("problem", element[0].split(","))
            break

        output_file.write(f"{element[0]}\n")

### Data to DF

In [None]:
# Load the CSV at `data` into a DataFrame and display its first five rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(5)

In [None]:
from sklearn.model_selection import train_test_split


# Separate the label column from the feature columns.
y = df['Malware']
X = df.drop(columns='Malware')

# Two successive seeded splits:
#   1. hold out 20% of all rows as the validation set (X_val / y_val);
#   2. hold out 20% of the remaining rows as the test set (X_test / y_test).
# X_train / y_train are rebound to what is left after the second split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

In [None]:
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy scalars and arrays.

    NumPy booleans, integers and floating-point scalars are converted to the
    matching built-in type, and arrays to (nested) lists. Any other
    unsupported object is delegated to ``json.JSONEncoder.default``, which
    raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``."""
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a NumPy boolean would fall through and raise TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [None]:
import time
import warnings
import pickle

warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns

# Necessary imports
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, IsolationForest, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


In [None]:
# Search space for the random search. Every entry pairs one estimator instance
# ("model") with the candidate values of the hyper-parameters that get sampled
# ("params"); an empty dict means the estimator is used with its defaults.
# NOTE(review): recent scikit-learn releases appear to expect `None` rather
# than the string "none" for LogisticRegression's penalty -- confirm against
# the installed version (an invalid combination only fails that one trial).
models = [
    {"model": estimator, "params": grid}
    for estimator, grid in (
        (LogisticRegression(), {"C": [0.1, 100.0], "penalty": ["l1", "l2", "elasticnet", "none"]}),
        (RandomForestClassifier(), {"n_estimators": [12000], "max_depth": [100, 1000]}),
        (SVC(), {"C": [0.1, 100.0], "kernel": ["linear", "poly", "rbf", "sigmoid"]}),
        (XGBClassifier(), {"n_estimators": [5000], "max_depth": [100]}),
        (AdaBoostClassifier(), {"n_estimators": [1000, 10000], "learning_rate": [0.1, 10.0]}),
        (SGDClassifier(), {"alpha": [0.0001, 0.1], "penalty": ["l1", "l2", "elasticnet"]}),
        (LinearSVC(), {"C": [0.1, 100.0], "penalty": ["l1", "l2"]}),
        (DecisionTreeClassifier(), {"max_depth": [None, 1000], "criterion": ["gini", "entropy"]}),
        (LGBMClassifier(), {"n_estimators": [5000, 12000], "max_depth": [1000]}),
        (RidgeClassifier(), {"alpha": [0.1, 100.0]}),
        (GradientBoostingClassifier(), {"n_estimators": [100, 12000], "learning_rate": [0.1, 10.0]}),
        (ExtraTreesClassifier(), {"n_estimators": [5000, 12000], "max_depth": [100, 500]}),
        (KNeighborsClassifier(), {"n_neighbors": [5, 100], "weights": ["uniform", "distance"]}),
        (GaussianNB(), {}),
        (BaggingClassifier(), {"n_estimators": [1000, 10000]}),
        # Ensemble meta-estimators are left out; they would need a non-empty
        # `estimators` list, for which only an empty placeholder existed:
        # (StackingClassifier(estimators=[]), {}),
        # (VotingClassifier(estimators=[]), {}),
    )
]

def evaluate_model(model, params):
    """Fit ``model`` with ``params`` and score it on the test split.

    The notebook-level ``X_train``/``y_train`` are used for fitting and
    ``X_test``/``y_test`` for scoring.

    Args:
        model: estimator exposing ``set_params``, ``fit`` and ``predict``;
            it is configured and fitted in place.
        params: mapping of hyper-parameter names to values.

    Returns:
        ``(model, score, accuracy, precision, None)`` on success, where
        ``score`` is the mean of accuracy and precision, or
        ``(None, -1, -1, -1, exception)`` when anything raised.
    """
    try:
        model.set_params(**params)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        accuracy = accuracy_score(y_test, predicted)
        precision = precision_score(y_test, predicted)
        combined = (accuracy + precision) / 2
    except Exception as failure:
        # Some sampled parameter combinations are invalid for their estimator;
        # report the failure to the caller instead of aborting the search.
        return None, -1, -1, -1, failure
    else:
        return model, combined, accuracy, precision, None

def _sample_candidate(models):
    """Pick a random entry of ``models`` and one random value per hyper-parameter.

    Returns:
        ``(estimator, params)`` where ``params`` maps each hyper-parameter name
        to a plain Python value.
    """
    model_info = np.random.choice(models)
    params = {}
    for name, values in model_info["params"].items():
        value = np.random.choice(values)
        # np.random.choice yields NumPy scalars. Convert them to built-in types
        # so that str(params) is identical to the key rebuilt from the JSON
        # history, whatever repr the installed NumPy uses for its scalars.
        params[name] = value.item() if isinstance(value, np.generic) else value
    return model_info["model"], params


def random_search(timeout, models):
    """Randomly sample and evaluate model configurations until ``timeout``.

    Configurations already present in the ``output_history`` file (one JSON
    object per line) are skipped; every successful evaluation is appended to
    that file.

    Args:
        timeout: time budget in seconds. It is checked before each trial, so a
            trial that is already running is allowed to finish.
        models: list of ``{"model": estimator, "params": {name: [values]}}``.

    Returns:
        ``(best_model, best_params, best_score, all_scores, train_models)``.
        ``best_model`` is the fitted estimator when the best score was reached
        in this run, the model *name* (``str``) when it comes from the history
        file, or ``None`` when there is neither. ``all_scores`` and
        ``train_models`` only cover this run.
    """
    # Function-scope import keeps this block self-contained; scikit-learn is
    # already a dependency of this notebook.
    from sklearn.base import clone

    history_params = set()
    best_model = None
    best_params = None
    best_score = 0

    # Replay the history to know what was already tried and the score to beat.
    with open(output_history, "r") as history_file:
        for line in history_file:
            if not line.strip():
                continue
            j = json.loads(line)
            history_params.add((j["model"], str(j["params"])))
            if j["score"] > best_score:
                best_score = float(j["score"])
                best_model = j["model"]
                best_params = j["params"]

    print(f"Actual best score: {best_score}, for model: {best_model}, params: {best_params}")

    all_scores = []
    train_models = []
    if len(models) == 0:
        # Nothing to sample from; np.random.choice would raise on an empty list.
        return best_model, best_params, best_score, all_scores, train_models

    start_time = time.time()
    while time.time() - start_time < timeout:
        # Select a model and parameters randomly, re-drawing while the
        # combination has already been tried. No sleep between draws: sampling
        # is cheap, and waiting only delays the abort once the space is exhausted.
        model, params = _sample_candidate(models)
        now = (type(model).__name__, str(params))
        tries = 1
        while now in history_params:
            if tries > 10000:
                print("Too many tries in random choice, aborting")
                return best_model, best_params, best_score, all_scores, train_models
            tries += 1
            model, params = _sample_candidate(models)
            now = (type(model).__name__, str(params))

        history_params.add(now)
        print("[ Model choosed: " + type(model).__name__ + " with params: " + str(params) + "in " + str(tries) + " tries ]")

        # Fit a fresh copy: the instances in `models` are shared between
        # trials, so fitting them in place would overwrite earlier results and
        # make every entry of train_models (and best_model) alias one object.
        # NOTE: train_models therefore holds one distinct fitted estimator per
        # successful trial, which can be memory-hungry for large ensembles.
        trained_model, score, accuracy, precision, error = evaluate_model(clone(model), params)
        if score < 0 or trained_model is None:
            print(f"[ (!) Failed to train model ({error}) ]")
            continue

        # Save the results to a file
        log = {
            "model": type(model).__name__,
            "params": params,
            "score": score,
            "accuracy": accuracy,
            "precision": precision,
        }
        with open(output_history, 'a') as f:
            try:
                j = json.dumps(log, cls=MyEncoder)  # use the custom encoder

                output = "🔹 Model: \033[1m{}\033[0m\n".format(log["model"])
                output += "🔹 Params: \033[1m{}\033[0m\n".format(log["params"])
                output += "🔹 Score: \033[1m{}\033[0m\n".format(log["score"])
                output += "🔹 Accuracy: \033[1m{}\033[0m\n".format(log["accuracy"])
                output += "🔹 Precision: \033[1m{}\033[0m".format(log["precision"])

                print(output)
                f.write(j)
                f.write("\n")
            except Exception as e:
                # Best effort: a logging failure must not stop the search.
                print(f"Error while writing to file: {e}")

        all_scores.append({"model": type(model).__name__, "params": params, "score": score})
        train_models.append(trained_model)

        if score > best_score:
            print("New best model: " + str(score) + ", last score was: " + str(best_score))
            # Keep the fitted copy, not the shared unfitted prototype.
            best_model = trained_model
            best_params = params
            best_score = score

    return best_model, best_params, best_score, all_scores, train_models

# random_search reads the history file first, so make sure it exists.
# Append mode creates a missing file and never truncates an existing one.
open(output_history, "a").close()

# Run the search
timeout = 7200
best_model, best_params, best_score, all_scores, train_models = random_search(timeout, models)

# random_search returns a fitted estimator when the best score was reached in
# this run, the model name (str) when it comes from the history file, and None
# when there is no history and no trial succeeded.
best_is_estimator = best_model is not None and not isinstance(best_model, str)

# Save best model (only when there is an actual estimator to persist;
# previously a None result was pickled to a "..._NoneType" file).
if best_is_estimator:
    with open(f"{model_pickle}_{type(best_model).__name__}", 'wb') as f:
        try:
            pickle.dump(best_model, f)
        except Exception as e:
            print("Error pickling best model: ", str(e))

# Save all models
with open(all_models_pickle, 'wb') as f:
    try:
        pickle.dump(train_models, f)
    except Exception as e:
        print("Error pickling models: ", str(e))

# Print the best model and its score
if best_is_estimator:
    best_model_name = type(best_model).__name__
else:
    best_model_name = best_model  # already a name, or None
print("Best model:", best_model_name)
print("Best parameters:", best_params)
print("Best score:", best_score)

In [None]:
# Read the history file (one JSON object per line) and parse it.
# A dedicated name is used so the CSV path stored in `data` is not overwritten.
with open(output_history, 'r') as file:
    history_records = [json.loads(line) for line in file if line.strip()]

# Sort the models by score in descending order
sorted_models = sorted(history_records, key=lambda x: x['score'], reverse=True)

# Keep track of model types already printed
printed_models = set()

# Print the best configuration of each model type, for at most this many types
max_printed_models = 13
for model_data in sorted_models:
    model = model_data['model']
    if model in printed_models:
        # A better-scoring configuration of this model type was already shown.
        continue
    print(f"Model: {model}")
    print(f"Params: {model_data['params']}")
    print(f"Score: {model_data['score']}")
    print()
    printed_models.add(model)
    if len(printed_models) == max_printed_models:
        break

In [None]:
# Load the data: collect every recorded score per model type.
model_scores = {}
with open(output_history, "r") as history_file:
    for line in history_file:
        if not line.strip():
            continue
        j = json.loads(line)
        model_scores.setdefault(j["model"], []).append(j["score"])

# Prepare data for seaborn: one (model, score) row per recorded evaluation.
# A dedicated name is used so the CSV path stored in `data` is not overwritten.
scores_df = pd.DataFrame(
    [(name, value) for name, values in model_scores.items() for value in values],
    columns=['Model', 'Score'],
)
if scores_df.empty:
    # min()/max() below would fail with an opaque error on an empty history.
    raise ValueError(f"No scores found in '{output_history}'; nothing to plot")

# Compute max scores
max_scores = scores_df.groupby('Model')['Score'].max().reset_index()

# Sort dataframe by max score
max_scores = max_scores.sort_values(by='Score', ascending=False)

# Create a color palette
pal = sns.color_palette("RdYlGn", len(max_scores))

# Plotting
fig, ax = plt.subplots(figsize=(15, 10))
# NOTE(review): the style is set after the axes were created, so presumably it
# only affects figures created later -- confirm whether that is intended.
sns.set(style="whitegrid")
# The palette is reversed so the first (best) bar gets the green end.
sns.barplot(x='Score', y='Model', data=max_scores, palette=np.array(pal[::-1]), ax=ax)

# Add grid
ax.grid(which='major', axis='both', linestyle='--')

# Add data labels, centred inside each bar
for i, v in enumerate(max_scores["Score"]):
    ax.text(v / 2, i, "{:.4f}".format(v), color='black', ha='center', va='center')

# Improve labels and ticks
# The plotted value is the logged "score" (mean of accuracy and precision),
# not the accuracy alone, so the axis is labelled accordingly.
ax.set_xlabel('Max Score', fontsize=15, weight='bold', labelpad=10)
ax.set_ylabel('Model', fontsize=15, weight='bold', labelpad=10)
ax.set_title('Comparison of Model Scores', fontsize=20, weight='bold', pad=20)
plt.xticks(np.arange(0, 1.01, 0.1), fontsize=12)
plt.yticks(fontsize=12)

# Add a color bar spanning the range of the plotted scores
sm = plt.cm.ScalarMappable(
    cmap="RdYlGn",
    norm=plt.Normalize(vmin=max_scores['Score'].min(), vmax=max_scores['Score'].max()),
)
# Older matplotlib releases require the mappable to carry an array.
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)

plt.tight_layout()
plt.savefig(output_image, dpi=300, bbox_inches='tight')
plt.show()