In [None]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  AUC and similarity plots                                                  #
#                                                                            #
##############################################################################



# Some parts of this code have been changed by sgr-ht in 2023-2024.

In [None]:
import json
import math
import numpy as np
import os
import pandas as pd

from IPython.display import display
from collections import defaultdict
from sklearn import metrics

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

### Utility functions

In [None]:
def merge_data(df_pairs, df_similarity, is_pos):
    """Attach the similarity scores to the pairs and add a ground-truth label.

    Args:
        df_pairs: DataFrame of function pairs, identified by the columns
            'idb_path_1', 'fva_1', 'idb_path_2', 'fva_2' and 'db_type'.
        df_similarity: DataFrame with the same identifying columns plus the
            columns to attach to each pair.
        is_pos: Truthy when the pairs are positive ones, falsy otherwise.

    Returns:
        A new DataFrame holding every row of df_pairs (left join) with the
        matching columns of df_similarity, plus a 'gt' column set to 1 for
        positive pairs and to -1 for negative pairs.
    """
    pair_keys = ['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2', 'db_type']

    # Left join: pairs without a match in df_similarity are kept
    merged = df_pairs.merge(
        df_similarity,
        how='left',
        left_on=pair_keys,
        right_on=pair_keys)

    # Ground-truth label: 1 for positive pairs, -1 for negative pairs
    label = 1 if is_pos else -1
    merged['gt'] = [label] * len(merged)

    return merged

In [None]:
def plot_sim_and_roc(df_pos, df_neg, test_name, output_dir):
    """Plot one ROC curve per task plus a similarity boxplot; return the AUCs.

    Args:
        df_pos: DataFrame of positive pairs with 'db_type', 'sim' and 'gt'
            columns (as produced by merge_data).
        df_neg: DataFrame of negative pairs with the same columns.
        test_name: Name of the test being plotted. Used in the error message
            and by the commented-out code that saves the figures.
        output_dir: Destination folder for the figures. Only referenced by
            the commented-out savefig calls.

    Returns:
        A list with one [task, auc] entry per task found in df_pos, where
        task is right-aligned to 20 characters and auc is formatted with
        three decimals.

    Raises:
        ValueError: If df_pos contains no 'db_type' value.
    """
    result_list = list()

    task_list = sorted(set(df_pos['db_type']))
    n_tasks = len(task_list)
    if n_tasks == 0:
        raise ValueError(
            "No 'db_type' value found in the positive pairs of {}".format(
                test_name))

    # Layout of the ROC grid: compact single row for 3 or 4 tasks,
    # otherwise 3 columns and a taller figure
    plt_height = 10
    plt_width = 3
    if n_tasks == 3:
        plt_height = 3
    elif n_tasks == 4:
        plt_height = 3
        plt_width = 4

    # Round up so that every task gets its own axes, whatever n_tasks is
    n_rows = math.ceil(n_tasks / plt_width)

    # squeeze=False always returns a 2D array of axes, even for a 1x1 grid
    fig_auc, roc_axes = plt.subplots(
        n_rows,
        plt_width,
        figsize=(10, plt_height),
        squeeze=False)
    roc_axes = roc_axes.reshape(-1)

    sim_list = list()
    labels_list = list()

    for ax, task in zip(roc_axes, task_list):
        df_pos_task = df_pos[df_pos['db_type'] == task]
        df_neg_task = df_neg[df_neg['db_type'] == task]

        sim_list.append(df_pos_task['sim'])
        sim_list.append(df_neg_task['sim'])

        labels_list.append('Pos (task: {})'.format(task))
        labels_list.append('Neg (task: {})'.format(task))

        pred_list = list(df_pos_task['sim'].values)
        pred_list += list(df_neg_task['sim'].values)
        gt_list = list(df_pos_task['gt'].values)
        gt_list += list(df_neg_task['gt'].values)

        # AUC
        roc_auc = metrics.roc_auc_score(gt_list, pred_list)
        result_list.append(["%20s" % (task), "%0.3f" % (roc_auc)])

        # FPR vs. TPR plot
        fpr, tpr, _ = metrics.roc_curve(gt_list, pred_list)
        ax.plot(fpr, tpr, linewidth=1.0)
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        ax.set_title("AUC = %0.3f - Task: %s" % (roc_auc, task))

    # Hide the cells of the grid that are not bound to any task
    for unused_ax in roc_axes[n_tasks:]:
        unused_ax.set_visible(False)

    fig_auc.tight_layout()
    # fig_path = os.path.join(output_dir, "{}_roc.png".format(test_name))
    # plt.savefig(fig_path, dpi=300)
    plt.show()

    # Boxplot of the similarity values. The lists are reversed so that the
    # first task is drawn at the top of the horizontal plot.
    fig_bplot, ax_bplot = plt.subplots(figsize=(10, plt_height))
    bplot = ax_bplot.boxplot(
        x=sim_list[::-1],
        labels=labels_list[::-1],
        showfliers=False,
        patch_artist=True,
        vert=False)
    ax_bplot.set_title(
        "Similarity distribution for positive and negative pairs")
    # Color every second box to tell apart the two kinds of pairs
    for c, patch in enumerate(bplot['boxes']):
        if c % 2:
            patch.set_facecolor('lightblue')
    # fig_path = os.path.join(output_dir, "{}_boxplot.png".format(test_name))
    # plt.savefig(fig_path, dpi=300)
    plt.show()

    return result_list

In [None]:
def compute_auc_and_plot(df_pos, df_neg, results_dir, output_dir):
    """Process every result file of results_dir and collect the AUC values.

    For each CSV file of results_dir whose name contains "pos_testing"
    (visited in sorted order), the matching "neg_testing" file is loaded,
    the histograms of the two 'sim' columns are shown, the similarities are
    merged onto df_pos / df_neg and the result is handed to plot_sim_and_roc.

    Args:
        df_pos: DataFrame of the positive testing pairs.
        df_neg: DataFrame of the negative testing pairs.
        results_dir: Folder containing the pos_testing / neg_testing CSVs.
        output_dir: Folder forwarded to plot_sim_and_roc.

    Returns:
        One list per processed file, made of ['title', test_name] followed
        by the entries returned by plot_sim_and_roc.
    """
    all_results = []

    for pos_csv in sorted(os.listdir(results_dir)):
        # Only the positive result files drive the loop; the negative ones
        # are derived from their name
        is_pos_result = pos_csv.endswith(".csv") and "pos_testing" in pos_csv
        if not is_pos_result:
            continue

        neg_csv = pos_csv.replace("pos_testing", "neg_testing")
        print("[D] Processing\n\t{}\n\t{}".format(pos_csv, neg_csv))

        test_name = pos_csv.replace("pos_testing_", "").replace(".csv", "")

        df_pos_sim = pd.read_csv(os.path.join(results_dir, pos_csv))
        df_neg_sim = pd.read_csv(os.path.join(results_dir, neg_csv))

        # Every pair of the result files must come with a similarity value
        assert df_pos_sim['sim'].isna().sum() == 0
        assert df_neg_sim['sim'].isna().sum() == 0

        # Similarity distribution of positive and negative pairs
        df_pos_sim['sim'].hist(bins=200)
        df_neg_sim['sim'].hist(bins=200, alpha=0.8)
        # fig_path = os.path.join(output_dir, "{}_sim.png".format(test_name))
        # plt.savefig(fig_path, dpi=300)
        plt.show()

        # Attach the similarities to the testing pairs
        merged_pos = merge_data(df_pos, df_pos_sim, is_pos=True)
        merged_neg = merge_data(df_neg, df_neg_sim, is_pos=False)

        task_aucs = plot_sim_and_roc(
            merged_pos, merged_neg, test_name, output_dir)
        all_results.append([['title', test_name], *task_aucs])

    return all_results

In [None]:
def from_list_to_df(auc_list):
    """Convert the list returned by compute_auc_and_plot into a DataFrame.

    Args:
        auc_list: List with one entry per model. Each entry is a list of
            [name, value] string pairs; the pair named 'title' carries the
            model name, the other ones carry one value per task.

    Returns:
        A DataFrame with one row per model. The 'title' column is renamed
        to 'model_name' and the "Dataset-1_", "Dataset-2-CodeCMR_" and
        "Dataset-2_" substrings are removed from its values. A task that is
        missing for a model is left as NaN. An empty auc_list gives an empty
        DataFrame with only the 'model_name' column.
    """
    if not auc_list:
        # Nothing was processed: keep the expected column so that callers
        # can still display and save the (empty) table
        return pd.DataFrame(columns=["model_name"])

    # One record per model: building rows (instead of one list per column)
    # keeps the values aligned even when the models cover different tasks
    records = [
        {entry[0].strip(): entry[1].strip() for entry in row}
        for row in auc_list
    ]
    df_auc = pd.DataFrame(records).rename(columns={"title": "model_name"})

    # Strip the dataset prefix from the model names (plain-text replacement)
    for prefix in ("Dataset-1_", "Dataset-2-CodeCMR_", "Dataset-2_"):
        df_auc["model_name"] = df_auc["model_name"].str.replace(
            prefix, "", regex=False)
    return df_auc

In [None]:
# Create output folders (pure Python, so it does not depend on a POSIX shell).
# exist_ok=True makes the cell safe to re-run.
os.makedirs("metrics_and_plots/Dataset-1", exist_ok=True)
os.makedirs("metrics_and_plots/Dataset-BINKIT", exist_ok=True)
# os.makedirs("metrics_and_plots/Dataset-1-CodeCMR", exist_ok=True)
# os.makedirs("metrics_and_plots/Dataset-2", exist_ok=True)

## Dataset 1

In [None]:
# Folder with the similarity CSVs to evaluate, and folder for the AUC table
RESULTS_DIR = "../data/Dataset-1/"
OUTPUT_DIR = "metrics_and_plots/Dataset-1/"

# Folder holding the positive / negative testing pairs
base_path = "../../DBs/Dataset-1/pairs/testing/"

# Load the positive pairs first, then the negative ones
df_pos_testing, df_neg_testing = (
    pd.read_csv(os.path.join(base_path, pairs_csv))
    for pairs_csv in ("pos_testing_Dataset-1.csv", "neg_testing_Dataset-1.csv")
)

# Compute the AUCs, then show them as a table and store them as CSV
auc_list = compute_auc_and_plot(
    df_pos=df_pos_testing,
    df_neg=df_neg_testing,
    results_dir=RESULTS_DIR,
    output_dir=OUTPUT_DIR)
df_auc = from_list_to_df(auc_list)
display(df_auc)
df_auc.to_csv(os.path.join(OUTPUT_DIR, "df_auc.csv"))

## Dataset-BINKIT

In [None]:
# Folder with the similarity CSVs to evaluate, and folder for the AUC table
RESULTS_DIR = "../data/Dataset-BINKIT/"
OUTPUT_DIR = "metrics_and_plots/Dataset-BINKIT/"

# Folder holding the positive / negative testing pairs
base_path = "../../DBs/Dataset-1/pairs/testing-BINKIT/"

# Load the positive pairs first, then the negative ones
df_pos_testing, df_neg_testing = (
    pd.read_csv(os.path.join(base_path, pairs_csv))
    for pairs_csv in (
        "pos_testing_Dataset-BINKIT.csv", "neg_testing_Dataset-BINKIT.csv")
)

# Compute the AUCs, then show them as a table and store them as CSV
auc_list = compute_auc_and_plot(
    df_pos=df_pos_testing,
    df_neg=df_neg_testing,
    results_dir=RESULTS_DIR,
    output_dir=OUTPUT_DIR)
df_auc = from_list_to_df(auc_list)
display(df_auc)
df_auc.to_csv(os.path.join(OUTPUT_DIR, "df_auc.csv"))