# BM25-Based Evaluation for CTI vs. YARA Rules

In [4]:
# 📦 Imports
import os
import pickle
import statistics
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import numpy as np
import re

#  Load Data

In [5]:
def load_from_pickle(file_path):
    """Deserialize and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary read mode and is closed again before the
    loaded object is handed back to the caller.

    Note: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

In [6]:
# Adjust paths as needed
# The project root is taken to be three directory levels above the current
# working directory, so the result depends on where the kernel was started.
working_dir = os.getcwd()
one_level_up = os.path.dirname(working_dir)
two_levels_up = os.path.dirname(one_level_up)
project_base_path = os.path.dirname(two_levels_up)
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [7]:
# Location of the pickled CTI -> YARA evaluation set, relative to the project root.
cti_yara_eval_rel_path = "data/evaluation/cti-rule/yara/cti_yara_eval_data.pkl"
cti_yara_eval_data_path = os.path.join(project_base_path, cti_yara_eval_rel_path)
cti_yara_eval_data = load_from_pickle(cti_yara_eval_data_path)

# The mapping's keys are the test CTIs; its values are consumed later as the
# rules associated with each CTI.
test_ctis = [cti for cti in cti_yara_eval_data.keys()]
len(test_ctis)

916

#  Helper Functions

In [8]:
def tokenize(text):
    """Lower-case ``text`` and return its word tokens as a list of strings.

    A token is a maximal run of word characters (alphanumerics and
    underscore); everything else acts as a separator and is discarded.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(lowered)

def map_subset_indices(full_list, subset_list):
    """Map each item of ``subset_list`` to its first position in ``full_list``.

    Args:
        full_list: Sequence to search. Its items must be hashable, because
            they are used as keys of an internal lookup table.
        subset_list: Items to look up. They become keys of the returned
            dict, so repeated items collapse into a single entry.

    Returns:
        dict: ``{item: index}`` in first-seen order of ``subset_list``, where
        ``index`` is the position of the item's first occurrence in
        ``full_list``, or ``-1`` when the item does not occur at all.
    """
    # Build the item -> first-index table in one pass instead of calling
    # ``full_list.index`` (a linear scan) once per subset item.
    # ``setdefault`` keeps the earliest position, matching ``list.index``.
    first_index = {}
    for position, candidate in enumerate(full_list):
        first_index.setdefault(candidate, position)

    return {item: first_index.get(item, -1) for item in subset_list}

def evaluate_topk_match(gt_indices, sorted_pred_indices, top_k):
    """Return the percentage of ground-truth indices found in the top-k predictions.

    Args:
        gt_indices: Ground-truth indices for one query.
        sorted_pred_indices: Predicted indices, best first.
        top_k: Number of leading predictions to inspect.

    Returns:
        ``100 * (#ground-truth indices among the first top_k predictions)
        / len(gt_indices)``, or ``0`` when ``gt_indices`` is empty.
    """
    shortlist = set(sorted_pred_indices[:top_k])
    relevant = set(gt_indices)
    n_found = len(shortlist & relevant)

    # Guard against division by zero for queries without ground truth.
    if not gt_indices:
        return 0
    return 100 * n_found / len(gt_indices)

def average_precision(gt_indices, sorted_pred_indices):
    """Compute average precision of a ranked prediction list for one query.

    Args:
        gt_indices: Ground-truth (relevant) indices; must be hashable.
        sorted_pred_indices: Predicted indices, best first.

    Returns:
        float: Sum of ``hits / rank`` over every rank whose prediction is a
        ground-truth index, divided by ``len(gt_indices)``. Returns ``0.0``
        when ``gt_indices`` is empty.
    """
    if not gt_indices:
        return 0.0

    # Set membership is O(1); testing against the original list would rescan
    # it for every ranked prediction.
    relevant = set(gt_indices)

    hits, precision_sum = 0, 0.0
    for rank, idx in enumerate(sorted_pred_indices, start=1):
        if idx in relevant:
            hits += 1
            precision_sum += hits / rank

    # The denominator stays the length of the list as passed in, so
    # ground-truth entries that are never retrieved still lower the score.
    return precision_sum / len(gt_indices)

#  BM25 Evaluation

In [16]:
# Recall cut-off used for every CTI in this run. Set to an int (e.g. 10, 20, 50)
# for a fixed Recall@K, or to None to use K = number of ground-truth rules of
# the CTI being scored (presumably how the "Top - K" results were produced --
# confirm before relying on it).
TOP_K = 50

# Flatten the rules of every CTI into a single retrieval corpus.
consolidated_dummy_yara_rules = []
for rules in cti_yara_eval_data.values():
    consolidated_dummy_yara_rules.extend(rules)

tokenized_corpus = [tokenize(rule) for rule in consolidated_dummy_yara_rules]
bm25 = BM25Okapi(tokenized_corpus)

# Running totals plus per-CTI values (the latter feed the standard deviations
# printed by the result cells).
total_recall, total_map = 0, 0
recall_k_list = []
map_score_list = []

for cti in tqdm(test_ctis, desc="Evaluating CTI-YARA with BM25"):
    # Ground truth: corpus positions of the rules that belong to this CTI.
    result_idx = map_subset_indices(consolidated_dummy_yara_rules, cti_yara_eval_data[cti])
    gt_indices = list(result_idx.values())

    # Rank the whole corpus against the CTI text, highest BM25 score first.
    tokenized_cti = tokenize(cti)
    scores = bm25.get_scores(tokenized_cti)
    sorted_indices = np.argsort(scores)[::-1].tolist()

    k = len(gt_indices)
    top_k = k if TOP_K is None else TOP_K
    recall_k = evaluate_topk_match(gt_indices, sorted_indices, top_k=top_k)
    map_score = average_precision(gt_indices, sorted_indices)

    recall_k_list.append(recall_k)
    map_score_list.append(map_score)

    total_recall += recall_k
    total_map += map_score

Evaluating CTI-YARA with BM25:   0%|          | 0/916 [00:00<?, ?it/s]

Evaluating CTI-YARA with BM25: 100%|██████████| 916/916 [01:04<00:00, 14.20it/s]


#  Evaluation Results

### Top - K

In [11]:
# Summarise the most recent run of the evaluation cell. The recall figures
# reflect whichever top_k cut-off that run used; MAP does not depend on it.
n = len(test_ctis)
print("\n=== BM25 Evaluation Results ===")

mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")

mean_average_precision = total_map / n
print(f"Mean Average Precision (MAP): {mean_average_precision:.4f}")

recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")

map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== BM25 Evaluation Results ===
Average Recall@K: 20.42%
Mean Average Precision (MAP): 0.2185
Recall@K Standard Deviation: 22.2528
MAP Standard Deviation: 0.2219


### Top - 10

In [13]:
# Summarise the most recent run of the evaluation cell. The recall figures
# reflect whichever top_k cut-off that run used; MAP does not depend on it.
n = len(test_ctis)
print("\n=== BM25 Evaluation Results ===")

mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")

mean_average_precision = total_map / n
print(f"Mean Average Precision (MAP): {mean_average_precision:.4f}")

recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")

map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== BM25 Evaluation Results ===
Average Recall@K: 34.38%
Mean Average Precision (MAP): 0.2185
Recall@K Standard Deviation: 32.7617
MAP Standard Deviation: 0.2219


### Top - 20

In [15]:
# Summarise the most recent run of the evaluation cell. The recall figures
# reflect whichever top_k cut-off that run used; MAP does not depend on it.
n = len(test_ctis)
print("\n=== BM25 Evaluation Results ===")

mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")

mean_average_precision = total_map / n
print(f"Mean Average Precision (MAP): {mean_average_precision:.4f}")

recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")

map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== BM25 Evaluation Results ===
Average Recall@K: 41.25%
Mean Average Precision (MAP): 0.2185
Recall@K Standard Deviation: 35.0272
MAP Standard Deviation: 0.2219


### Top - 50

In [17]:
# Summarise the most recent run of the evaluation cell. The recall figures
# reflect whichever top_k cut-off that run used; MAP does not depend on it.
n = len(test_ctis)
print("\n=== BM25 Evaluation Results ===")

mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")

mean_average_precision = total_map / n
print(f"Mean Average Precision (MAP): {mean_average_precision:.4f}")

recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")

map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== BM25 Evaluation Results ===
Average Recall@K: 51.35%
Mean Average Precision (MAP): 0.2185
Recall@K Standard Deviation: 35.5160
MAP Standard Deviation: 0.2219
