# TF-IDF + Cosine Similarity Evaluation for CTI vs. YARA Rules

In [1]:
# 📦 Imports
import os
import pickle
import statistics
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

#  Load Data

In [2]:
def load_from_pickle(file_path):
    """Read one pickled object from ``file_path`` and return it.

    The file is opened in binary mode and closed again before returning.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    with open(file_path, mode='rb') as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

# Adjust paths as needed: the project root is taken to be three directory
# levels above the directory the notebook kernel is running in.
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
grandparent_dir = os.path.dirname(parent_dir)
project_base_path = os.path.dirname(grandparent_dir)
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [3]:
# Pickled evaluation set, located relative to the project root.
cti_yara_eval_data_path = os.path.join(
    project_base_path,
    "data/evaluation/cti-rule/yara/cti_yara_eval_data.pkl",
)
cti_yara_eval_data = load_from_pickle(cti_yara_eval_data_path)

# Every key of the loaded mapping becomes one evaluation query.
test_ctis = [*cti_yara_eval_data.keys()]
len(test_ctis)

916

#  Helper Functions

In [4]:
def map_subset_indices(full_list, subset_list):
    """Map each item of ``subset_list`` to its position in ``full_list``.

    Args:
        full_list: Sequence to search; must support ``.index``.
        subset_list: Items to look up. Each item is used as a dict key, so
            repeated items collapse into a single entry.

    Returns:
        dict of ``item -> index`` where the index is the item's first
        position in ``full_list``, or ``-1`` when the item is not present.
    """
    def first_position(entry):
        # list.index signals "not found" with ValueError; report that as -1.
        try:
            return full_list.index(entry)
        except ValueError:
            return -1

    return {entry: first_position(entry) for entry in subset_list}

def evaluate_topk_match(gt_indices, sorted_pred_indices, top_k):
    """Percentage of ground-truth indices found among the first ``top_k`` predictions.

    Args:
        gt_indices: Ground-truth indices for one query.
        sorted_pred_indices: Predicted indices, best match first.
        top_k: Number of leading predictions to inspect.

    Returns:
        ``100 * (distinct matches) / len(gt_indices)``, or ``0`` when
        ``gt_indices`` is empty.
    """
    shortlist = set(sorted_pred_indices[:top_k])
    found = shortlist & set(gt_indices)
    if not gt_indices:
        # No ground truth: report zero instead of dividing by zero.
        return 0
    return 100 * len(found) / len(gt_indices)

def average_precision(gt_indices, sorted_pred_indices):
    """Average precision of one ranked prediction list.

    Walks ``sorted_pred_indices`` from best to worst. Each time a ground-truth
    index is met, the precision at that rank (hits so far / rank) is added to
    a running sum, which is finally divided by ``len(gt_indices)``.

    Args:
        gt_indices: Ground-truth indices for one query (hashable values).
        sorted_pred_indices: Predicted indices, best match first.

    Returns:
        The average precision as a float; ``0.0`` when ``gt_indices`` is empty.
    """
    if not gt_indices:
        return 0.0

    # Build the lookup once: a set keeps each per-rank membership test O(1)
    # instead of rescanning the ground-truth list for every ranked item.
    relevant = set(gt_indices)

    hits, score = 0, 0.0
    for rank, idx in enumerate(sorted_pred_indices, start=1):
        if idx in relevant:
            hits += 1
            score += hits / rank
    # Normalise by the original ground-truth length, as the ranking may not
    # contain every relevant index.
    return score / len(gt_indices)

#  TF-IDF Evaluation

In [11]:
# Recall cutoff for this run. An integer scores the first TOP_K ranked rules
# for every CTI; None scores the first k rules, where k is that CTI's own
# number of ground-truth rules.
TOP_K = 50

# Flatten every CTI's rule list into a single corpus to rank against.
consolidated_dummy_yara_rules = []
for rules in cti_yara_eval_data.values():
    consolidated_dummy_yara_rules.extend(rules)
# NOTE(review): a rule text that occurs under more than one CTI is appended
# again here, while map_subset_indices only reports its first position, so any
# later copy would be scored as a non-relevant result. Confirm the data
# contains no duplicate rules.

# Fit the vocabulary and IDF weights on the rules only; CTI texts are then
# projected into that same space inside the loop.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(consolidated_dummy_yara_rules)

# Running totals and per-CTI lists are both kept because the result cells
# read all four names.
total_recall, total_map = 0, 0
recall_k_list = []
map_score_list = []

for cti in tqdm(test_ctis, desc="Evaluating CTI-YARA with TF-IDF"):
    # Positions of this CTI's own rules inside the consolidated corpus.
    result_idx = map_subset_indices(consolidated_dummy_yara_rules, cti_yara_eval_data[cti])
    gt_indices = list(result_idx.values())

    # Rank every rule by cosine similarity to the CTI text, best first.
    tfidf_cti = vectorizer.transform([cti])
    cosine_scores = cosine_similarity(tfidf_cti, tfidf_matrix).flatten()
    sorted_indices = np.argsort(cosine_scores)[::-1].tolist()

    # Resolve the cutoff actually used for this CTI (see TOP_K above).
    k = len(gt_indices) if TOP_K is None else TOP_K
    recall_k = evaluate_topk_match(gt_indices, sorted_indices, top_k=k)
    map_score = average_precision(gt_indices, sorted_indices)

    recall_k_list.append(recall_k)
    map_score_list.append(map_score)

    total_recall += recall_k
    total_map += map_score

Evaluating CTI-YARA with TF-IDF:   0%|          | 0/916 [00:00<?, ?it/s]

Evaluating CTI-YARA with TF-IDF: 100%|██████████| 916/916 [00:01<00:00, 517.22it/s]


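#  Evaluation Results

The result cells below all run the same printing code. Each output appears to have been captured after re-running the evaluation cell with a different recall cutoff (`top_k`), which is why Recall@K changes from section to section while MAP stays at 0.2132.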

### Top - k

In [6]:
# Summarise whatever the evaluation cell last accumulated. This cell has no
# cutoff of its own; "K" is the one used in that run.
# NOTE(review): for this section K is presumably each CTI's ground-truth count
# (the evaluation loop computes k = len(gt_indices)) - confirm.
n = len(test_ctis)
print("\n=== TF-IDF Evaluation Results ===")
mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")
mean_ap = total_map / n
print(f"Mean Average Precision (MAP): {mean_ap:.4f}")
recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")
map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== TF-IDF Evaluation Results ===
Average Recall@K: 19.85%
Mean Average Precision (MAP): 0.2132
Recall@K Standard Deviation: 22.3214
MAP Standard Deviation: 0.2195


### Top - 10

In [8]:
# Summarise whatever the evaluation cell last accumulated. This cell has no
# cutoff of its own; per the section heading the run used K = 10.
n = len(test_ctis)
print("\n=== TF-IDF Evaluation Results ===")
mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")
mean_ap = total_map / n
print(f"Mean Average Precision (MAP): {mean_ap:.4f}")
recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")
map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== TF-IDF Evaluation Results ===
Average Recall@K: 33.91%
Mean Average Precision (MAP): 0.2132
Recall@K Standard Deviation: 32.2623
MAP Standard Deviation: 0.2195


### Top - 20

In [10]:
# Summarise whatever the evaluation cell last accumulated. This cell has no
# cutoff of its own; per the section heading the run used K = 20.
n = len(test_ctis)
print("\n=== TF-IDF Evaluation Results ===")
mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")
mean_ap = total_map / n
print(f"Mean Average Precision (MAP): {mean_ap:.4f}")
recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")
map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== TF-IDF Evaluation Results ===
Average Recall@K: 41.48%
Mean Average Precision (MAP): 0.2132
Recall@K Standard Deviation: 34.8201
MAP Standard Deviation: 0.2195


### Top - 50

In [12]:
# Summarise whatever the evaluation cell last accumulated. This cell has no
# cutoff of its own; per the section heading the run used K = 50.
n = len(test_ctis)
print("\n=== TF-IDF Evaluation Results ===")
mean_recall = total_recall / n
print(f"Average Recall@K: {mean_recall:.2f}%")
mean_ap = total_map / n
print(f"Mean Average Precision (MAP): {mean_ap:.4f}")
recall_std = statistics.stdev(recall_k_list)
print(f"Recall@K Standard Deviation: {recall_std:.4f}")
map_std = statistics.stdev(map_score_list)
print(f"MAP Standard Deviation: {map_std:.4f}")


=== TF-IDF Evaluation Results ===
Average Recall@K: 52.14%
Mean Average Precision (MAP): 0.2132
Recall@K Standard Deviation: 35.9821
MAP Standard Deviation: 0.2195
