# Imports

In [4]:
import os
import gc
import pickle
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import statistics

# Path Declaration

In [5]:
# Resolve the project root as the directory three levels above the current
# working directory (where the kernel was started).
project_base_path = os.getcwd()
for _hop in range(3):
    project_base_path = os.path.dirname(project_base_path)
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [6]:
# Pickle file holding the CTI -> YARA rules evaluation data, located relative
# to the project root computed above.
cti_yara_eval_data_path = os.path.join(
    project_base_path,
    "data/evaluation/cti-rule/yara/cti_yara_eval_data.pkl",
)
cti_yara_eval_data_path

'/home/ANONYMOUS/projects/FALCON/data/evaluation/cti-rule/yara/cti_yara_eval_data.pkl'

# Misc Functions

In [7]:
def load_from_pickle(file_path):
    """
    Read a pickle file and return the object stored in it.

    Any failure (missing file, corrupt data, ...) is reported with ``print``
    and turned into a ``None`` return value instead of an exception, so
    callers must check the result before using it.

    Note: unpickling can execute arbitrary code; only use this on files
    from a trusted source.

    :param file_path: Path to the pickle file
    :return: The unpickled object, or None if loading failed
    """
    loaded = None
    try:
        with open(file_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except Exception as err:
        print(f"Error loading data from pickle: {err}")
        loaded = None
    return loaded

In [8]:
def map_subset_indices(full_list, subset_list):
    """
    Maps each string in the subset list to its index in the full list.

    The full list is scanned once to build a lookup table, so the cost is
    O(len(full_list) + len(subset_list)) rather than one linear scan per
    subset item.

    Args:
        full_list (list of str): The complete list of strings. Entries must be
            hashable (strings are).
        subset_list (list of str): Strings to look up in the full list.

    Returns:
        dict: A dictionary with subset strings as keys and their indices in the
        full list as values. When a string occurs several times in
        ``full_list`` the index of its first occurrence is used; a string that
        does not occur at all is mapped to -1.
    """
    first_position = {}
    for position, value in enumerate(full_list):
        # setdefault keeps the earliest index, matching list.index semantics.
        first_position.setdefault(value, position)

    return {item: first_position.get(item, -1) for item in subset_list}

In [9]:
def evaluate_topk_match(gt_indices, sorted_pred_indices, top_k):
    """
    Percentage of ground-truth items found among the first ``top_k`` predictions.

    Args:
        gt_indices: Iterable of ground-truth indices. Repeated entries are
            counted once, so duplicates cannot deflate the score.
        sorted_pred_indices: Predicted indices ordered best-first; must support
            slicing.
        top_k (int): Number of leading predictions to consider.

    Returns:
        A value between 0 and 100: the share of distinct ground-truth indices
        present in the top-k predictions. Returns 0 when there is no ground truth.
    """
    relevant = set(gt_indices)
    if not relevant:
        return 0

    top_k_preds = set(sorted_pred_indices[:top_k])
    return 100 * len(top_k_preds & relevant) / len(relevant)

In [10]:
def reciprocal_rank(gt_indices, sorted_pred_indices):
    """
    Reciprocal of the 1-based rank of the first relevant prediction.

    Args:
        gt_indices: Container of ground-truth indices (anything supporting ``in``).
        sorted_pred_indices: Predicted indices ordered best-first.

    Returns:
        float: ``1 / rank`` of the first prediction found in ``gt_indices``,
        or 0.0 when no prediction is relevant.
    """
    ranked = enumerate(sorted_pred_indices, start=1)
    return next(
        (1.0 / position for position, candidate in ranked if candidate in gt_indices),
        0.0,
    )

In [11]:
def average_precision(gt_indices, sorted_pred_indices):
    """
    Average precision of a ranked prediction list against a ground-truth set.

    Precision is sampled at every rank where a relevant item appears and the
    sum is divided by the number of distinct ground-truth items.

    Args:
        gt_indices: Iterable of ground-truth indices. Repeated entries are
            counted once so they cannot inflate the denominator.
        sorted_pred_indices: Predicted indices ordered best-first.

    Returns:
        float: The average precision, or 0.0 when there is no ground truth.
    """
    # A set gives O(1) membership tests and removes duplicate ground-truth entries.
    relevant = set(gt_indices)

    hits, score = 0, 0.0
    for rank, idx in enumerate(sorted_pred_indices, start=1):
        if idx in relevant:
            hits += 1
            score += hits / rank
    return score / len(relevant) if relevant else 0.0


# Environment Setup

In [12]:
from getpass import getpass

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Never hard-code the API key in the notebook. Use the key already present in
# the environment and only prompt for it (without echoing) when it is missing,
# so an existing valid key is not overwritten by a literal.
open_ai_key = os.environ.get('OPENAI_API_KEY') or getpass("OpenAI API key: ")
os.environ['OPENAI_API_KEY'] = open_ai_key
llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

# Data Preparation

### Load Data

In [13]:
# Load the evaluation data back from the pickle file.
cti_yara_eval_data = load_from_pickle(cti_yara_eval_data_path)

# load_from_pickle prints the error and returns None on failure; stop here with
# a clear message instead of failing later with an unrelated AttributeError.
if cti_yara_eval_data is None:
    raise RuntimeError(
        f"Could not load evaluation data from {cti_yara_eval_data_path}"
    )

# Number of top-level entries (one per CTI key).
print(len(cti_yara_eval_data.keys()))

916


### Pre-processing

In [14]:
# Flatten the per-CTI rule lists into one list, in the mapping's iteration
# order. This list is the document collection used for retrieval.
consolidated_dummy_yara_rules = [
    rule
    for rule_group in cti_yara_eval_data.values()
    for rule in rule_group
]

In [15]:
# Total number of rule entries across all CTIs (repeated rules, if any, are counted each time).
len(consolidated_dummy_yara_rules)

5106

# Evaluation

In [16]:
import numpy as np

In [17]:
class RAG:
    """
    Minimal embedding-based retriever.

    Documents are embedded once with ``OpenAIEmbeddings``; queries are embedded
    on demand and documents are ranked by cosine similarity to the query.
    """

    def __init__(self, model="gpt-4o"):
        """
        :param model: Chat model name passed to ``ChatOpenAI``.
        """
        # The chat model is created here but is not used by the retrieval methods below.
        self.llm = ChatOpenAI(model=model)
        self.embeddings = OpenAIEmbeddings()
        self.doc_embeddings = None
        self.docs = None
        # Cache of (source embeddings, embedding matrix, row norms); see _doc_matrix_and_norms.
        self._doc_cache = None

    def load_documents(self, documents):
        """
        Store the documents and compute one embedding per document.

        :param documents: Sequence of document strings to index
        """
        self.docs = documents
        self.doc_embeddings = self.embeddings.embed_documents(documents)
        self._doc_cache = None  # invalidate; rebuilt lazily on the next query

    def _doc_matrix_and_norms(self):
        """Return the document embeddings as a 2-D array plus their row norms, cached per embedding list."""
        cache = getattr(self, "_doc_cache", None)
        # Rebuild when nothing is cached yet or doc_embeddings was replaced.
        if cache is None or cache[0] is not self.doc_embeddings:
            matrix = np.asarray(self.doc_embeddings, dtype=float)
            cache = (self.doc_embeddings, matrix, np.linalg.norm(matrix, axis=1))
            self._doc_cache = cache
        return cache[1], cache[2]

    def get_top_k_docs(self, query, k=10):
        """
        Rank the loaded documents by cosine similarity to ``query``.

        :param query: Query text to embed and compare against the documents
        :param k: Maximum number of documents to return
        :return: Tuple ``(docs, indices)``: the top-k documents, best first, and
                 a NumPy array with their positions in the loaded document list
        :raises ValueError: If no documents/embeddings have been loaded
        """
        if not self.docs or not self.doc_embeddings:
            raise ValueError("Documents and their embeddings are not loaded.")

        query_vec = np.asarray(self.embeddings.embed_query(query), dtype=float)
        doc_matrix, doc_norms = self._doc_matrix_and_norms()

        # Cosine similarity for all documents at once instead of a Python loop.
        dot_products = doc_matrix @ query_vec
        denominators = doc_norms * np.linalg.norm(query_vec)

        # A zero-norm vector would give 0/0 = NaN, and NaN sorts to the end of
        # argsort, i.e. to the *front* after reversal. Rank such entries last.
        similarities = np.full(dot_products.shape, -np.inf)
        np.divide(dot_products, denominators, out=similarities, where=denominators > 0)

        top_k_indices = np.argsort(similarities)[::-1][:k]
        return [self.docs[i] for i in top_k_indices], top_k_indices


In [18]:
test_ctis = list(cti_yara_eval_data.keys())

r_at_10_list = []
map_list = []
failed_ctis = []  # CTIs skipped because of an error; they are excluded from the averages

rag = RAG()
rag.load_documents(consolidated_dummy_yara_rules)

for cti in tqdm(test_ctis, desc="Evaluating Recall@10 and MAP"):
    try:
        gt_rules = cti_yara_eval_data[cti]
        top_docs, top_indices = rag.get_top_k_docs(cti, k=10)

        # Positions of the ground-truth rules inside the consolidated rule list.
        # NOTE(review): map_subset_indices returns the first occurrence of a rule;
        # if the same rule text appears more than once in the consolidated list,
        # a retrieved duplicate at another position is not counted - confirm.
        gt_indices = set(map_subset_indices(consolidated_dummy_yara_rules, gt_rules).values())
        gt_indices.discard(-1)  # -1 is the "not found" sentinel, not a real position

        # R@10 as used here is a hit rate: 1.0 if at least one ground-truth rule
        # is in the top 10, else 0.0 (not the fraction of ground truth retrieved).
        retrieved_set = {int(i) for i in top_indices}
        hit = not gt_indices.isdisjoint(retrieved_set)

        # Average precision over the retrieved top 10, via the shared helper.
        ap = average_precision(gt_indices, top_indices)

        # Append both scores together so the two lists stay aligned.
        r_at_10_list.append(1.0 if hit else 0.0)
        map_list.append(ap)

    except Exception as e:
        print(f"Error processing CTI: {e}")
        failed_ctis.append(cti)
        continue

if failed_ctis:
    print(f"Skipped {len(failed_ctis)} of {len(test_ctis)} CTIs due to errors.")


Evaluating Recall@10 and MAP: 100%|██████████| 916/916 [1:04:58<00:00,  4.26s/it]  


### Top-10 Results

In [19]:
import statistics

# Summarise the per-query scores collected by the evaluation loop:
# mean and sample standard deviation of each metric.
print("\n=== Retrieval Evaluation for RAGAS Retriever ===")

mean_recall_at_10 = sum(r_at_10_list) / len(r_at_10_list)
print(f"Recall@10: {mean_recall_at_10:.4f}")

mean_average_precision = sum(map_list) / len(map_list)
print(f"MAP:       {mean_average_precision:.4f}")

recall_at_10_std = statistics.stdev(r_at_10_list)
print(f"Recall@10 Std Dev: {recall_at_10_std:.4f}")

average_precision_std = statistics.stdev(map_list)
print(f"MAP Std Dev:       {average_precision_std:.4f}")


=== Retrieval Evaluation for RAGAS Retriever ===
Recall@10: 0.6900
MAP:       0.2060
Recall@10 Std Dev: 0.4628
MAP Std Dev:       0.2118
