# Imports

In [70]:
import os
import pickle
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from torch.nn.functional import normalize
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
import time  # Optional, to slow down updates a little

# Path Declaration

In [71]:
# Project root = three directory levels above the kernel's current working directory.
# NOTE(review): this depends on where the notebook kernel was started; launching it
# from a different folder silently changes every path derived below.
project_base_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [None]:
# Location of the first pickled {YARA rule -> generated description} dictionary.
# NOTE(review): the recorded output of this cell ends in "yara-rules_gpt.pkl" while the
# code points at "yara-rules_v1.pkl", and the cell has no execution count — it looks
# edited after its last run. Re-execute and confirm which file is intended.
saved_v1_generated_data_path = os.path.join(project_base_path, "data/generation/yara/yara-rules_v1.pkl")
saved_v1_generated_data_path

'/home/ANONYMOUS/projects/FALCON/data/generation/yara/yara-rules_gpt.pkl'

In [None]:
# Location of the second pickled {YARA rule -> generated description} dictionary.
# NOTE(review): the recorded output of this cell ends in "yara-rules_llama_33.pkl" while
# the code points at "yara-rules_v2.pkl", and the cell has no execution count — it looks
# edited after its last run. Re-execute and confirm which file is intended.
saved_v2_generated_data_path = os.path.join(project_base_path, "data/generation/yara/yara-rules_v2.pkl")
saved_v2_generated_data_path

'/home/ANONYMOUS/projects/FALCON/data/generation/yara/yara-rules_llama_33.pkl'

# Environment Setup

In [74]:
# Fixed seed; passed as `random_state` to train_test_split so the split is reproducible.
SEED = 42

In [75]:
# Take the OpenAI API key from the environment instead of writing it into the
# notebook: a key literal in a cell ends up in version control and shared copies,
# and assigning it here would overwrite a valid key that was already exported.
open_ai_key = os.environ.get('OPENAI_API_KEY', "")
if not open_ai_key:
    # Best-effort notice only; cells that call the OpenAI API will fail later
    # if the variable is still missing.
    print("OPENAI_API_KEY is not set; export it before running the OpenAI embedding cells.")

# Load Dataset

In [76]:
def load_from_pickle(file_path) -> dict:
    """
    Read a pickled object from disk.

    Any exception raised while opening or unpickling the file is caught,
    reported on stdout, and signalled to the caller by a ``None`` return
    value instead of being re-raised.

    :param file_path: Path to the pickle file
    :return: The unpickled object, or ``None`` when loading failed
    """
    loaded = None
    try:
        # pickle.load can execute arbitrary code: only use on trusted files.
        with open(file_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except Exception as err:
        print(f"Error loading data from pickle: {err}")
    return loaded

In [77]:
def get_first_n_elements(dictionary: dict, n: int) -> dict:
    """
    Build a new dictionary from the leading entries of ``dictionary``.

    Entries are taken in the dictionary's iteration order by slicing the
    list of items with ``[:n]``, so list-slice semantics apply to ``n``.

    :param dictionary: The input dictionary
    :param n: Upper bound of the slice applied to the item list
    :return: A new dictionary holding the selected entries
    """
    leading_pairs = list(dictionary.items())[:n]
    return {key: value for key, value in leading_pairs}

In [78]:
# Load the first generated dataset from its pickle file and report how many entries it has.
# Note: load_from_pickle returns None on failure, in which case `.keys()` below raises.
loaded_v1_data = load_from_pickle(saved_v1_generated_data_path)
print(len(loaded_v1_data.keys()))

4588


In [79]:
# Keep the first 10 entries of the first dataset as a small fixed sample; these
# entries are filtered out of `full_dataset` further down.
yara_cti_sample_dict = get_first_n_elements(loaded_v1_data, 10)

In [80]:
# Load the second generated dataset from its pickle file and report how many entries it has.
# Note: load_from_pickle returns None on failure, in which case `.keys()` below raises.
loaded_v2_data = load_from_pickle(saved_v2_generated_data_path)
print(len(loaded_v2_data.keys()))

4587


In [81]:
# Display the 10-entry sample (keys are YARA rule texts, values the generated descriptions).
yara_cti_sample_dict

{'rule MSIETabularActivex\n{\n        meta:\n                ref = "CVE-2010-0805"\n                impact = 7\n                hide = true\n                author = "@d3t0n4t0r"\n        strings:\n                $cve20100805_1 = "333C7BC4-460F-11D0-BC04-0080C7055A83" nocase fullword\n                $cve20100805_2 = "DataURL" nocase fullword\n                $cve20100805_3 = "true"\n        condition:\n                ($cve20100805_1 and $cve20100805_3) or (all of them)\n}': 'Rule Name\n  MSIETabularActivex\n\nDescription\n  This YARA rule detects a specific vulnerability (CVE-2010-0805) associated with an ActiveX control. The rule targets potentially malicious strings that could be used in exploit attempts related to this vulnerability.\n\nReference\n  CVE-2010-0805\n\nIndicators / String Matches\n  This rule matches the following strings:\n\n  String ID\tPattern\tNotes\n  $cve20100805_1\t"333C7BC4-460F-11D0-BC04-0080C7055A83"\tActiveX control CLSID\n  $cve20100805_2\t"DataURL"\tPos

In [82]:
# Split the sample dictionary into two parallel lists: keys (YARA rules) and
# values (descriptions). keys()/values() iterate in the same order, and — unlike
# unpacking `zip(*items)` — this also works when the dictionary is empty.
yaras = list(yara_cti_sample_dict.keys())
ctis = list(yara_cti_sample_dict.values())

In [83]:
# Sanity check: both lists should have the same length.
len(yaras), len(ctis)

(10, 10)

In [84]:
def format_cti_yara_data_to_training_data(data: list[dict]) -> list[tuple]:
    """
    Flatten several dictionaries into one list of ``(key, value)`` tuples.

    The dictionaries are visited in the order given and each one contributes
    its items in iteration order; keys that occur in more than one dictionary
    therefore appear more than once in the result.

    :param data: The dictionaries to flatten
    :return: List of ``(key, value)`` tuples
    """
    return [
        (key, value)
        for mapping in data
        for key, value in mapping.items()
    ]

In [85]:
# Merge both generated datasets into one list of (key, value) tuples, i.e.
# (anchor, positive) pairs for contrastive training, and report its size.
full_dataset = format_cti_yara_data_to_training_data([loaded_v1_data, loaded_v2_data])
print(len(full_dataset))

9175


In [86]:
def remove_10_test_samples(training_data: list[tuple], test_pairs: dict) -> list[tuple]:
    """
    Drop every training pair that overlaps with the held-out pairs.

    A pair is removed when its first element is a key of ``test_pairs`` or its
    second element is one of the values of ``test_pairs``.

    :param training_data: List of ``(key, value)`` tuples to filter
    :param test_pairs: Dictionary of held-out pairs
    :return: New list with the overlapping pairs removed, original order kept
    """
    # Sets give constant-time membership checks.
    held_out_keys = set(test_pairs.keys())
    held_out_values = set(test_pairs.values())

    kept = []
    for key, value in training_data:
        if key in held_out_keys or value in held_out_values:
            continue
        kept.append((key, value))
    return kept

In [87]:
# Remove every pair that shares a key or a value with the 10-entry sample, so the
# sample stays unseen by training, then report the remaining size.
full_dataset = remove_10_test_samples(full_dataset, yara_cti_sample_dict)
print(len(full_dataset))

9155


In [88]:
# Split into training and testing sets (90% train, 10% test, as set by test_size=0.1)
train_pairs, test_pairs = train_test_split(full_dataset, test_size=0.1, random_state=SEED)

# Training Setup

In [89]:
# Custom Dataset
class ContrastiveDataset(Dataset):
    """
    Dataset of (anchor, positive) text pairs tokenized on access.

    :param data: Indexable collection of ``(anchor, positive)`` pairs
    :param tokenizer: Callable tokenizer; invoked with a two-element list of
        texts plus ``padding``, ``truncation``, ``max_length`` and
        ``return_tensors`` keyword arguments
    :param max_length: Length every sequence is padded/truncated to. When left
        as ``None`` the module-level ``MAX_LEN`` constant is used, which keeps
        existing two-argument construction working.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenize the pair at ``idx``.

        :return: Dict with ``input_ids_a`` / ``attention_mask_a`` for the anchor
            and ``input_ids_b`` / ``attention_mask_b`` for the positive
        """
        anchor, positive = self.data[idx]
        # Resolve the global lazily so it is only required when no explicit
        # max_length was supplied.
        max_length = MAX_LEN if self.max_length is None else self.max_length
        # Both texts go through one tokenizer call; row 0 is the anchor and
        # row 1 the positive.
        encoded = self.tokenizer([anchor, positive], padding="max_length", truncation=True,
                                 max_length=max_length, return_tensors="pt")
        return {
            "input_ids_a": encoded["input_ids"][0],
            "attention_mask_a": encoded["attention_mask"][0],
            "input_ids_b": encoded["input_ids"][1],
            "attention_mask_b": encoded["attention_mask"][1],
        }

In [90]:
# Bi-Encoder Model
class SentenceEncoder(nn.Module):
    """
    Wraps a pretrained transformer and emits one unit-length vector per input.

    The same instance is meant to encode both sides of a pair (bi-encoder).
    """

    def __init__(self, model_name):
        """
        :param model_name: Identifier handed to ``AutoModel.from_pretrained``
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """
        Encode a batch and return L2-normalised embeddings.

        :param input_ids: Token ids forwarded to the wrapped model
        :param attention_mask: Attention mask forwarded to the wrapped model
        :return: The hidden state at sequence position 0 of each item (the
            [CLS] slot for BERT-style tokenizers), scaled to unit L2 norm along
            dim 1 so that dot products equal cosine similarities
        """
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token = hidden_states[:, 0]
        return nn.functional.normalize(first_token, p=2, dim=1)


In [91]:
# Contrastive Loss (InfoNCE / NT-Xent)
def contrastive_loss(emb_a, emb_b, temperature=0.05):
    """
    In-batch contrastive loss over paired embeddings.

    Row ``i`` of ``emb_a`` is scored against every row of ``emb_b``; the entry
    at column ``i`` is treated as the correct class and all other columns as
    negatives, and the mean cross-entropy over rows is returned.

    :param emb_a: 2-D tensor of anchor embeddings, one row per pair
    :param emb_b: 2-D tensor of positive embeddings, row-aligned with ``emb_a``
    :param temperature: Divisor applied to the similarity scores
    :return: Scalar loss tensor
    """
    logits = (emb_a @ emb_b.T) / temperature
    # The matching pair for row i sits on the diagonal, so the target class is i.
    targets = torch.arange(emb_a.shape[0], device=emb_a.device)
    criterion = nn.CrossEntropyLoss()
    return criterion(logits, targets)


# Scaling Function

In [92]:
def analyze_dot_product_matrix(matrix: torch.Tensor):
    """
    Computes statistics for the principal diagonal and off-diagonal values of a dot product matrix.

    Args:
        matrix (torch.Tensor): A square 2D tensor (dot product matrix). NumPy
            arrays and nested lists are accepted too and converted with
            ``torch.as_tensor``.

    Returns:
        Tuple[dict, dict]: (diagonal_stats, off_diagonal_stats). Each dict has
        the keys "mean", "max", "min" and "std" (population standard
        deviation). A group without elements — e.g. the off-diagonal of a 1x1
        matrix — yields NaN for every key instead of raising.

    Raises:
        AssertionError: If the input is not a square 2D matrix.
    """
    # No copy is made when a tensor is passed in.
    matrix = torch.as_tensor(matrix)
    assert matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1], "Matrix must be square"

    diag_vals = torch.diag(matrix)
    # Boolean mask selecting every entry except the diagonal.
    off_diag_mask = ~torch.eye(matrix.size(0), dtype=torch.bool, device=matrix.device)
    off_diag_vals = matrix[off_diag_mask]

    def stats(tensor):
        # max()/min() raise on an empty tensor, so report NaN explicitly.
        if tensor.numel() == 0:
            nan = float("nan")
            return {"mean": nan, "max": nan, "min": nan, "std": nan}
        return {
            "mean": tensor.mean().item(),
            "max": tensor.max().item(),
            "min": tensor.min().item(),
            "std": tensor.std(unbiased=False).item(),  # population std
        }

    return stats(diag_vals), stats(off_diag_vals)

In [93]:
import numpy as np
from langchain.embeddings import OpenAIEmbeddings

def compute_dot_product_matrix_openai(test_yaras, test_ctis, batch_size=50):
    """
    Build a cosine-similarity matrix between CTI texts and YARA rules using
    OpenAI embeddings.

    Both text lists are embedded in chunks of ``batch_size`` through
    ``OpenAIEmbeddings.embed_documents``. Every dot product is divided by the
    product of the two vectors' L2 norms, so despite the function name the
    entries are cosine similarities.

    :param test_yaras: List of YARA rule texts (columns of the result)
    :param test_ctis: List of CTI description texts (rows of the result)
    :param batch_size: Number of texts sent per embedding request
    :return: NumPy array of shape ``(len(test_ctis), len(test_yaras))``
    """
    client = OpenAIEmbeddings()  # embedding model is the library default

    # Embed every YARA rule first; these vectors are reused for each CTI chunk.
    yara_vectors = []
    for start in range(0, len(test_yaras), batch_size):
        yara_vectors.extend(client.embed_documents(test_yaras[start:start + batch_size]))
    yara_matrix = np.array(yara_vectors)  # (N, D)
    yara_lengths = np.linalg.norm(yara_matrix, axis=1, keepdims=True)

    # Embed the CTIs chunk by chunk and score each chunk against all rules.
    similarity_blocks = []
    for start in range(0, len(test_ctis), batch_size):
        cti_matrix = np.array(client.embed_documents(test_ctis[start:start + batch_size]))
        cti_lengths = np.linalg.norm(cti_matrix, axis=1, keepdims=True)

        raw_products = np.dot(cti_matrix, yara_matrix.T)
        norm_products = cti_lengths @ yara_lengths.T  # outer product of the norms
        similarity_blocks.append(raw_products / norm_products)

    # Stack the per-chunk row blocks into the full matrix.
    return np.vstack(similarity_blocks)


## 10-Sample Validation Set

### Run - 0

In [None]:
# Diagonal vs. off-diagonal similarity statistics for the 10-sample validation set.
# NOTE(review): `dot_product_matrix` is not assigned anywhere in the visible notebook
# and this cell has no execution count — presumably it relied on a variable from a
# removed cell. Confirm where the matrix should come from before re-running.
diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

### Run - 1

In [None]:
# Diagonal vs. off-diagonal similarity statistics for the 10-sample validation set.
# NOTE(review): `dot_product_matrix` is not assigned anywhere in the visible notebook
# and this cell has no execution count — presumably it relied on a variable from a
# removed cell. Confirm where the matrix should come from before re-running.
diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

### Run - 2

In [None]:
# Diagonal vs. off-diagonal similarity statistics for the 10-sample validation set.
# NOTE(review): `dot_product_matrix` is not assigned anywhere in the visible notebook
# and this cell has no execution count — presumably it relied on a variable from a
# removed cell. Confirm where the matrix should come from before re-running.
diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

## Test Set

In [None]:
# Unzip the held-out pairs into two index-aligned lists: first elements
# (YARA rules) and second elements (CTI descriptions).
test_yaras = [pair[0] for pair in test_pairs]
test_ctis = [pair[1] for pair in test_pairs]

### Run - 0

In [None]:
# Score the held-out test pairs and summarise diagonal vs. off-diagonal similarities.
# NOTE(review): `compute_dot_product_matrix_batched`, `model` and `tokenizer` are not
# defined anywhere in the visible notebook and this cell has no execution count —
# it cannot run from a fresh kernel as shown. Confirm where they are defined.
dot_product_matrix_test = compute_dot_product_matrix_batched(
    model=model,
    tokenizer=tokenizer,
    test_yaras=test_yaras,
    test_ctis=test_ctis,
    batch_size=256
)

diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix_test)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

### Run - 1

In [None]:
# Score the held-out test pairs and summarise diagonal vs. off-diagonal similarities.
# NOTE(review): `compute_dot_product_matrix_batched`, `model` and `tokenizer` are not
# defined anywhere in the visible notebook and this cell has no execution count —
# it cannot run from a fresh kernel as shown. Confirm where they are defined.
dot_product_matrix_test = compute_dot_product_matrix_batched(
    model=model,
    tokenizer=tokenizer,
    test_yaras=test_yaras,
    test_ctis=test_ctis,
    batch_size=256
)

diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix_test)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

### Run - 2

In [None]:
# Score the held-out test pairs and summarise diagonal vs. off-diagonal similarities.
# NOTE(review): `compute_dot_product_matrix_batched`, `model` and `tokenizer` are not
# defined anywhere in the visible notebook and this cell has no execution count —
# it cannot run from a fresh kernel as shown. Confirm where they are defined.
dot_product_matrix_test = compute_dot_product_matrix_batched(
    model=model,
    tokenizer=tokenizer,
    test_yaras=test_yaras,
    test_ctis=test_ctis,
    batch_size=256
)

diag_stats, off_diag_stats = analyze_dot_product_matrix(dot_product_matrix_test)

print("Diagonal Stats:", diag_stats)
print("Off-Diagonal Stats:", off_diag_stats)

# Semantic Evaluation

In [94]:
import numpy as np
from sklearn.metrics import f1_score

def evaluate_similarity_with_auto_threshold_numpy(dot_product_matrix: np.ndarray):
    """
    Evaluates retrieval quality of a square similarity matrix whose diagonal
    holds the matching pairs.

    Two views are reported:

    * ``recall_diag`` — fraction of rows whose highest raw score lies on the
      diagonal (top-1 retrieval accuracy per row).
    * ``f1_best`` — every cell is treated as a binary decision (diagonal =
      positive, everything else = negative); sigmoid scores are thresholded at
      100 evenly spaced values between the smallest and largest score and the
      best F1 is kept. On ties the lowest threshold wins.

    The counts are computed with vectorised NumPy operations, so the cost per
    threshold is one pass over the N*N scores.

    Args:
        dot_product_matrix (np.ndarray): Square similarity matrix (N x N)

    Returns:
        dict: {
            'recall_diag': float,
            'f1_best': float,
            'best_threshold': float,
            'sigmoid_min': float,
            'sigmoid_max': float,
        }

    Raises:
        AssertionError: If the input is not a square 2D matrix.
    """
    assert dot_product_matrix.ndim == 2 and dot_product_matrix.shape[0] == dot_product_matrix.shape[1], \
        "Input must be a square matrix."

    N = dot_product_matrix.shape[0]

    # Apply sigmoid to scores
    sigmoid_scores = 1 / (1 + np.exp(-dot_product_matrix))

    # Diagonal recall: how often the row-wise argmax is the diagonal position.
    recall_diag = float(np.mean(np.argmax(dot_product_matrix, axis=1) == np.arange(N)))

    # Row-major flattening keeps labels and scores aligned cell by cell.
    is_positive = np.eye(N, dtype=bool).ravel()
    flat_scores = sigmoid_scores.ravel()
    n_positive = int(np.count_nonzero(is_positive))

    sigmoid_min = float(flat_scores.min())
    sigmoid_max = float(flat_scores.max())

    # Search for best threshold to maximize F1
    thresholds = np.linspace(sigmoid_min, sigmoid_max, num=100)
    best_f1 = 0.0
    best_threshold = 0.0

    for t in thresholds:
        predicted = flat_scores >= t
        true_positives = int(np.count_nonzero(predicted & is_positive))
        n_predicted = int(np.count_nonzero(predicted))
        # F1 = 2*TP / (2*TP + FP + FN) and (TP + FP) + (TP + FN) = predicted + positives.
        denominator = n_predicted + n_positive
        f1 = (2.0 * true_positives / denominator) if denominator else 0.0
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = float(t)

    return {
        "recall_diag": recall_diag,
        "f1_best": best_f1,
        "best_threshold": best_threshold,
        "sigmoid_min": sigmoid_min,
        "sigmoid_max": sigmoid_max,
    }


In [95]:
# Unzip the held-out pairs into two index-aligned lists: first elements
# (YARA rules) and second elements (CTI descriptions).
test_yaras = [pair[0] for pair in test_pairs]
test_ctis = [pair[1] for pair in test_pairs]

In [96]:
# Sanity check: both lists should have the same length.
len(test_yaras), len(test_ctis)

(916, 916)

## Pre-trained

In [97]:
# Baseline: score the held-out pairs with off-the-shelf OpenAI embeddings.
# The matrix has CTIs as rows and YARA rules as columns, so the diagonal holds the
# matching pairs. This cell calls the OpenAI API and needs OPENAI_API_KEY to be set.
dot_matrix = compute_dot_product_matrix_openai(test_yaras, test_ctis)
metrics = evaluate_similarity_with_auto_threshold_numpy(dot_matrix)

# Print every metric with four decimals.
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


recall_diag: 0.9094
f1_best: 0.6256
best_threshold: 0.7089
sigmoid_min: 0.6594
sigmoid_max: 0.7199
