# Imports

In [1]:
import os
from pdfminer.high_level import extract_text
import transformers
import torch
from tqdm import tqdm
import re

# File Path Declaration

In [2]:
project_base_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [3]:
snort_cti_dir_path = os.path.join(project_base_path, "data/qualitative/snort_desc_pdf")
snort_gt_rule_dir_path = os.path.join(project_base_path, "data/qualitative/snort_rules")
print(snort_cti_dir_path)
print(snort_gt_rule_dir_path)

/home/ANONYMOUS/projects/FALCON/data/qualitative/snort_desc_pdf
/home/ANONYMOUS/projects/FALCON/data/qualitative/snort_rules


In [4]:
generated_rule_dir_path = os.path.join(project_base_path, "results/qualitative/snort/Llama-3.3-70B-Instruct")
generated_rule_dir_path

'/home/ANONYMOUS/projects/FALCON/results/qualitative/snort/Llama-3.3-70B-Instruct'

# Environment

In [5]:
# Llama 3.3 70B Environment 
model_path = "/data/common/models/meta-llama/Llama-3.3-70B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_path,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
def get_llama_33_70b_inst_output(prompt: str) -> str:
    messages = [
        {"role": "user", "content": prompt}
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=4096,
    )
    
    torch.cuda.empty_cache()
    
    return outputs[0]["generated_text"][-1]['content']

In [7]:
get_llama_33_70b_inst_output("Hello, how are you?")

"Hello! I'm just a language model, so I don't have feelings or emotions like humans do, but I'm functioning properly and ready to help with any questions or tasks you may have. How about you? How's your day going so far?"

# Helper Functions

In [8]:
def get_pdf_files(directory_path):
    """
    Returns a list of all PDF files in the given directory.

    Args:
        directory_path (str): The path to the directory to search.

    Returns:
        list: A list of PDF file paths.

    Raises:
        ValueError: If the directory_path is not a directory.
    """
    pdf_files = []
    try:
        if not os.path.isdir(directory_path):
            raise ValueError(f"Provided path '{directory_path}' is not a valid directory.")

        for file in os.listdir(directory_path):
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(directory_path, file))
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
    except PermissionError:
        print(f"Error: Permission denied to access '{directory_path}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return pdf_files

In [9]:
def read_pdf_content(pdf_path):
    """
    Reads a PDF file from the specified path and returns its content as a string.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text content from the PDF.

    Raises:
        FileNotFoundError: If the PDF file does not exist.
        Exception: For other errors during extraction.
    """
    try:
        text = extract_text(pdf_path)
        return text
    except FileNotFoundError:
        print(f"Error: File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Error: Permission denied to read '{pdf_path}'.")
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
    return ""

In [10]:
def get_pdf_filename(pdf_path):
    """
    Returns the filename of a PDF from its absolute path.

    Args:
        pdf_path (str): The absolute path to the PDF file.

    Returns:
        str: The filename of the PDF.
    """
    try:
        return os.path.basename(pdf_path).split('.')[0]
    except Exception as e:
        print(f"An error occurred while extracting the filename: {e}")
        return ""

In [11]:
def read_text_file(folder_path, file_name):
    """
    Reads a .txt file from the given folder path and returns its content as a string.
    If the file does not have a .txt extension, it is added automatically.

    Args:
        folder_path (str): Path to the folder containing the file.
        file_name (str): Name of the file (with or without .txt extension).

    Returns:
        str: Content of the text file.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: For other errors during file reading.
    """
    if not file_name.lower().endswith('.txt'):
        file_name += '.txt'

    file_path = os.path.join(folder_path, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except PermissionError:
        print(f"Error: Permission denied to read '{file_path}'.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    
    return ""


In [12]:
def generate_rule_from_cti_prompt(input_cti: str) -> str:

  rule_generation_prompt = f"""

    You are a cybersecurity expert tasked with performing Snort rule generation for a given Cyber Threat Intelligence (CTI).
    There is a sample task input and output provided below.
    
    Sample CTI Input and corresponding Snort Output:

    CTI Input:
        
      Title: Detection of QAZ Worm Client Login Activity over TCP Port 7597

      Threat Category: Malware – Backdoor

      Threat Name: QAZ Worm

      Detection Summary:

      This signature is designed to detect network traffic associated with the QAZ Worm, specifically its client login activity. The worm exhibits characteristic behavior by initiating a connection and transmitting a unique identifier string (qazwsx.hsq) to a remote server over TCP port 7597. This communication typically indicates the presence of a backdoor that allows unauthorized access to infected systems.

      Rule Metadata
      Classification: Misc Activity

      Ruleset: Community

      Rule Logic Breakdown
      Alert Type: alert

      Protocol: tcp

      Source IP: $EXTERNAL_NET (any IP address outside the local trusted network)

      Source Port: any

      Destination IP: $HOME_NET (any IP address inside the local trusted network)

      Destination Port: 7597 (known port used by the QAZ worm)

      Flow: to_server, established
      (Traffic must be flowing to a server and part of an established connection)

      Content Match: "qazwsx.hsq"
      (String in the payload that identifies the worm’s presence)

      Message: "MALWARE-BACKDOOR QAZ Worm Client Login access"

      Technical Details
      Port 7597 is not a standard well-known port and is leveraged by the QAZ Worm for backdoor communications.

      The content string "qazwsx.hsq" is a unique identifier used by the worm’s client when connecting to a command-and-control server or to another infected host.

      Detection relies on the presence of this string within an established TCP session directed to a host on the internal network.

      Indicators of Compromise (IOCs)
      String Pattern: qazwsx.hsq

      Destination Port: 7597/tcp

      Recommended Actions
      Block or restrict traffic on port 7597 at the perimeter firewall.

      Investigate any internal systems that initiate or receive such connections.

      Perform malware scanning and forensic analysis on potentially compromised hosts.

      Update endpoint and network defense signatures to ensure coverage against this and similar threats.
      
    Snort Output:
        
      alert tcp $EXTERNAL_NET any -> $HOME_NET 7597 ( msg:"MALWARE-BACKDOOR QAZ Worm Client Login access"; flow:to_server,established; content:"qazwsx.hsq"; metadata:ruleset community; classtype:misc-activity; sid:108; rev:12; )


    Generate Snort from the provided CTI. Do not include anything that is not provided.
    Do not print anything like sure here is the CTI or anything else. Only print the CTI. 

    CTI Input: 
    
      {input_cti}

    Snort Output:

  """

  return rule_generation_prompt

In [13]:
def save_string_as_txt(directory_path, file_name, content):
    """
    Saves a given string as a .txt file in the specified directory.

    Args:
        directory_path (str): Path to the directory where the file should be saved.
        file_name (str): Desired name of the file (with or without .txt extension).
        content (str): The string content to be written to the file.

    Returns:
        str: Full path to the saved file if successful, otherwise an empty string.
    """
    if not file_name.lower().endswith('.txt'):
        file_name += '.txt'
    
    file_path = os.path.join(directory_path, file_name)

    try:
        os.makedirs(directory_path, exist_ok=True)  # Create directory if it doesn't exist
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        return file_path
    except PermissionError:
        print(f"Error: Permission denied to write to '{file_path}'.")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")

    return ""

In [14]:
def extract_triple_single_quote_content(input_string):
    """
    Checks if the input string contains a pattern enclosed in triple single quotes
    and returns the content inside the first occurrence.

    Args:
        input_string (str): The string to search.

    Returns:
        str: The content inside the triple single quotes if found, else an empty string.
    """
    match = re.search(r"```(.*?)```", input_string, re.DOTALL)
    if match:
        return match.group(1).strip()
    return ""

# Test Code

In [15]:
snort_cti_pdf_path_list = get_pdf_files(snort_cti_dir_path)
len(snort_cti_pdf_path_list)

60

In [16]:
snort_cti_pdf_path_list[0]

'/home/ANONYMOUS/projects/FALCON/data/qualitative/snort_desc_pdf/rule18_e.pdf'

In [17]:
get_pdf_filename(snort_cti_pdf_path_list[0])

'rule18_e'

In [18]:
print(read_pdf_content(snort_cti_pdf_path_list[0]))

Threat Category: Reconnaissance, File Transfer Protocol Abuse

Threat  Description:  The  SNORT  rule  sid:1000018  is  designed  to  detect  attempts  to  initiate  file

transfers using the Trivial File Transfer Protocol (TFTP) over UDP port 69.

Indicators of Compromise (IoCs):

Protocol: UDP

Destination Port: 69

Traffic Direction: Any source to destination on port 69

Payload containing TFTP read/write request (e.g., RRQ or WRQ)

Detection Mechanism:

This  rule  detects  any  UDP  packet  sent  to  port  69  regardless  of  source  port  or  IP.  Since  TFTP

operates over UDP and does not require a connection setup like TCP, the rule listens for potential

read or write requests typically found at the beginning of a TFTP session.

Possible Attribution & Use Cases:

May indicate lateral movement or payload delivery during exploitation or post-exploitation phases.

Could be used by attackers to exfiltrate data from compromised systems.

Recommended Actions:

Determine  if  the  T

In [19]:
prompt = generate_rule_from_cti_prompt(read_pdf_content(snort_cti_pdf_path_list[0]))
print(prompt)



    You are a cybersecurity expert tasked with performing Snort rule generation for a given Cyber Threat Intelligence (CTI).
    There is a sample task input and output provided below.
    
    Sample CTI Input and corresponding Snort Output:

    CTI Input:
        
      Title: Detection of QAZ Worm Client Login Activity over TCP Port 7597

      Threat Category: Malware – Backdoor

      Threat Name: QAZ Worm

      Detection Summary:

      This signature is designed to detect network traffic associated with the QAZ Worm, specifically its client login activity. The worm exhibits characteristic behavior by initiating a connection and transmitting a unique identifier string (qazwsx.hsq) to a remote server over TCP port 7597. This communication typically indicates the presence of a backdoor that allows unauthorized access to infected systems.

      Rule Metadata
      Classification: Misc Activity

      Ruleset: Community

      Rule Logic Breakdown
      Alert Type: alert

      

In [23]:
test_rule = get_llama_33_70b_inst_output(prompt)
print(test_rule)

alert udp any any -> any 69 ( msg:"TFTP Read/Write Request"; content:"RRQ|00|WRQ|00"; metadata:ruleset community; classtype:attempted-recon; sid:1000018; rev:1; )


In [21]:
print(read_text_file(snort_gt_rule_dir_path, get_pdf_filename(snort_cti_pdf_path_list[0])))

alert udp any any -> any 69 (msg:"TFTP request detected"; sid:1000018; rev:1;)



In [22]:
save_string_as_txt(generated_rule_dir_path, 'test', test_rule)

'/home/ANONYMOUS/projects/FALCON/results/qualitative/snort/Llama-3.3-70B-Instruct/test.txt'

# Generate Rule from CTI

In [24]:
snort_cti_pdf_path_list = get_pdf_files(snort_cti_dir_path)
len(snort_cti_pdf_path_list)

60

In [24]:
for pdf_path in tqdm(snort_cti_pdf_path_list):  
  pdf_filename = get_pdf_filename(pdf_path)
  cti = read_pdf_content(pdf_path)
  prompt = generate_rule_from_cti_prompt(cti)
  rule = get_llama_33_70b_inst_output(prompt)
  save_string_as_txt(generated_rule_dir_path, pdf_filename, rule)
     
  torch.cuda.empty_cache()

 12%|█▏        | 7/60 [00:47<05:42,  6.47s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 60/60 [08:12<00:00,  8.21s/it]
