# Imports

In [1]:
import os
from pdfminer.high_level import extract_text
from transformers import pipeline
import torch
from tqdm import tqdm
import re

# File Path Declaration

In [2]:
project_base_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
project_base_path

'/home/ANONYMOUS/projects/FALCON'

In [3]:
yara_cti_dir_path = os.path.join(project_base_path, "data/qualitative/yara_desc_pdf")
yara_gt_rule_dir_path = os.path.join(project_base_path, "data/qualitative/yara_rules")
print(yara_cti_dir_path)
print(yara_gt_rule_dir_path)

/home/ANONYMOUS/projects/FALCON/data/qualitative/yara_desc_pdf
/home/ANONYMOUS/projects/FALCON/data/qualitative/yara_rules


In [4]:
generated_rule_dir_path = os.path.join(project_base_path, "results/qualitative/yara/Mistral-Small-24B-Instruct-2501")
generated_rule_dir_path

'/home/ANONYMOUS/projects/FALCON/results/qualitative/yara/Mistral-Small-24B-Instruct-2501'

# Environment

In [5]:
model_path = "/data/common/models/mistral/Mistral-Small-24B-Instruct-2501"

pipeline = pipeline(
    "text-generation", 
    model=model_path, 
    max_new_tokens=4096, 
    torch_dtype=torch.bfloat16,
    pad_token_id=0
    )


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
def get_mistral_small_24b_inst_output(prompt: str) -> str:
    messages = [
        {"role": "user", "content": prompt},
    ]
   
    result = pipeline(messages)
    
    return result[0]['generated_text'][-1]['content']


In [7]:
get_mistral_small_24b_inst_output("Hello, how are you?")

"Hello! I'm functioning as intended, thank you. How can I assist you today?"

# Helper Functions

In [8]:
def get_pdf_files(directory_path):
    """
    Returns a list of all PDF files in the given directory.

    Args:
        directory_path (str): The path to the directory to search.

    Returns:
        list: A list of PDF file paths.

    Raises:
        ValueError: If the directory_path is not a directory.
    """
    pdf_files = []
    try:
        if not os.path.isdir(directory_path):
            raise ValueError(f"Provided path '{directory_path}' is not a valid directory.")

        for file in os.listdir(directory_path):
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(directory_path, file))
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
    except PermissionError:
        print(f"Error: Permission denied to access '{directory_path}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return pdf_files

In [9]:
def read_pdf_content(pdf_path):
    """
    Reads a PDF file from the specified path and returns its content as a string.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text content from the PDF.

    Raises:
        FileNotFoundError: If the PDF file does not exist.
        Exception: For other errors during extraction.
    """
    try:
        text = extract_text(pdf_path)
        return text
    except FileNotFoundError:
        print(f"Error: File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Error: Permission denied to read '{pdf_path}'.")
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
    return ""

In [10]:
def get_pdf_filename(pdf_path):
    """
    Returns the filename of a PDF from its absolute path.

    Args:
        pdf_path (str): The absolute path to the PDF file.

    Returns:
        str: The filename of the PDF.
    """
    try:
        return os.path.basename(pdf_path).split('.')[0][:-5]
    except Exception as e:
        print(f"An error occurred while extracting the filename: {e}")
        return ""

In [11]:
def read_text_file(folder_path, file_name):
    """
    Reads a .txt file from the given folder path and returns its content as a string.
    If the file does not have a .txt extension, it is added automatically.

    Args:
        folder_path (str): Path to the folder containing the file.
        file_name (str): Name of the file (with or without .txt extension).

    Returns:
        str: Content of the text file.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: For other errors during file reading.
    """
    if not file_name.lower().endswith('.txt'):
        file_name += '.txt'

    file_path = os.path.join(folder_path, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except PermissionError:
        print(f"Error: Permission denied to read '{file_path}'.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    
    return ""


In [25]:
def generate_rule_from_cti_prompt(input_cti: str) -> str:

  rule_generation_prompt = f"""

    You are a cybersecurity expert tasked with performing YARA rule generation for a given Cyber Threat Intelligence (CTI).
    There is a sample task input and output provided below.
    
    Sample CTI Input and corresponding YARA Output:

    CTI Input:
        
      Rule Name
        APT30_Sample_2

      Description
        This YARA rule is designed to detect a specific malware sample associated with the APT30 threat group, as documented in a report by FireEye. The binary appears to be masquerading as a legitimate Microsoft Word-related executable, with embedded strings referencing ForZRLnkWordDlg.EXE, suggesting impersonation of Microsoft Office components.

      Reference
        FireEye APT30 Report
        Full Report: https://www2.fireeye.com/rs/fireye/images/rpt-apt30.pdf

      Indicators / String Matches
        This rule matches the following wide (Unicode) strings:

        String ID	Pattern	Notes
        $s0	"ForZRLnkWordDlg.EXE"	Executable filename
        $s1	"ForZRLnkWordDlg Microsoft "	Vendor impersonation
        $s9	"ForZRLnkWordDlg 1.0 "	Fake version info
        $s11	"ForZRLnkWordDlg"	Generic name
        $s12	" (C) 2011"	Fake copyright year
        
        All string patterns use fullword and wide modifiers, meaning they match exact full Unicode words.

      Condition Logic
        The rule triggers if:

        The file size is less than 100KB.

        The file has a valid DOS MZ header (uint16(0) == 0x5A4D).

        All of the specified string patterns ($s0 through $s12) are found.

      Known File Hash
        SHA1: 0359ffbef6a752ee1a54447b26e272f4a5a35167

      Rule UUID
        821a2de9-48c4-58d8-acc4-1e25025ab5cf

      
    YARA Output:
        
      rule APT30_Sample_2 {{
        meta:
          description = "FireEye APT30 Report Sample"
          license = "Detection Rule License 1.1 https://github.com/Neo23x0/signature-base/blob/master/LICENSE"
          author = "Florian Roth (Nextron Systems)"
          reference = "https://www2.fireeye.com/rs/fireye/images/rpt-apt30.pdf"
          date = "2015/04/13"
          hash = "0359ffbef6a752ee1a54447b26e272f4a5a35167"
          id = "821a2de9-48c4-58d8-acc4-1e25025ab5cf"
        strings:
          $s0 = "ForZRLnkWordDlg.EXE" fullword wide
          $s1 = "ForZRLnkWordDlg Microsoft " fullword wide
          $s9 = "ForZRLnkWordDlg 1.0 " fullword wide
          $s11 = "ForZRLnkWordDlg" fullword wide
          $s12 = " (C) 2011" fullword wide
        condition:
          filesize < 100KB and uint16(0) == 0x5A4D and all of them
      }}


    Generate YARA from the provided CTI. Do not include anything that is not provided.
    Do not print anything like sure here is the CTI or anything else. Only print the CTI.
    The output should be a valid YARA rule. 

    CTI Input: 
    
      {input_cti}

    YARA Output:

  """

  return rule_generation_prompt

In [26]:
def save_string_as_txt(directory_path, file_name, content):
    """
    Saves a given string as a .txt file in the specified directory.

    Args:
        directory_path (str): Path to the directory where the file should be saved.
        file_name (str): Desired name of the file (with or without .txt extension).
        content (str): The string content to be written to the file.

    Returns:
        str: Full path to the saved file if successful, otherwise an empty string.
    """
    if not file_name.lower().endswith('.txt'):
        file_name += '.txt'
    
    file_path = os.path.join(directory_path, file_name)

    try:
        os.makedirs(directory_path, exist_ok=True)  # Create directory if it doesn't exist
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        return file_path
    except PermissionError:
        print(f"Error: Permission denied to write to '{file_path}'.")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")

    return ""

In [42]:
def extract_triple_single_quote_content(input_string):
    """
    Checks if the input string contains a pattern enclosed in triple single quotes
    and returns the content inside the first occurrence.

    Args:
        input_string (str): The string to search.

    Returns:
        str: The content inside the triple single quotes if found, else an empty string.
    """
    match = re.search(r"```(.*?)```", input_string, re.DOTALL)
    if match:
        return match.group(1).strip()[5:]
    return ""

# Test Code

In [43]:
yara_cti_pdf_path_list = get_pdf_files(yara_cti_dir_path)
len(yara_cti_pdf_path_list)

60

In [44]:
yara_cti_pdf_path_list[0]

'/home/ANONYMOUS/projects/FALCON/data/qualitative/yara_desc_pdf/rule7_m_desc.pdf'

In [45]:
get_pdf_filename(yara_cti_pdf_path_list[0])

'rule7_m'

In [46]:
print(read_pdf_content(yara_cti_pdf_path_list[0]))

Threat Name: APT_Loader_Win64_REDFLARE

Threat Category: Advanced Persistent Threat (APT), Malware Loader

Threat  Description:  The  YARA  rule  APT_Loader_Win64_REDFLARE_1  detects  the  REDFLARE

malware  loader,  a  64-bit  executable  used  in  APT  campaigns  to  execute  malicious  payloads  in

memory.

Indicators of Compromise (IoCs):

Memory allocation and loading patterns:

41 B9 40 00 00 00 41 B8 00 30 00 00 33 C9

Unique assembly sequences related to in-memory execution.

MD5 Hash: f20824fa6e5c81e3804419f108445368

Detection Mechanism:

Identifies malware loaders in 64-bit Windows executables.

Possible Attribution & Use Cases:

Used in covert malware deployment by APT groups.

Avoids disk-based detection by executing payloads in-memory.

Recommended Actions:

Monitor for unusual memory allocation events.

Conduct behavioral analysis to detect fileless malware execution.

Author & Attribution:

Rule Author: FireEye




In [47]:
prompt = generate_rule_from_cti_prompt(read_pdf_content(yara_cti_pdf_path_list[0]))
print(prompt)



    You are a cybersecurity expert tasked with performing YARA rule generation for a given Cyber Threat Intelligence (CTI).
    There is a sample task input and output provided below.
    
    Sample CTI Input and corresponding YARA Output:

    CTI Input:
        
      Rule Name
        APT30_Sample_2

      Description
        This YARA rule is designed to detect a specific malware sample associated with the APT30 threat group, as documented in a report by FireEye. The binary appears to be masquerading as a legitimate Microsoft Word-related executable, with embedded strings referencing ForZRLnkWordDlg.EXE, suggesting impersonation of Microsoft Office components.

      Reference
        FireEye APT30 Report
        Full Report: https://www2.fireeye.com/rs/fireye/images/rpt-apt30.pdf

      Indicators / String Matches
        This rule matches the following wide (Unicode) strings:

        String ID	Pattern	Notes
        $s0	"ForZRLnkWordDlg.EXE"	Executable filename
        $s1	"Fo

In [48]:
test_rule = get_mistral_small_24b_inst_output(prompt)
print(extract_triple_single_quote_content(test_rule))

rule APT_Loader_Win64_REDFLARE_1 {
  meta:
    description = "The YARA rule APT_Loader_Win64_REDFLARE_1 detects the REDFLARE malware loader, a 64-bit executable used in APT campaigns to execute malicious payloads in memory."
    author = "FireEye"
    hash = "f20824fa6e5c81e3804419f108445368"
  strings:
    $s0 = { 41 B9 40 00 00 00 41 B8 00 30 00 00 33 C9 }
  condition:
    $s0
}


In [35]:
print(read_text_file(yara_gt_rule_dir_path, get_pdf_filename(yara_cti_pdf_path_list[0])))

rule APT_Loader_Win64_REDFLARE_1
{
    meta:
        date_created = "2020-11-27"
        date_modified = "2020-11-27"
        md5 = "f20824fa6e5c81e3804419f108445368"
        rev = 1
        author = "FireEye"
    strings:
        $alloc_n_load = { 41 B9 40 00 00 00 41 B8 00 30 00 00 33 C9 [1-10] FF 50 [4-80] F3 A4 [30-120] 48 6B C9 28 [3-20] 48 6B C9 28 }
        $const_values = { 0F B6 ?? 83 C? 20 83 F? 6D [2-20] 83 C? 20 83 F? 7A }
    condition:
        (uint16(0) == 0x5A4D) and (uint32(uint32(0x3C)) == 0x00004550) and (uint16(uint32(0x3C)+0x18) == 0x020B) and all of them
}


In [36]:
save_string_as_txt(generated_rule_dir_path, 'test', test_rule)

'/home/ANONYMOUS/projects/FALCON/results/qualitative/yara/Mistral-Small-24B-Instruct-2501/test.txt'

# Generate Rule from CTI

In [37]:
yara_cti_pdf_path_list = get_pdf_files(yara_cti_dir_path)
len(yara_cti_pdf_path_list)

60

In [49]:
for pdf_path in tqdm(yara_cti_pdf_path_list):  
  retry_count = 0
  
  pdf_filename = get_pdf_filename(pdf_path)
  cti = read_pdf_content(pdf_path)
  prompt = generate_rule_from_cti_prompt(cti)
  
  
  while retry_count < 3:
    rule = extract_triple_single_quote_content(get_mistral_small_24b_inst_output(prompt))
    if len(rule) > 0:
      save_string_as_txt(generated_rule_dir_path, pdf_filename, rule)
      break
    else:
      retry_count += 1
      print(f"Retrying... Attempt {retry_count}")
  torch.cuda.empty_cache()

 47%|████▋     | 28/60 [05:05<08:33, 16.06s/it]

Retrying... Attempt 1


 48%|████▊     | 29/60 [05:32<09:55, 19.21s/it]

Retrying... Attempt 1


 57%|█████▋    | 34/60 [07:01<06:50, 15.80s/it]

Retrying... Attempt 1


 58%|█████▊    | 35/60 [07:22<07:16, 17.48s/it]

Retrying... Attempt 1


 60%|██████    | 36/60 [07:28<05:35, 13.98s/it]

Retrying... Attempt 1


 63%|██████▎   | 38/60 [07:44<04:04, 11.13s/it]

Retrying... Attempt 1


 67%|██████▋   | 40/60 [07:55<02:38,  7.92s/it]

Retrying... Attempt 1


 78%|███████▊  | 47/60 [08:50<01:12,  5.54s/it]

Retrying... Attempt 1


100%|██████████| 60/60 [13:19<00:00, 13.32s/it]
