## Imports ##

In [35]:
from runningLLAMA import llama_local_generate
import pandas as pd
import re
import time
import os

## Prepare the prompts ##

In [36]:
# System message passed as the first argument to llama_local_generate.
sys_prompt = "You are a cybersecurity expert specializing in cyberthreat intelligence."

In [None]:
# Task instructions shared by every request. run_evaluation appends
# " CVE Description: <description> CVSS Vector: <vector>" to this text.
prompt = """You are given a vulnerability description and a CVSS vector. Your task is to determine the post-condition privilege level — the level of access the attacker gains *after* successful exploitation.

Classify the post-condition privilege as one of the following:

- None: Attacker gains no additional access or capability.
- User: Attacker gains user-level access (e.g., running code as a normal user, accessing user files).
- Root: Attacker gains full system or administrative access (e.g., root privileges, complete control over the system or application).

Instructions:
- Use both the CVE description and CVSS vector to make your decision.
- Briefly justify your classification.
- Your final output must be in two parts:
  1. A short explanation (1-2 sentences)
  2. A single line containing only one of: `None`, `User`, or `Root`

Example format:

Explanation of reasoning...

<Privilege>

Now process the following:
"""
# Fields appended per row by run_evaluation:
# CVE Description: <Insert CVE description here>
# CVSS Vector: <Insert CVSS vector here>


In [38]:
# Generation settings forwarded unchanged to llama_local_generate by
# run_evaluation (presumably the usual sampling controls; confirm against
# runningLLAMA).
max_tokens = 2048
seed = 42
temperature = 0.1
top_p = 1

## Prompt the LLM ##

In [39]:

def format_post_condition(text):
    """
    Pull the post-condition privilege label out of an LLM response.

    The response is scanned from its last line upwards; the first line that,
    once stripped, consists solely of ``None``, ``User`` or ``Root``
    (case-sensitive) is taken as the answer.

    Returns:
        ``(label, True)`` when such a line exists, otherwise
        ``(text, False)`` with the original, unmodified response text.
    """
    # A valid answer line holds exactly one of the three labels and nothing else.
    privilege_pattern = r'^(None|User|Root)\s*$'

    # Walk the stripped lines bottom-up so the final verdict wins over any
    # earlier line that happens to consist of a label.
    stripped_bottom_up = (
        raw_line.strip() for raw_line in reversed(text.strip().splitlines())
    )
    label = next(
        (candidate for candidate in stripped_bottom_up
         if re.match(privilege_pattern, candidate)),
        None,
    )

    if label is None:
        # No compliant line: hand back the raw response and flag the failure.
        return text, False
    return label, True

In [40]:
def run_evaluation(file_path):
    """
    Run post-condition privilege prediction over a TSV dataset and save the results.

    For each row, the shared ``prompt`` is extended with the row's
    ``Description`` and ``CVSS`` values and sent to ``llama_local_generate``.
    The privilege label is then extracted with ``format_post_condition``.
    Two files are written under ``responses/individual-results``: one with
    the extracted answer for every row and one with the full model responses.

    Args:
        file_path: Path to a tab-separated file containing ``Description``
            and ``CVSS`` columns.

    Raises:
        KeyError: If the dataset lacks one of the required columns.
    """
    # Track performance metrics for the evaluation run
    start_time = time.time()
    count_chars = 0  # Total characters generated by the model
    instructions_failed = 0  # Responses with no standalone None/User/Root line

    # Load the dataset (TSV format with CVE descriptions and CVSS vectors)
    data = pd.read_csv(file_path, encoding='utf-8', sep='\t')

    # Fail fast on a malformed dataset instead of reporting the same
    # KeyError once per row inside the loop below.
    missing_columns = [col for col in ('Description', 'CVSS') if col not in data.columns]
    if missing_columns:
        raise KeyError(
            f"Dataset {file_path!r} is missing required column(s): {missing_columns}"
        )

    all_results = []
    all_full_responses = []  # Store full LLM responses for analysis

    # Process each row in the dataset (row numbers are 1-based in all output)
    for row_number, (_, row) in enumerate(data.iterrows(), start=1):
        try:
            # Built inside the try so a non-string cell (e.g. NaN for an empty
            # field) is recorded as an error for this row rather than
            # aborting the whole run.
            llm_prompt = prompt + " CVE Description: " + row['Description'] + " CVSS Vector: " + row['CVSS']

            # Get the prediction from the model
            output = llama_local_generate(sys_prompt, llm_prompt, max_tokens, temperature, top_p, seed)
            count_chars += len(output)

            # Store the full response
            all_full_responses.append(f"=== CVE {row_number} ===\n{output}\n")

            # Try to extract the privilege label from the response.
            # On failure, `answer` is the full response text, which may span
            # several lines in the results file.
            answer, success = format_post_condition(output)
            if not success:
                instructions_failed += 1  # Model didn't follow the output format

            all_results.append(answer)
            print(row_number, answer)
        except Exception as e:
            # Per-row boundary: record the failure and continue with the next row
            answer = 'Error'
            error_msg = f"=== CVE {row_number} ===\nERROR: {str(e)}\n"
            all_full_responses.append(error_msg)
            all_results.append(answer)
            print('Exception at row ', row_number)
            print(e)

    # Calculate and display performance metrics
    time_taken = time.time() - start_time
    print('Time taken:', time_taken)
    print('#Characters generated:', count_chars)
    print('#Instructions failed:', instructions_failed)

    # Ensure output directory structure exists (no-op when already present)
    output_dir = os.path.join('responses', 'individual-results')
    os.makedirs(output_dir, exist_ok=True)

    # Dataset name is the file name up to its first dot
    dataset_name = os.path.basename(file_path).split('.')[0]

    # Save extracted results to file with standardized naming convention
    out_result = os.path.join(output_dir, 'SENG402_' + dataset_name + '_postcondition.txt')
    with open(out_result, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_results))

    # Save full LLM responses for analysis and debugging
    out_full_responses = os.path.join(output_dir, 'SENG402_' + dataset_name + '_full_postcondition_responses.txt')
    with open(out_full_responses, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_full_responses))

    print(f'Results saved to: {out_result}')
    print(f'Full responses saved to: {out_full_responses}')
    print('------- Done --------')

<!--  -->

In [41]:
# Dataset location is relative to the notebook's working directory.
data_set_file_path = "../datasets/postcondition-prediction.tsv"
run_evaluation(file_path=data_set_file_path)

1 Root
2 Root
2 Root
3 User
3 User


KeyboardInterrupt: 