# Notebook to run GPT, Gemini, LLAMA models

### Install All Required Packages
Uncomment the `%pip install` line in the cell below and run it first if the required packages are not already installed.

In [1]:
# Install required packages
# %pip install openai replicate python-dotenv pandas numpy matplotlib google.genai transformers torch bitsandbytes llamaapi accelerate

## Imports

In [2]:
from openai import OpenAI
from google import genai
from google.genai import types
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt
from llamaapi import LlamaAPI

from runningLLAMA import llama_local_generate




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 11.62 GiB of which 1002.44 MiB is free. Process 1290068 has 5.64 GiB memory in use. Including non-PyTorch memory, this process has 4.86 GiB memory in use. Of the allocated memory 4.60 GiB is allocated by PyTorch, and 86.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Setup all APIs

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) so the API keys
# read via os.getenv / os.environ.get in the cells below are available.
load_dotenv()

True

In [None]:
# LLAMA API: the OpenAI client class is reused here, pointed at the LLMapi
# endpoint and authenticated with LLAMA_API_KEY from the environment.
llama_client = OpenAI(base_url="https://api.llmapi.com", api_key=os.getenv("LLAMA_API_KEY"))

In [None]:
# ChatGPT: OpenAI client authenticated with OPEN_AI_API_KEY from the environment
openai_client = OpenAI(
    api_key=os.environ.get("OPEN_AI_API_KEY"),
)

In [None]:
# Gemini: google-genai client authenticated with GOOGLE_API_KEY from the environment
gemini_client = genai.Client(
    api_key=os.environ.get("GOOGLE_API_KEY"),
)


## LLM Predictions

In [None]:
# Sampling settings shared by the provider calls below.
# Zero temperature / top_p keep sampling randomness to a minimum, and the fixed
# seed is passed to the calls that accept one, so repeated runs are comparable.
temperature, top_p = 0, 0
seed = 42
max_tokens = 2048  # upper bound on the number of tokens generated per response

In [None]:
# System prompt that sets the model's role for every request
sys_prompt = (
    'You are a cybersecurity expert specializing in cyberthreat intelligence.'
)

In [None]:
# Internal model identifier -> model name passed to the provider's API
model_mapping = dict([
    ('api-llama3.1', 'llama3.1-8b'),
    ('api-llama3.3', 'llama3.3-70b'),
    ('gemini', 'gemini-2.0-flash'),
    ('gpt-4o-mini', 'gpt-4o-mini'),
])

In [None]:
def get_single_prediction(question, model_name):
    """
    Get a single prediction from one of the supported LLM providers.

    The provider is selected by the prefix of ``model_name``: ``gpt*`` (OpenAI),
    ``gemini*`` (Google Gemini), ``llama-local*`` (local LLAMA through
    ``llama_local_generate``) or ``api-llama*`` (LLAMA through ``llama_client``).

    Args:
        question: User prompt sent to the model.
        model_name: Internal model identifier. For every path except the local
            LLAMA one it must also be a key of ``model_mapping``.

    Returns:
        The generated text. On the Gemini path failures are not raised; an
        ``"Error: ..."`` string is returned instead.

    Raises:
        ValueError: If ``model_name`` matches none of the supported prefixes.
        KeyError: If ``model_name`` matches a prefix but is missing from
            ``model_mapping``.
    """
    if model_name.startswith('gpt'):
        # ChatGPT API call with the shared sampling parameters
        model = model_mapping[model_name]
        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {'role': 'system', 'content': sys_prompt},
                {'role': 'user', 'content': question}
            ],
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            seed=seed
        )
        output = response.choices[0].message.content

    elif model_name.startswith('gemini'):
        # Gemini API with relaxed safety settings and retry logic for rate limiting
        model = model_mapping[model_name]

        # The system prompt is prepended to the question in a single user message
        contents = [
            types.Content(role="user", parts=[types.Part(text=sys_prompt + " " + question)])
        ]

        # Only block content rated high-risk so more content is let through
        safety_settings = [
            types.SafetySetting(category=category, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH)
            for category in (
                types.HarmCategory.HARM_CATEGORY_HARASSMENT,
                types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            )
        ]

        generation_config = types.GenerateContentConfig(
            temperature=temperature,
            top_p=top_p,
            max_output_tokens=max_tokens,
            safety_settings=safety_settings
        )

        # Every request is issued inside the retry loop. A request made outside
        # of it would double the API usage per prediction and would bypass the
        # rate-limit (429) handling below.
        max_retries = 5
        retry_count = 0
        while retry_count < max_retries:
            try:
                response = gemini_client.models.generate_content(
                    model=model,
                    contents=contents,
                    config=generation_config,
                )
                output = response.candidates[0].content.parts[0].text
                time.sleep(1)  # Regular delay between requests
                break
            except Exception as e:
                error_str = str(e)
                if "429 RESOURCE_EXHAUSTED" in error_str:
                    retry_count += 1
                    print(f"Rate limit hit, retrying in 2 seconds... (attempt {retry_count}/{max_retries})")
                    time.sleep(2)  # Wait for 2 seconds as suggested by API
                    if retry_count == max_retries:
                        output = f"Error: Rate limit exceeded after {max_retries} attempts."
                else:
                    # Any other failure is reported as text instead of being retried
                    output = f"Error: {error_str}"
                    break

    elif model_name.startswith('llama-local'):
        # Local LLAMA model - temperature must be > 0 to avoid errors
        output = llama_local_generate(sys_prompt, question, max_tokens=max_tokens, temperature=0.01, top_p=top_p, seed=seed)

    elif model_name.startswith('api-llama'):
        # API-based LLAMA models through LLMapi service
        model = model_mapping[model_name]
        response = llama_client.chat.completions.create(
            model=model,
            messages=[
                {'role': 'system', 'content': sys_prompt},
                {'role': 'user', 'content': question}
            ],
            temperature=0.01,  # Slightly above 0 for LLAMA models
            top_p=top_p,
            max_tokens=max_tokens,
            seed=seed
        )
        output = response.choices[0].message.content

    else:
        raise ValueError(f"Model '{model_name}' not supported or not found in model_mapping")

    return output


#### Test

In [None]:
# Sample prompt used to check that the API calls work: asks for the CVSS v3.1
# base metrics and vector string of a single CVE description.
question = (
    "Analyze the following CVE description and calculate the CVSS v3.1 Base Score. "
    "Determine the values for each base metric: AV, AC, PR, UI, S, C, I, and A. "
    # NOTE(review): the next sentence appears again after the metric list below;
    # presumably unintentional - confirm before changing the prompt text.
    "Summarize each metric's value and provide the final CVSS v3.1 vector string.   "
    "Valid options for each metric are as follows: \n"
    # Allowed values for each of the eight base metrics
    "- **Attack Vector (AV)**: Network (N), Adjacent (A), Local (L), Physical (P)\n"
    "- **Attack Complexity (AC)**: Low (L), High (H)\n"
    "- **Privileges Required (PR)**: None (N), Low (L), High (H)\n"
    "- **User Interaction (UI)**: None (N), Required (R)\n"
    "- **Scope (S)**: Unchanged (U), Changed (C)\n"
    "- **Confidentiality (C)**: None (N), Low (L), High (H)\n"
    "- **Integrity (I)**: None (N), Low (L), High (H)\n"
    "- **Availability (A)**: None (N), Low (L), High (H)\n"
    "Summarize each metric's value and provide the final CVSS v3.1 vector string. "
    # Output-format instruction: the final line should hold only the vector string
    "Ensure the final line of your response contains ONLY the CVSS v3 Vector String (no other text) "
    "in the following format:  \n"
    "Example format: CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H\n\n"
    # The CVE description to be scored
    "CVE Description: In the Linux kernel through 6.7.1, there is a use-after-free "
    "in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and "
    "drivers/media/cec/core/cec-api.c."
)

In [17]:
# Smoke test of the local LLAMA path. It needs `llama_local_generate` to have been
# imported successfully from runningLLAMA; otherwise the call fails with a NameError.
print(get_single_prediction(question, 'llama-local'))

NameError: name 'llama_local_generate' is not defined

# Run Evaluation on Dataset

The extraction below handles most LLM output formats, but some responses do not match the expected pattern and have to be collected manually from the generated response file.

In [25]:
def format(text):
    """
    Pull the CVSS v3.1 base-metric vector out of an LLM response.

    Returns a ``(value, found)`` tuple. When at least one vector is present,
    ``value`` is the last one in the text and ``found`` is True; otherwise
    ``value`` is the unmodified input text and ``found`` is False.
    """
    # Eight base metrics in fixed order (AV/AC/PR/UI/S/C/I/A), each followed by
    # one or more ASCII letters. The "CVSS:3.1/" prefix is not part of the match.
    cvss_pattern = r'AV:[A-Za-z]+/AC:[A-Za-z]+/PR:[A-Za-z]+/UI:[A-Za-z]+/S:[A-Za-z]+/C:[A-Za-z]+/I:[A-Za-z]+/A:[A-Za-z]+'
    vectors = re.compile(cvss_pattern).findall(text)

    # Nothing matched: hand the raw text back so the caller can see the failure
    if not vectors:
        return text, False

    # The final occurrence is the most likely to be the model's answer
    return vectors[-1], True

In [26]:
def run_evaluation(file_path, model_name):
    """
    Run CVSS prediction evaluation on a dataset using the specified model.

    Reads a tab-separated dataset, sends the 'Prompt' column of every row to
    ``get_single_prediction``, extracts the CVSS vector with ``format`` and
    writes one result line per dataset row to
    ``responses/individual-results/SENG402_<dataset-name>_<model-name>_result.txt``.

    Args:
        file_path: Path to the TSV dataset; it must contain a 'Prompt' column.
        model_name: Model identifier passed to ``get_single_prediction``.

    Returns:
        None. Progress and summary metrics are printed; results go to the file.

    Notes:
        Line N of the result file always corresponds to dataset row N. When no
        vector can be extracted, the raw response is stored with its line breaks
        replaced by spaces. Rows whose prediction raised are stored as 'Error'.
    """
    # Track performance metrics for the evaluation run
    start_time = time.time()
    count_chars = 0  # Total characters generated by the model
    instructions_failed = 0  # Count of responses that didn't follow CVSS format

    # Load the dataset (TSV format with CVE descriptions and prompts)
    data = pd.read_csv(file_path, encoding='utf-8', sep='\t')

    all_results = []

    # Process each prompt in dataset order; row numbers are 1-based
    for row_number, prompt in enumerate(data['Prompt'], start=1):
        try:
            # Get prediction from the specified model
            output = get_single_prediction(prompt, model_name)
            count_chars += len(output)

            # Try to extract CVSS vector from the response
            answer, success = format(output)
            if not success:
                instructions_failed += 1  # Model didn't follow CVSS format instructions

            # A failed extraction returns the raw, usually multi-line, response.
            # Flatten it so the results file keeps exactly one line per row.
            single_line_answer = ' '.join(answer.splitlines())
            all_results.append(single_line_answer)
            print(row_number, answer)
        except Exception as e:
            # Handle any API errors or model failures
            answer = 'Error'
            all_results.append(answer)
            print('Exception at row ', row_number)
            print(e)
            print(row_number, answer)

    # Calculate and display performance metrics
    time_taken = time.time() - start_time
    print('Time taken:', time_taken)
    print('#Characters generated:', count_chars)
    print('#Instructions failed:', instructions_failed)

    # Ensure output directory structure exists (no error if it already does)
    output_dir = os.path.join('responses', 'individual-results')
    os.makedirs(output_dir, exist_ok=True)

    # Save results to file with standardized naming convention
    # Format: SENG402_<dataset-name>_<model-name>_result.txt
    dataset_name = os.path.basename(file_path).split('.')[0]
    out_result = os.path.join(output_dir, 'SENG402_' + dataset_name + '_' + model_name + '_result.txt')
    with open(out_result, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_results))

    print('------- Done --------')

In [None]:
run_evaluation('../datasets/2024-and-2025-SMALL.tsv', 'gemini')

1 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
2 AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N
3 AV:N/AC:L/PR:N/UI:R/S:U/C:N/I:H/A:N
4 AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:N/A:N
5 AV:N/AC:L/PR:N/UI:R/S:U/C:N/I:N/A:H
6 AV:N/AC:L/PR:L/UI:N/S:C/C:H/I:H/A:H
Time taken: 40.646018505096436
#Characters generated: 10827
#Instructions failed: 0
------- Done --------
