# Notebook to run GPT, Gemini, Grok, Llama, and Replicate models

## Install All Required Packages
Uncomment and run the cell below first to install the required packages if they are not already installed.

In [1]:
# Install required packages
# %pip install openai replicate python-dotenv pandas numpy matplotlib google.genai transformers torch bitsandbytes llamaapi accelerate

# Notebook to run GPT, Gemini, Replicate models

In [2]:
from openai import OpenAI
import replicate

from google import genai
from google.genai import types
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt
from llamaapi import LlamaAPI

from runningLLAMA import llama_local_generate




tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

## Setup all APIs

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
# Llama API client: the OpenAI SDK client pointed at a custom base_url and
# authenticated with the LLAMA_API_KEY environment variable (loaded via dotenv).
# Used by the 'api-llama3.1-8b' branch of get_single_prediction.
llama_client = OpenAI(
api_key = os.getenv("LLAMA_API_KEY"),
base_url = "https://api.llmapi.com"
)

In [5]:
# Grok client: the OpenAI SDK client pointed at the x.ai base_url and
# authenticated with the GROK_API_KEY environment variable.
# Used by the 'grok-3-beta' branch of get_single_prediction.
grok_client = OpenAI(
  api_key=os.getenv("GROK_API_KEY"),
  base_url="https://api.x.ai/v1",
)


In [6]:
# ChatGPT
# openai_client = OpenAI(api_key=os.environ.get("OPEN_AI_API_KEY"))

In [7]:
# Gemini
# project_id = ""   # add project ID and location
# vertexai.init(project=project_id, location="")

# gemini_client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))


## Prediction Params & Method

In [8]:
# set parameters for more deterministic output
# These module-level values are read by get_single_prediction for every backend.
# Some branches there pass 0.01 instead of `temperature` (gemma, llama-local,
# api-llama3.1-8b), and the Gemini request is built without `seed`.
temperature = 0
top_p = 1
seed = 42
# Upper bound on generated tokens (passed as max_tokens / max_output_tokens).
max_tokens = 2048

In [9]:
# Shared system prompt. Chat-style backends send it as the 'system' message;
# the Replicate and Gemini branches prepend it to the question text instead.
sys_prompt = 'You are a cybersecurity expert specializing in cyberthreat intelligence.'

In [None]:
# Maps the short model names passed to get_single_prediction to the model
# identifiers sent to each provider.
# NOTE(review): get_single_prediction dispatches on name prefixes. No branch
# there handles the 'llama3-' prefix, so the first two entries are currently
# unreachable; conversely, the 'mistral', 'gemma' and '01-ai' branches have no
# entries here and would fail the lookup. Confirm which set is intended.
model_mapping = {
    'llama3-70b': 'meta/meta-llama-3-70b-instruct',
    'llama3-8b': 'meta/meta-llama-3-8b-instruct',
    'api-llama3.1-8b': 'llama3.1-8b',
    'gemini-2.0-flash': 'gemini-2.0-flash', 
    'gemini-2.5-pro': 'gemini-2.5-pro-exp-03-25', 
    'gpt3': 'gpt-3.5-turbo',
    'gpt4': 'gpt-4-turbo',
    'gpt-4o-mini': 'gpt-4o-mini',
    'grok-3-beta': 'grok-3-beta',
}

In [11]:
def _replicate_generate(model, question, sampling_temperature):
    """Run a Replicate-hosted model and return its output chunks joined into one string."""
    payload = {
        'prompt': sys_prompt + ' ' + question,
        'max_tokens': max_tokens,
        'temperature': sampling_temperature,
        'top_p': top_p,
        'seed': seed,
    }
    return "".join(replicate.run(model, input=payload))


def _chat_completion(client, model, question, sampling_temperature):
    """Send a system + user chat request through an OpenAI-SDK client and return the reply text."""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': sys_prompt},
            {'role': 'user', 'content': question}
        ],
        temperature=sampling_temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        seed=seed
    )
    return response.choices[0].message.content


def _gemini_generate(model, question, max_retries=5):
    """Query Gemini once per attempt, retrying only on 429 rate-limit errors.

    Returns the generated text, or a string starting with "Error:" when the
    request fails for a non-rate-limit reason or the retries are exhausted.
    """
    # The system prompt is folded into the single user message.
    contents = [
        types.Content(role="user", parts=[types.Part(text=sys_prompt + " " + question)])
    ]

    blocked_categories = (
        types.HarmCategory.HARM_CATEGORY_HARASSMENT,
        types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
    safety_settings = [
        types.SafetySetting(category=category, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH)
        for category in blocked_categories
    ]

    generation_config = types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        max_output_tokens=max_tokens,
        safety_settings=safety_settings
    )

    # The request is issued only inside the retry loop. A previous version also
    # sent one unguarded request before the loop, which doubled every call and
    # bypassed the rate-limit handling.
    retry_count = 0
    while retry_count < max_retries:
        try:
            response = gemini_client.models.generate_content(
                model=model,
                contents=contents,
                config=generation_config,
            )
            output = response.candidates[0].content.parts[0].text
            time.sleep(1)  # Regular delay between requests
            return output
        except Exception as e:
            if "429 RESOURCE_EXHAUSTED" not in str(e):
                return f"Error: {str(e)}"
            retry_count += 1
            print(f"Rate limit hit, retrying in 2 seconds... (attempt {retry_count}/{max_retries})")
            time.sleep(2)  # Wait for 2 seconds as suggested by API
    return f"Error: Rate limit exceeded after {max_retries} attempts."


def get_single_prediction(question, model_name):
    """Generate one response to `question` from the backend selected by `model_name`.

    The backend is chosen by the prefix of `model_name`:
      - 'mistral', 'gemma', '01-ai': Replicate
      - 'gpt': OpenAI chat completions (`openai_client`)
      - 'gemini': Google Gen AI (`gemini_client`)
      - 'llama-local': local model via `llama_local_generate`
      - 'api-llama3.1-8b': Llama API (`llama_client`)
      - 'grok-3-beta': x.ai (`grok_client`)

    Every branch except 'llama-local' resolves the provider model id through
    `model_mapping`. The client object for the selected branch must already be
    defined in the kernel.

    Args:
        question: The user prompt text.
        model_name: Short model name; see the prefixes above.

    Returns:
        The generated text. For Gemini, API failures are returned as a string
        beginning with "Error:" instead of being raised.

    Raises:
        ValueError: If `model_name` matches none of the supported prefixes.
        KeyError: If a supported prefix has no entry in `model_mapping`.
    """
    if model_name.startswith(('mistral', '01-ai')):
        model = model_mapping[model_name]
        return _replicate_generate(model, question, temperature)

    if model_name.startswith('gemma'):
        model = model_mapping[model_name]
        # This branch uses 0.01 rather than the global temperature; presumably
        # the model rejects a temperature of exactly 0 -- TODO confirm.
        return _replicate_generate(model, question, 0.01)

    if model_name.startswith('gpt'):
        model = model_mapping[model_name]
        return _chat_completion(openai_client, model, question, temperature)

    if model_name.startswith('gemini'):
        model = model_mapping[model_name]
        return _gemini_generate(model, question)

    if model_name.startswith('llama-local'):
        # temperature must be strictly positive (> 0) or will get error
        return llama_local_generate(sys_prompt, question, max_tokens=max_tokens, temperature=0.01, top_p=top_p, seed=seed)

    if model_name.startswith('api-llama3.1-8b'):
        model = model_mapping[model_name]
        return _chat_completion(llama_client, model, question, 0.01)

    if model_name.startswith('grok-3-beta'):
        model = model_mapping[model_name]
        return _chat_completion(grok_client, model, question, temperature)

    # Previously an unmatched name fell through to `return output` with
    # `output` never assigned, surfacing as an opaque UnboundLocalError.
    raise ValueError(f"Unsupported model name: {model_name!r}")


#### Test

In [12]:
# Sample prompt for the CVE -> CWE mapping task; asks for the CWE ID alone on
# the last line of the answer.
question = (
    "Analyze the following CVE description and map it to the appropriate CWE. "
    "Provide a brief justification. The last line of your answer should only contain the CWE ID.\n\n"
    "CVE Description:\n\n"
    "Dell EMC CloudLink 7.1 and all prior versions contain an Improper Input Validation Vulnerability. "
    "A remote low privileged attacker may potentially exploit this vulnerability, "
    "leading to execution of arbitrary files on the server."
)

# Sample prompt for the CVSS v3.1 vector-string task; asks for the vector
# string alone on the final line of the answer.
# NOTE(review): the "Summarize each metric's value ..." sentence appears twice
# in this prompt; left untouched because the text is sent to the model as-is.
question2 = (
    "Analyze the following CVE description and calculate the CVSS v3.1 Base Score. "
    "Determine the values for each base metric: AV, AC, PR, UI, S, C, I, and A. "
    "Summarize each metric's value and provide the final CVSS v3.1 vector string.   "
    "Valid options for each metric are as follows: \n"
    "- **Attack Vector (AV)**: Network (N), Adjacent (A), Local (L), Physical (P)\n"
    "- **Attack Complexity (AC)**: Low (L), High (H)\n"
    "- **Privileges Required (PR)**: None (N), Low (L), High (H)\n"
    "- **User Interaction (UI)**: None (N), Required (R)\n"
    "- **Scope (S)**: Unchanged (U), Changed (C)\n"
    "- **Confidentiality (C)**: None (N), Low (L), High (H)\n"
    "- **Integrity (I)**: None (N), Low (L), High (H)\n"
    "- **Availability (A)**: None (N), Low (L), High (H)\n"
    "Summarize each metric's value and provide the final CVSS v3.1 vector string. "
    "Ensure the final line of your response contains ONLY the CVSS v3 Vector String (no other text) "
    "in the following format:  \n"
    "Example format: CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H\n\n"
    "CVE Description: In the Linux kernel through 6.7.1, there is a use-after-free "
    "in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and "
    "drivers/media/cec/core/cec-api.c."
)

##### Are all the APIs working?

In [None]:
# print(get_single_prediction(question2, 'llama-local'))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Based on the provided CVE description, I will analyze the base metrics for the CVSS v3.1 score.

**Attack Vector (AV)**: Since the vulnerability is in the Linux kernel, an attacker would need to have access to the system or network to exploit this vulnerability. Therefore, the Attack Vector is **Network (N)**.

**Attack Complexity (AC)**: The complexity of exploiting this vulnerability is considered **Low (L)**, as it is a use-after-free vulnerability, which can be exploited with a relatively simple attack.

**Privileges Required (PR)**: To exploit this vulnerability, an attacker would need to have some level of access to the system, but not necessarily administrative privileges. Therefore, the Privileges Required are **Low (L)**.

**User Interaction (UI)**: This vulnerability can be exploited without any user interaction, as it is a kernel-level vulnerability. Therefore, the User Interaction is **None (N)**.

**Scope (S)**: The Scope of this vulnerability is **Unchanged (U)**, as it d

In [58]:
# print(get_single_prediction(question2, 'grok-3-beta'))

In [59]:
# print(get_single_prediction(question, 'llama3-8b'))

# Run Evaluation for a Dataset

### All formatting comes here
While these formatters capture most of the output formats produced by the LLMs we studied, we still had to manually collect some answers from the generated response files.

In [60]:
def format_rcm(text):
    """Extract the last CWE identifier (e.g. ``CWE-79``) mentioned in `text`.

    Returns:
        ``(cwe_id, True)`` when at least one ``CWE-<digits>`` token is present,
        otherwise ``(text, False)`` with the input returned unchanged.
    """
    last_cwe = None
    # Walk every occurrence and keep overwriting, so the final one wins.
    for hit in re.finditer(r'CWE-\d+', text):
        last_cwe = hit.group(0)

    if last_cwe is None:
        return text, False
    return last_cwe, True

def format_vsp(text):
    """Extract the last CVSS base-metric vector (``AV:../AC:../.../A:..``) from `text`.

    Each metric value is matched as a run of ASCII letters, so trailing
    punctuation or markdown after the vector is not captured. The ``CVSS:3.1/``
    prefix is not part of the match.

    Returns:
        ``(vector, True)`` when a vector is found, otherwise ``(text, False)``
        with the input returned unchanged.
    """
    vector_regex = r'AV:[A-Za-z]+/AC:[A-Za-z]+/PR:[A-Za-z]+/UI:[A-Za-z]+/S:[A-Za-z]+/C:[A-Za-z]+/I:[A-Za-z]+/A:[A-Za-z]+'

    last_vector = None
    # Keep the final occurrence: models often restate the vector at the end.
    for hit in re.finditer(vector_regex, text):
        last_vector = hit.group(0)

    if last_vector is None:
        return text, False
    return last_vector, True

def format_mcq(text):
    """Extract the chosen option letter (A-D) from a multiple-choice response.

    The last non-blank line of `text` is inspected, in this order:
      1. it starts with ``A)``, ``B)``, ``C)`` or ``D)`` -> that letter;
      2. it ends with ``A``, ``B``, ``C`` or ``D`` -> that letter;
      3. it ends with ``**`` (markdown bold) -> the character just before it.

    Returns:
        The extracted character, or, when nothing matches, the whole response
        with newlines replaced by spaces so it can be reviewed manually.
    """
    option_prefixes = ('A)', 'B)', 'C)', 'D)')
    option_letters = ('A', 'B', 'C', 'D')

    lines = text.split('\n')

    # Use the last line that still has content after stripping trailing
    # whitespace. This tolerates any number of trailing blank lines and avoids
    # the IndexError the fixed [-2] lookup raised on empty/one-line input.
    for raw_line in reversed(lines):
        candidate = raw_line.rstrip()
        if not candidate:
            continue
        if candidate.startswith(option_prefixes):
            return candidate[0]
        if candidate.endswith(option_letters):
            return candidate[-1]
        # Length guard: a line that is exactly '**' has no character at [-3].
        if candidate.endswith('**') and len(candidate) >= 3:
            return candidate[-3]
        break

    return ' '.join(lines)

def format_taa(text):
    """Flatten a threat-actor-attribution response onto a single line.

    No answer is extracted here; the attribution has to be pulled out manually
    from the returned text.
    """
    single_line = text.replace('\n', ' ')
    return single_line

In [None]:
def run_evaluation(file_path, task, model_name):
    """Run `model_name` over every prompt in a TSV dataset and save the outputs.

    The dataset is read as tab-separated UTF-8 and must contain a 'Prompt'
    column. For each row the model response is generated and reduced to an
    answer with the formatter matching `task`. Two files are written under
    'responses-new/' (created if missing):
      - ``SENG402_<dataset>_<model_name>_response.txt``: every raw response,
        each preceded by a ``#####<row>#####`` header;
      - ``SENG402_<dataset>_<model_name>_result_FORINSTRUCT.txt``: one
        formatted answer per line, in dataset order.

    A row whose generation or formatting raises is recorded with the answer
    'Error' and the run continues.

    Args:
        file_path: Path to the TSV dataset.
        task: One of 'rcm', 'vsp', 'mcq', 'taa'.
        model_name: Model name understood by get_single_prediction.

    Raises:
        ValueError: If `task` is not one of the supported tasks. This is raised
            before any model call instead of once per row after the call.
    """
    if task not in ('rcm', 'vsp', 'mcq', 'taa'):
        raise ValueError('Task unknown!')

    # Keep track of time and total #chars generated
    start_time = time.time()
    count_chars = 0
    instructions_failed = 0

    data = pd.read_csv(file_path, encoding='utf-8', sep='\t')

    # response contain the entire response, result the formatted result
    all_responses = []
    all_results = []

    for index, row in data.iterrows():
        prompt = row['Prompt']
        # Exactly one response and one result are recorded per row, so the
        # numbering in both output files always lines up with the dataset.
        response_text = 'Error'
        answer = 'Error'
        try:
            output = get_single_prediction(prompt, model_name)

            count_chars += len(output)
            response_text = output
            if task == 'rcm':
                answer, success = format_rcm(output)
                if not success:
                    instructions_failed += 1
            elif task == 'vsp':
                answer, success = format_vsp(output)
                if not success:
                    instructions_failed += 1
            elif task == 'mcq':
                answer = format_mcq(output)
            else:  # 'taa' (task was validated above)
                answer = format_taa(output)

            # format_rcm / format_vsp hand back the raw response when nothing
            # could be extracted; flatten it so the result file keeps one line
            # per dataset row.
            answer = answer.replace('\n', ' ')
        except Exception as e:
            answer = 'Error'
            print('Exception at row ', index+1)
            print(e)
        all_responses.append(response_text)
        all_results.append(answer)
        print(index+1, answer)

    time_taken = time.time() - start_time
    print('Time taken:', time_taken)
    print('#Characters generated:', count_chars)
    print('#Instructions failed:', instructions_failed)

    # Create responses-new directory if it doesn't exist
    output_dir = 'responses-new'
    os.makedirs(output_dir, exist_ok=True)

    # Save all the responses & results to the responses-new folder
    dataset_stem = os.path.basename(file_path).split('.')[0]
    out_response = os.path.join(output_dir, 'SENG402_' + dataset_stem + '_' + model_name + '_response.txt')
    out_result = os.path.join(output_dir, 'SENG402_' + dataset_stem + '_' + model_name + '_result_FORINSTRUCT.txt')

    with open(out_response, 'w', encoding='utf-8') as f:
        for row_number, response in enumerate(all_responses, start=1):
            f.write('#####' + str(row_number) + '#####\n')
            f.write(response)
            f.write('\n\n')
    with open(out_result, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_results))

    print('------- Done --------')

In [44]:
# Evaluate the local Llama backend ('llama-local') on the CVSS vector-string
# ('vsp') task using the TSV dataset below; outputs go to 'responses-new/'.
run_evaluation('../new-data/cti-vsp-only-2024-and-2025-BIG.tsv', 'vsp', 'llama-local')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


3 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


4 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
5 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


6 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


7 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


8 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


9 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


10 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


11 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
12 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


13 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


14 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


15 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


16 AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


17 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


18 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H


KeyboardInterrupt: 