# Notebook to run GPT, Gemini, Replicate models

## Install All Required Packages
Uncomment and run the cell below first to install all required packages if they are not already installed.

In [1]:
# Install required packages
# %pip install openai replicate python-dotenv pandas numpy matplotlib google.genai transformers torch

## Imports

In [None]:
from openai import OpenAI
import replicate

from google import genai
from google.genai import types
import pandas as pd
import numpy as np
import os
import time
import re
import matplotlib.pyplot as plt

from runningLLAMA import llama_local_generate




## Setup all APIs

In [None]:
# replicate
# Load key/value pairs from a local .env file into the process environment.
# The (commented-out) OpenAI and Gemini client cells below read their API keys
# through os.environ; presumably the Replicate client picks up its token from
# the environment as well -- TODO confirm which variable it expects.
# NOTE(review): `os` is already imported in the imports cell above.
import os
from dotenv import load_dotenv

load_dotenv()

False

In [None]:
# ChatGPT
# openai_client = OpenAI(api_key=os.environ.get("OPEN_AI_API_KEY"))

In [None]:
# Gemini
# project_id = ""   # add project ID and location
# vertexai.init(project=project_id, location="")

# gemini_client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))


## Prediction Params & Method

In [None]:
# set parameters for more deterministic output
# These module-level names are read as globals by get_single_prediction().
temperature = 0  # sampling temperature sent to every backend (the 'gemma' branch substitutes 0.01)
top_p = 1  # nucleus-sampling value sent to every backend
seed = 42  # passed to the Replicate and OpenAI calls only; the Gemini and local-LLaMA calls do not receive it
max_tokens = 2048  # generation cap; passed as max_tokens (max_output_tokens for Gemini)

In [None]:
# System prompt shared by all backends: sent as the 'system' message for the
# OpenAI models and prepended to the question (separated by one space) for the
# Replicate, Gemini and local-LLaMA calls.
sys_prompt = 'You are a cybersecurity expert specializing in cyberthreat intelligence.'

In [None]:
# Maps the short model names accepted by get_single_prediction() to the
# provider-specific model identifiers handed to the Replicate, Gemini and
# OpenAI clients.
# NOTE(review): get_single_prediction() also branches on names starting with
# 'mistral', 'gemma' and '01-ai', but no such keys exist here, so those names
# would raise KeyError on lookup -- add entries before using them.
model_mapping = {
    'llama3-70b': 'meta/meta-llama-3-70b-instruct',
    'llama3-8b': 'meta/meta-llama-3-8b-instruct',
    'gemini-2.0-flash': 'gemini-2.0-flash', 
    'gemini-2.5-pro': 'gemini-2.5-pro-exp-03-25', 
    'gpt3': 'gpt-3.5-turbo',
    'gpt4': 'gpt-4-turbo',
    'gpt-4o-mini': 'gpt-4o-mini',
}

In [None]:
def get_single_prediction(question, model_name):
    """Send one question to the backend selected by ``model_name`` and return its text.

    Dispatch is by name prefix:

    * ``llama3*``, ``mistral*``, ``gemma*``, ``01-ai*`` -> Replicate
    * ``gpt*``                                          -> OpenAI chat completions
    * ``gemini*``                                       -> Google GenAI
    * ``llama-local*``                                  -> ``llama_local_generate``

    Args:
        question: The user prompt (str).
        model_name: Short model name; for the hosted backends it must be a key
            of ``model_mapping``.

    Returns:
        The generated text. For Gemini, request failures are not raised but
        returned as a string starting with ``"Error: "``.

    Raises:
        ValueError: ``model_name`` matches none of the known prefixes.
        KeyError: ``model_name`` matches a hosted-backend prefix but has no
            entry in ``model_mapping``.
    """
    # 'llama3' is included so the 'llama3-*' entries of model_mapping are
    # actually routed somewhere; 'llama-local' does not start with 'llama3',
    # so the two never collide.
    replicate_prefixes = ('llama3', 'mistral', 'gemma', '01-ai')

    if model_name.startswith(replicate_prefixes):
        # replicate
        model = model_mapping[model_name]
        prompt = sys_prompt + ' ' + question
        # The gemma branch historically used 0.01 instead of the global
        # temperature (presumably the deployment rejects 0 -- TODO confirm).
        run_temperature = 0.01 if model_name.startswith('gemma') else temperature
        run_input = {
            'prompt': prompt,
            'max_tokens': max_tokens,
            'temperature': run_temperature,
            'top_p': top_p,
            'seed': seed,
        }
        # replicate.run yields the response in chunks; join them into one string.
        output = "".join(replicate.run(model, input=run_input))
    elif model_name.startswith('gpt'):
        # ChatGPT
        model = model_mapping[model_name]
        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {'role': 'system', 'content': sys_prompt},
                {'role': 'user', 'content': question}
            ],
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            seed=seed
        )
        output = response.choices[0].message.content
    elif model_name.startswith('gemini'):
        # Gemini
        model = model_mapping[model_name]

        # The system prompt is folded into the single user turn.
        contents = [
            types.Content(role="user", parts=[types.Part(text=sys_prompt + " " + question)])
        ]

        # Only block content rated HIGH in each harm category.
        safety_settings = [
            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH),
            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH),
            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH),
            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH),
        ]

        generation_config = types.GenerateContentConfig(
            temperature=temperature,
            top_p=top_p,
            max_output_tokens=max_tokens,
            safety_settings=safety_settings
        )

        # Every request goes through the retry loop. (An earlier version also
        # issued one unguarded request before the loop, which doubled API
        # usage and let rate-limit errors escape the handling below.)
        max_retries = 5
        for attempt in range(1, max_retries + 1):
            try:
                response = gemini_client.models.generate_content(
                    model=model,
                    contents=contents,
                    config=generation_config,
                )
                output = response.candidates[0].content.parts[0].text
                time.sleep(1)  # Regular delay between requests
                break
            except Exception as e:
                if "429 RESOURCE_EXHAUSTED" not in str(e):
                    # Any other failure is reported in-band, not retried.
                    output = f"Error: {str(e)}"
                    break
                print(f"Rate limit hit, retrying in 2 seconds... (attempt {attempt}/{max_retries})")
                time.sleep(2)  # Wait for 2 seconds as suggested by API
        else:
            # Loop finished without a break: every attempt was rate limited.
            output = f"Error: Rate limit exceeded after {max_retries} attempts."
    elif model_name.startswith('llama-local'):
        prompt = sys_prompt + ' ' + question
        output = llama_local_generate(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
    else:
        # Previously this fell through to `return output` and raised a
        # confusing UnboundLocalError.
        raise ValueError(f"Unknown model name: {model_name!r}")
    return output


#### Test

In [None]:
# Sample prompt for a quick smoke test: map a CVE description to a CWE ID
# (the kind of answer format_rcm() extracts).
question = (
    "Analyze the following CVE description and map it to the appropriate CWE. "
    "Provide a brief justification. The last line of your answer should only contain the CWE ID.\n\n"
    "CVE Description:\n\n"
    "Dell EMC CloudLink 7.1 and all prior versions contain an Improper Input Validation Vulnerability. "
    "A remote low privileged attacker may potentially exploit this vulnerability, "
    "leading to execution of arbitrary files on the server."
)

# Sample prompt for a quick smoke test: derive a CVSS v3.1 vector string
# (the kind of answer format_vsp() extracts).
question2 = (
    "Analyze the following CVE description and calculate the CVSS v3.1 Base Score. "
    "Determine the values for each base metric: AV, AC, PR, UI, S, C, I, and A. "
    "Summarize each metric's value and provide the final CVSS v3.1 vector string.   "
    "Valid options for each metric are as follows: \n"
    "- **Attack Vector (AV)**: Network (N), Adjacent (A), Local (L), Physical (P)\n"
    "- **Attack Complexity (AC)**: Low (L), High (H)\n"
    "- **Privileges Required (PR)**: None (N), Low (L), High (H)\n"
    "- **User Interaction (UI)**: None (N), Required (R)\n"
    "- **Scope (S)**: Unchanged (U), Changed (C)\n"
    "- **Confidentiality (C)**: None (N), Low (L), High (H)\n"
    "- **Integrity (I)**: None (N), Low (L), High (H)\n"
    "- **Availability (A)**: None (N), Low (L), High (H)\n"
    "Summarize each metric's value and provide the final CVSS v3.1 vector string. "
    "Ensure the final line of your response contains only the CVSS v3 Vector String "
    "in the following format:  \n"
    "Example format: CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H\n\n"
    "CVE Description: In the Linux kernel through 6.7.1, there is a use-after-free "
    "in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and "
    "drivers/media/cec/core/cec-api.c."
)

##### Are all the APIs working?

In [None]:
# print(get_single_prediction(question, 'gpt-4o-mini'))

In [None]:
# Smoke test for the local LLaMA backend. The imports cell must have been run
# first: llama_local_generate comes from the local runningLLAMA module, and
# calling this without it raises NameError.
print(get_single_prediction(question2, 'llama-local'))

NameError: name 'llama_local_generate' is not defined

In [337]:
# print(get_single_prediction(question, 'llama3-8b'))

# Run Evaluation for a Dataset

### All formatting comes here
While these formatters capture most of the output formats of the LLMs we studied, we still had to manually extract some answers from the generated response file.

In [338]:
def format_rcm(text):
    """Pull the final CWE identifier out of a model response.

    Returns a ``(value, found)`` pair: the last ``CWE-<digits>`` occurrence
    and ``True`` when at least one is present, otherwise the untouched
    ``text`` and ``False``.
    """
    cwe_ids = re.findall(r'CWE-\d+', text)
    if not cwe_ids:
        # Nothing recognisable: hand the raw response back for manual review.
        return text, False
    # The prompt asks for the answer on the last line, so the last hit wins.
    return cwe_ids[-1], True

def format_vsp(text):
    """Pull the final CVSS v3.1 base vector out of a model response.

    A vector is the eight metrics AV/AC/PR/UI/S/C/I/A in that order, each
    followed by a colon and one or more ASCII letters, separated by slashes.
    Any ``CVSS:3.1/`` prefix is not part of the extracted value.

    Returns a ``(value, found)`` pair: the last matching vector and ``True``,
    or the untouched ``text`` and ``False`` when no complete vector exists.
    """
    cvss_pattern = r'AV:[A-Za-z]+/AC:[A-Za-z]+/PR:[A-Za-z]+/UI:[A-Za-z]+/S:[A-Za-z]+/C:[A-Za-z]+/I:[A-Za-z]+/A:[A-Za-z]+'

    # Walk every match and remember only the most recent one.
    last_vector = None
    for hit in re.finditer(cvss_pattern, text):
        last_vector = hit.group(0)

    if last_vector is None:
        return text, False
    return last_vector, True

def _mcq_choice_from_line(line):
    """Return the option character found on ``line``, or None if no rule matches."""
    if line.startswith(('A)', 'B)', 'C)', 'D)')):
        return line[0]
    if line.endswith(('A', 'B', 'C', 'D')):
        return line[-1]
    # Markdown-bold answer such as "**B**": the character sits just before the
    # closing "**". The length guard avoids an IndexError on a bare "**".
    if line.endswith('**') and len(line) >= 3:
        return line[-3]
    return None


def format_mcq(text):
    """Extract the chosen option from a multiple-choice response.

    The last non-blank line of ``text`` is inspected (trailing whitespace
    ignored). It yields an answer when it starts with ``A)``..``D)``, ends
    with one of ``A``..``D``, or ends with ``**`` (in which case the character
    before the closing ``**`` is returned, whatever it is).

    Any number of trailing blank lines is tolerated, and empty input no longer
    raises ``IndexError``.

    Returns:
        The extracted character, or -- when no rule matches -- the whole
        response with newlines replaced by spaces so it can be reviewed
        manually.
    """
    lines = text.split('\n')
    for raw_line in reversed(lines):
        candidate = raw_line.rstrip()
        if not candidate:
            continue  # skip trailing blank lines
        choice = _mcq_choice_from_line(candidate)
        if choice is not None:
            return choice
        break  # only the last non-blank line is considered
    return ' '.join(lines)

def format_taa(text):
    """Flatten a threat-actor-attribution response onto a single line.

    No answer is extracted automatically; the attribution has to be read out
    of the flattened text by hand.
    """
    return text.replace('\n', ' ')

In [339]:
def run_evaluation(file_path, task, model_name):
    """Run ``model_name`` over every prompt in a TSV dataset and save the outputs.

    Args:
        file_path: Path to a tab-separated file with a ``Prompt`` column.
        task: One of ``'rcm'``, ``'vsp'``, ``'mcq'`` or ``'taa'``; selects the
            answer formatter.
        model_name: Model name understood by ``get_single_prediction``.

    Side effects:
        Prints each formatted answer plus timing/size statistics, and writes
        two files into ``responses-new/`` (created if missing):

        * ``SENG402_<dataset>_<model>_response.txt`` -- the raw responses, each
          preceded by a ``#####<row>#####`` header;
        * ``SENG402_<dataset>_<model>_result.txt`` -- one formatted answer per
          line. Newlines inside an answer are replaced by spaces so that line
          *k* always belongs to row *k*.

    A row whose prediction or formatting fails is recorded as ``'Error'`` in
    the results; the run continues with the next row.

    Raises:
        ValueError: ``task`` is not one of the supported task names. This is
            checked up front so no API calls are spent on a misconfigured run.
    """
    if task not in ('rcm', 'vsp', 'mcq', 'taa'):
        raise ValueError('Task unknown!')

    # Keep track of time and total #chars generated
    start_time = time.time()
    count_chars = 0
    instructions_failed = 0

    data = pd.read_csv(file_path, encoding='utf-8', sep='\t')

    # response contain the entire response, result the formatted result
    all_responses = []
    all_results = []

    for index, row in data.iterrows():
        prompt = row['Prompt']
        # Defaults used when the prediction (or its formatting) fails.
        response = 'Error'
        answer = 'Error'
        try:
            output = get_single_prediction(prompt, model_name)
            count_chars += len(output)
            # Keep the raw output even if formatting fails afterwards.
            response = output
            if task == 'rcm':
                answer, success = format_rcm(output)
                if not success:
                    instructions_failed += 1
            elif task == 'vsp':
                answer, success = format_vsp(output)
                if not success:
                    instructions_failed += 1
            elif task == 'mcq':
                answer = format_mcq(output)
            else:  # 'taa'
                answer = format_taa(output)
        except Exception as e:
            print('Exception at row ', index+1)
            print(e)
        # Exactly one response and one result per row, so both lists stay
        # aligned with the dataset (the old code could append twice on error).
        all_responses.append(response)
        all_results.append(answer)
        print(index+1, answer)

    time_taken = time.time() - start_time
    print('Time taken:', time_taken)
    print('#Characters generated:', count_chars)
    print('#Instructions failed:', instructions_failed)

    # Create responses-new directory if it doesn't exist
    output_dir = 'responses-new'
    os.makedirs(output_dir, exist_ok=True)

    # Save all the responses & results to the responses-new folder
    file_stem = 'SENG402_' + os.path.basename(file_path).split('.')[0] + '_' + model_name
    out_response = os.path.join(output_dir, file_stem + '_response.txt')
    out_result = os.path.join(output_dir, file_stem + '_result.txt')

    with open(out_response, 'w', encoding='utf-8') as f:
        f.write(''.join(
            f'#####{row_number}#####\n{raw_response}\n\n'
            for row_number, raw_response in enumerate(all_responses, start=1)
        ))
    with open(out_result, 'w', encoding='utf-8') as f:
        # format_rcm/format_vsp fall back to the raw multi-line response;
        # flatten it so the file keeps one line per dataset row.
        f.write('\n'.join(result.replace('\n', ' ') for result in all_results))

    print('------- Done --------')

In [340]:
# run_evaluation('datasets/cti-mcq.tsv', 'mcq', 'gpt3')

In [341]:
# run_evaluation('datasets/cti-rcm.tsv', 'rcm', 'gpt3')

In [None]:
# run_evaluation('../new-data/cti-vsp-only-2024-and-2025-BIG.tsv', 'vsp', 'gemini-2.0-flash')

1 AV:L/AC:H/PR:L/UI:N/S:C/C:H/I:H/A:H
2 AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N
3 AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
4 AV:N/AC:L/PR:L/UI:R/S:C/C:H/I:H/A:H
5 AV:N/AC:L/PR:N/UI:R/S:C/C:H/I:H/A:N
6 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
7 AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N
8 AV:N/AC:L/PR:L/UI:N/S:C/C:H/I:H/A:H
9 AV:N/AC:L/PR:N/UI:N/S:C/C:H/I:H/A:H
10 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
11 AV:N/AC:L/PR:N/UI:N/S:C/C:H/I:H/A:H
12 AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N
13 AV:N/AC:L/PR:N/UI:N/S:C/C:H/I:H/A:H
14 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
15 AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N
16 AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H
17 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
18 AV:N/AC:L/PR:N/UI:N/S:C/C:N/I:H/A:H
19 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
20 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
21 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
22 AV:L/AC:H/PR:N/UI:R/S:U/C:L/I:L/A:L
23 AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N
24 AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:L/A:N
25 AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
26 AV:N/AC:H/PR:N/UI:R/S:C/C:H/I:H

In [32]:
# run_evaluation('datasets/cti-taa.tsv', 'taa', 'gpt3')