Prompt

In [None]:
import json
import sys
import openai
import os
sys.path.append(os.path.dirname(os.getcwd()))
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaTokenizerFast
import matplotlib.pyplot as plt
from openai import OpenAI
def cve_keywordlist(CVE, model="gpt-3.5-turbo-1106"):
    """Ask an OpenAI chat model for a structured keyword list describing a CVE.

    The prompt contains three keyword-extraction tips and one worked example,
    then embeds ``CVE`` (serialized with ``json.dumps``) and asks for an
    answer shaped like the example result.

    Args:
        CVE: JSON-serializable description of the CVE. ``transform_entry`` in
            this file passes the plain description string.
        model: Name of the chat model to query. The default is the model the
            notebook used; ``"gpt-4-1106-preview"`` was the alternative the
            original code kept commented out.

    Returns:
        The message content of the first completion choice. JSON mode is
        requested through ``response_format``, so callers parse the result
        with ``json.loads``.
    """
    import os  # local import keeps this cell runnable on its own

    cve_query = f"""I want you to create a proper keyword list for a CVE, I will give you a list of tips for proper keyword list creation which are given below between triple backticks. Then I will give you and example CVE and the example keyword list in JSON, to help you understand the task better. Your result should also have 3 related keywords per item like the example result. Lastly I will give you a CVE and you have to give me the keyword list in JSON format.
    Tips: ```
    1. Text Normalization: Standardize the text by converting it to lowercase, removing special characters, and expanding abbreviations. This helps in reducing variations in the text.

    2. Abstraction: given the CVE description, create an abstraction of it by leaving aside the terms that are too specific (e.g. if a CVE is specifically targeting a cisco router, this detail is not important)

    3. Keyword Extraction: Identify and extract key terms and concepts from both the CVE and CWE descriptions. For instance, phrases like "incorrectly handles a length field" or "buffer overflow" are critical.

    ```

    CVE_example: {{
        "CVE description": "Patient Information Center iX (PICiX) Versions B.02, PerformanceBridge Focal Point Version A.01, IntelliVue patient monitors MX100 IntelliVue X3 and X2 Versions N and prior. The software parses a formatted message or structure but does not handle or incorrectly handles a length field that is inconsistent with the actual length of the associated data, causing the application on the surveillance station to restart."
    }}      
    CVE_example result: {{
        "core_terms": ["Parsing Vulnerability", "Inconsistent Length Field Handling", "Application Restart"],
        "contextual_terms": ["Length field", "Inconsistency", "Improper handling" ],
        "consequences": ["Unexpected restart", "Data length mismatch", "Application instability"]}}

    CVE: {json.dumps(CVE, indent=4)}

    Give the result like the example result in a JSON list, with proper representation of the keywords list applicable for this CVE.
    """

    # Prefer the key from the environment so the secret does not have to live
    # in the notebook; the original literal stays as a fallback so existing
    # setups keep working unchanged.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'sk-...'))

    # Get CVE keyword list
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful cybersecurity expert designed to help me standardize CVE keyword lists, so they can be accurately compared to Common Weakness Enumerations (CWE)."},
            {"role": "user", "content": cve_query},
        ],
    )
    return response.choices[0].message.content

Embed

In [None]:
import time
import pickle
import json
from concurrent.futures import ThreadPoolExecutor
import tqdm
def transform_entry(entry, max_retries=3, retry_delay=20):
    """Attach an LLM-generated keyword list to a single CVE entry.

    Calls ``cve_keywordlist`` on the entry's description, parses the JSON
    answer and returns a new dict that carries the original fields plus the
    parsed keywords under ``'cve_terms'``.

    The earlier version retried by calling itself recursively without
    returning the recursive result, so an entry that failed once always came
    back as ``None`` even when the retry succeeded, and a permanently failing
    entry recursed without bound. Retries are now a bounded loop whose result
    is returned.

    Args:
        entry: Mapping with the keys ``cve_id``, ``cve_description``,
            ``cve_descr_no_subj``, ``cve_descr_replaced_subj``, ``cwe`` and
            ``cwe_class``.
        max_retries: Number of additional attempts after the first failure.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The transformed dict, or ``None`` when every attempt failed.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            cve_processed = json.loads(cve_keywordlist(entry['cve_description']))
            return {
                'cve_id': entry['cve_id'],
                'cve_description': entry['cve_description'],
                'cve_terms': cve_processed,
                'cve_descr_no_subj': entry['cve_descr_no_subj'],
                'cve_descr_replaced_subj': entry['cve_descr_replaced_subj'],
                'cwe': entry['cwe'],
                'cwe_class': entry['cwe_class'],
            }
        except Exception as e:
            # Broad on purpose: this runs as a worker task and one bad entry
            # (API error, invalid JSON, missing key) must not abort the batch.
            print(f"Error processing entry: {entry}, Error: {e}")
            if attempt < attempts - 1:
                # Back off before the next attempt.
                time.sleep(retry_delay)
    return None

def transform_json(input_file, output_file, dump_interval=2000):
    """Run ``transform_entry`` over a pickled collection using a thread pool.

    Despite the name, both files are pickle files. Every entry is submitted
    to a pool of 10 workers; results are then read back in submission order,
    ``None`` results are dropped, and the accumulated list is written to
    ``output_file`` every ``dump_interval`` kept results and once more at the
    end.

    Args:
        input_file: Path of the pickle holding the entries to process.
        output_file: Path the pickled result list is written to.
        dump_interval: Number of kept results between intermediate dumps.
    """
    # NOTE: pickle.load can execute code embedded in the file; only feed this
    # files you produced yourself.
    with open(input_file, 'rb') as source:
        entries = pickle.load(source)

    def checkpoint(rows):
        # Overwrites the output file with everything collected so far.
        with open(output_file, 'wb') as sink:
            pickle.dump(rows, sink)

    collected = []
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Queue every entry up front; tqdm then tracks how many have been read back.
        pending = tqdm.tqdm([pool.submit(transform_entry, item) for item in entries])

        for task in pending:
            outcome = task.result()
            if outcome is None:
                continue
            collected.append(outcome)

            # Periodic snapshot so a crash does not lose all progress.
            if len(collected) % dump_interval == 0:
                checkpoint(collected)

    # Final, complete snapshot.
    checkpoint(collected)

# Driver for the keyword stage: read the entries from 'replacement.pickle',
# add the generated 'cve_terms' to each one and store the result.
input_file_path = 'replacement.pickle'
output_file_path = 'replacement_with_terms.pickle'
transform_json(input_file_path, output_file_path)

Verb identification

In [None]:
import spacy

# Load the spaCy pipeline once at cell level; extract_svos reuses it for
# every sentence instead of reloading the model per call.
nlp = spacy.load("en_core_web_sm")

def extract_svos(sentence):
    """Pair VERB tokens with the non-verb text that precedes them.

    The sentence is run through the module-level ``nlp`` pipeline. Tokens
    whose ``pos_`` is not ``"VERB"`` are accumulated, each followed by one
    space. When a VERB token is reached and something has been accumulated,
    a triple ``(accumulated_text, verb_text, "")`` is emitted and the
    accumulator is cleared. A VERB with nothing accumulated before it is
    skipped. The third slot of every triple is always the empty string.

    Args:
        sentence: Text to analyse.

    Returns:
        List of ``(subject_text, verb_text, "")`` tuples in sentence order;
        ``subject_text`` keeps its trailing space.
    """
    triples = []
    pending = []  # non-verb token texts seen since the last emitted triple

    for tok in nlp(sentence):
        if tok.pos_ != "VERB":
            pending.append(tok.text + " ")
            continue
        if pending:
            triples.append(("".join(pending), tok.text, ""))
            pending = []

    return triples

In [None]:
import tqdm
def _text_from_verb(sentence, verb):
    """Return ``sentence`` from the first whole-word occurrence of ``verb`` on, stripped."""
    import re  # local import: the notebook does not import re at cell level

    # Lookarounds instead of a plain substring search, so that the verb
    # "use" is not matched inside "user".
    hit = re.search(r"(?<!\w)" + re.escape(verb) + r"(?!\w)", sentence)
    # Fall back to the plain search when the verb token is glued to other
    # word characters and therefore has no whole-word match.
    start = hit.start() if hit else sentence.find(verb)
    return sentence[start:].strip()


def transform_json2(input_file, output_file):
    """Rebuild the subject-less and subject-replaced description of each entry.

    For every entry the first triple returned by ``extract_svos`` supplies
    the first verb. The description from that verb onward becomes
    ``'cve_descr_no_subj'`` and the same text prefixed with
    ``"This vulnerability "`` becomes ``'cve_descr_replaced_subj'``. Entries
    for which ``extract_svos`` returns nothing are dropped.

    Changes relative to the earlier version:
      * The error handler used ``json.dump`` on a file opened in binary
        mode, which raised ``TypeError`` after truncating the output. It now
        writes a pickle checkpoint and moves on to the next entry.
      * The bare ``except`` is narrowed to ``KeyError``, the only error the
        guarded dict construction can raise for a mapping entry.
      * The verb is located as a whole word rather than as a substring.

    Args:
        input_file: Path of the pickle holding the entries; each entry needs
            ``cve_id``, ``cve_description``, ``cve_terms``, ``cwe`` and
            ``cwe_class``.
        output_file: Path the pickled result list is written to.
    """
    # NOTE: pickle.load can execute code embedded in the file; only feed this
    # files you produced yourself.
    with open(input_file, 'rb') as infile:
        original_json = pickle.load(infile)

    transformed_data_list = []
    for entry in tqdm.tqdm(original_json):
        sentence = entry['cve_description']
        svos = extract_svos(sentence)
        if not svos:
            # No verb preceded by other text: nothing to cut at.
            continue

        first_verb = svos[0][1]
        text_after_subject = _text_from_verb(sentence, first_verb)

        try:
            transformed_data = {
                'cve_id': entry['cve_id'],
                'cve_description': entry['cve_description'],
                'cve_terms': entry['cve_terms'],
                'cve_descr_no_subj': text_after_subject,
                'cve_descr_replaced_subj': "This vulnerability " + text_after_subject,
                'cwe': entry['cwe'],
                'cwe_class': entry['cwe_class'],
            }
        except KeyError as e:
            # Incomplete entry: report it, save what has been built so far
            # and keep going with the remaining entries.
            print(f"Skipping entry, missing key: {e}")
            with open(output_file, 'wb') as out:
                pickle.dump(transformed_data_list, out)
        else:
            transformed_data_list.append(transformed_data)

    with open(output_file, 'wb') as out:
        pickle.dump(transformed_data_list, out)

Embeddings in multithreading

In [1]:
import pickle
import time
from openai import OpenAI
import tqdm
from concurrent.futures import ThreadPoolExecutor
def embed(entry):
    """Compute the ada-002 embeddings for one entry and return the enriched copy.

    Seven texts are embedded with ``text-embedding-ada-002``: the full
    description, the description without subject, the description with the
    subject replaced, and four ``str()`` renderings of keyword lists (all
    three term groups, core only, core + contextual, core + consequences).

    The earlier version did not parse: the result dict was missing the comma
    after the ``'cve_description_ada_embedding'`` item. It also kept an
    unused local list and hard-coded the API key.

    Args:
        entry: Mapping with ``cve_id``, ``cve_description``,
            ``cve_descr_no_subj``, ``cve_descr_replaced_subj``, ``cwe``,
            ``cwe_class`` and ``cve_terms``; the latter must hold the lists
            ``core_terms``, ``contextual_terms`` and ``consequences``.

    Returns:
        A new dict with the original fields plus the seven
        ``*_ada_embedding`` vectors, or ``None`` when anything failed (the
        error is printed and the worker pauses 20 seconds first).
    """
    import os  # local import keeps this cell runnable on its own

    modelemb = "text-embedding-ada-002"
    # Prefer the key from the environment so the secret does not have to live
    # in the notebook; the original literal stays as a fallback.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'sk-...'))

    def ada(text):
        # One embedding request for one text; returns the raw vector.
        return client.embeddings.create(input=[text], model=modelemb).data[0].embedding

    try:
        # Build the keyword strings first so a malformed entry fails before
        # any API call is spent on it.
        terms = entry['cve_terms']
        core = terms["core_terms"]
        contextual = terms["contextual_terms"]
        consequences = terms["consequences"]
        concat_cve_terms = str(core + contextual + consequences)
        concat_core_contextual = str(core + contextual)
        concat_core_consequences = str(core + consequences)

        cve_description_ada = ada(entry['cve_description'])
        cve_description_no_sub_ada = ada(entry['cve_descr_no_subj'])
        cve_description_replaced_sub_ada = ada(entry['cve_descr_replaced_subj'])
        cve_terms_ada = ada(concat_cve_terms)
        cve_core_terms_ada = ada(str(core))
        cve_core_contextual_terms_ada = ada(concat_core_contextual)
        cve_core_consequences_terms_ada = ada(concat_core_consequences)
        print(concat_cve_terms)

        return {
            'cve_id': entry['cve_id'],
            'cve_description': entry['cve_description'],
            'cve_terms': entry['cve_terms'],
            'cve_descr_no_subj': entry['cve_descr_no_subj'],
            'cve_descr_replaced_subj': entry['cve_descr_replaced_subj'],
            'cwe': entry['cwe'],
            'cwe_class': entry['cwe_class'],
            'cve_description_ada_embedding': cve_description_ada,
            'cve_description_no_subject_ada_embedding': cve_description_no_sub_ada,
            'cve_description_replaced_subject_ada_embedding': cve_description_replaced_sub_ada,
            'cve_terms_ada_embedding': cve_terms_ada,
            'cve_core_ada_embedding': cve_core_terms_ada,
            'cve_core_contextual_ada_embedding': cve_core_contextual_terms_ada,
            'cve_core_consequences_ada_embedding': cve_core_consequences_terms_ada,
        }
    except Exception as e:
        # Broad on purpose: this runs as a worker task and one bad entry must
        # not abort the batch. The entry is not retried; the pause only
        # throttles this worker before it picks up the next task.
        print(f"Error processing entry: {entry}, Error: {e}")
        time.sleep(20)
        return None

def transform_json(input_file, output_file):
    """Embed every entry of a pickled collection in parallel and pickle the result.

    Despite the name, both files are pickle files. Each entry is handed to
    ``embed`` on a default-sized thread pool; results are read back in
    submission order and entries for which ``embed`` returned ``None`` are
    left out of the output.

    Args:
        input_file: Path of the pickle holding the entries to embed.
        output_file: Path the pickled list of embedded entries is written to.
    """
    # NOTE: pickle.load can execute code embedded in the file; only feed this
    # files you produced yourself.
    with open(input_file, 'rb') as source:
        entries = pickle.load(source)

    with ThreadPoolExecutor() as pool:
        # Queue everything first, then drain under a progress bar.
        pending = tqdm.tqdm([pool.submit(embed, item) for item in entries])
        outcomes = (task.result() for task in pending)
        embedded = [row for row in outcomes if row is not None]

    # Single write once all workers are done.
    with open(output_file, 'wb') as sink:
        pickle.dump(embedded, sink)

# Driver for the embedding stage: embed every entry of
# 'replacement_with_terms2.pickle' and store the enriched entries.
input_file_path = 'replacement_with_terms2.pickle'
output_file_path = 'replacement_with_terms2_core_consequences.pickle'
transform_json(input_file_path, output_file_path)

100%|██████████| 3784/3784 [07:47<00:00,  8.09it/s]
