# CTI-MCQ

In [17]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import google.generativeai as genai


with open('gemini_key.txt', 'r') as file:
    GOOGLE_API_KEY = file.read().strip()
        
    genai.configure(api_key=GOOGLE_API_KEY)
        
    model = genai.GenerativeModel('gemini-1.5-flash')

with open('cti_mcq_generated.tsv', 'a') as file:
    file.write('URL' + '\t' + 'Question' + '\t' + 'Option A' + '\t' + 'Option B' + '\t' + 'Option C' + '\t' + 'Option D' 
               + '\t' + 'GT' + '\t' + 'Prompt' + '\n')
    

url = 'https://attack.mitre.org'
    
# Fetch the webpage
response = requests.get(url)

webpage_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(webpage_content, 'html.parser')

div_element = soup.find_all('div', class_="technique-cell")

# Find the a tag within the div element
for div in div_element:
    a_tag = div.find('a')
    
    # Get the text in the href attribute
    href_text = a_tag['href']
    
    response = requests.get('https://attack.mitre.org'+href_text)
    
    # Check if the request was successful
    if response.status_code == 200:
        webpage_content = response.text
        print('Webpage fetched successfully', href_text)
    
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(webpage_content, 'html.parser')

        try:
            paragraphe = soup.find_all('div', class_="description-body")[0].get_text().strip()
        except:
            continue
    
        # Find all tables with class "tables-mobile"
        tables = soup.find_all('div', class_="tables-mobile")
    
        res = []
        
        # Extract and print the content of each table
        for i, table in enumerate(tables, start=1):
            res.append([])
            #print(f'Table {i}:')
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all(['th', 'td'])
                cols = [col.get_text().strip() for col in cols]
                res[-1].append(cols)
                #print('\t'.join(cols))
            #print('\n' + '-'*50 + '\n')

        if len(res) < 2:
            continue

        prompt = """You are a cybersecurity expert specializing in cyber threat intelligence. Given the text below, please generate a maximum of 5 multiple-choice questions with four possible options each.

        Follow these requirements:
        
        1. Question Format: Each question must have four options. The options should be challenging and require careful consideration. Avoid creating options that could be interpreted as correct under different circumstances.
        
        2. Target Audience: The questions should be suitable for security professionals with three to five years of experience in cyber threat intelligence. Avoid generic questions such as "What is the objective?", "Which operating system can be targeted?".
        
        3. Content Coverage: Aim to cover various sections of the document to ensure a comprehensive evaluation of the candidate's knowledge. Include context-specific questions that require an understanding of the document's content.
        
        4. Technical Precision: Use precise terminology and concepts relevant to cyber threat intelligence. Incorporate situational or scenario-based questions where applicable.
        
        5. Include Technique IDs and Names: Ensure that all questions, where applicable, mention both the ID and the full name of the MITRE ATT&CK pattern technique.
        
        6. Premise Inclusion: Each question should include a premise indicating it pertains to MITRE ATT&CK, specifying the relevant platform (Enterprise, ICS, or Mobile) where necessary.
        
        7. Output Format: Return the output in TSV format (must be tab-separated) with the following columns:
        Question, Option A, Option B, Option C, Option D and Correct Answer (A, B, C, D).
        
        Important: Only return the TSV (tab separator \t) content as specified. Do not include any additional text or commentary outside the TSV format.
        
        Text:
        
        """ + paragraphe + str(res)
        
        response = model.generate_content(prompt)

        try:
            with open('cti_mcq_generated.tsv', 'a') as file:
                for line in response.text.strip().split('\n')[1:]:
                    options=line.split('\t')
                    prompt = """You are a cybersecurity expert specializing in cyber threat intelligence. You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.

Question:
"""+ options[0] +"""

 Options:
 A) """+ options[1] +"""
 B) """+ options[2] +"""
 C) """+ options[3] +"""
 D) """+ options[4] +"""

Important: The last line of your answer should contain only the single letter corresponding to the best option, with no additional text."""
                    file.write(url+href_text + '\t' + line + '\t' + prompt.replace('\n',' ').replace('\t',' ') + '\n')
        except:
            continue
    
    else:
        print(f'Failed to fetch webpage. Status code: {response.status_code}', href_text)

    

Webpage fetched successfully /techniques/T1595
Webpage fetched successfully /techniques/T1595/001
Webpage fetched successfully /techniques/T1595/002
Webpage fetched successfully /techniques/T1595/003
Webpage fetched successfully /techniques/T1592
Webpage fetched successfully /techniques/T1592/001
Webpage fetched successfully /techniques/T1592/002
Webpage fetched successfully /techniques/T1592/003
Webpage fetched successfully /techniques/T1592/004
Webpage fetched successfully /techniques/T1589
Webpage fetched successfully /techniques/T1589/001
Webpage fetched successfully /techniques/T1589/002
Webpage fetched successfully /techniques/T1589/003
Webpage fetched successfully /techniques/T1590
Webpage fetched successfully /techniques/T1590/001
Webpage fetched successfully /techniques/T1590/002
Webpage fetched successfully /techniques/T1590/003
Webpage fetched successfully /techniques/T1590/004
Webpage fetched successfully /techniques/T1590/005
Webpage fetched successfully /techniques/T1590/

ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

# CTI-RCM

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import random


nb = 0
try:
    with open('cti_rcm_generated.tsv', 'a') as file:
        file.write('URL'+'\t'+'Description'+'\t'+'Prompt'+'\t'+'GT'+'\n')
        while nb < 1500:
            year = random.choice([2023, 2024])
            sub_url = random.randint(1000, 500000)
            # URL of the website
            url = 'https://nvd.nist.gov/vuln/detail/CVE-'+str(year)+'-'+str(sub_url)+'/'
            
            # Fetch the webpage
            response = requests.get(url)
            
            # Check if the request was successful
            if response.status_code == 200:
                try:
                    webpage_content = response.text
                    print('Webpage fetched successfully', year, sub_url)
                
                    # Parse the HTML content using BeautifulSoup
                    soup = BeautifulSoup(webpage_content, 'html.parser')
            
                    vuln_description = soup.find('p', {'data-testid': 'vuln-description'})
            
                    vuln_cwes_link = soup.find('td', {'data-testid': 'vuln-CWEs-link-0'})
            
                    if vuln_description and vuln_cwes_link:
                        description_text = vuln_description.get_text()
                        cwes_text = vuln_cwes_link.get_text()
                        if "-noinfo" not in cwes_text and 'CWE-' in cwes_text:
                            #print(description_text, cwes_text)
                            prompt = "Analyze the following CVE description and map it to the appropriate CWE. Provide a brief justification for your choice. Ensure the last line of your response contains only the CWE ID.  CVE Description:" + description_text
                            file.write(url.replace('\n', '') + '\t' + description_text.replace('\n', '') + '\t' + prompt.replace('\n', '') + '\t' + cwes_text.replace('\n', '') + '\n')
                            nb += 1
                            print(url.replace('\n', '') + '\t' + description_text.replace('\n', '') + '\t' + prompt.replace('\n', '') + '\t' + cwes_text.replace('\n', '') + str(nb) + '\n')
                except:
                    continue
                    
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Webpage fetched successfully 2024 169041
Webpage fetched successfully 2024 38198
Webpage fetched successfully 2024 344575
Webpage fetched successfully 2024 41769
Webpage fetched successfully 2023 285099
Webpage fetched successfully 2024 270165
Webpage fetched successfully 2023 422106
Webpage fetched successfully 2023 203901
Webpage fetched successfully 2023 442166
Webpage fetched successfully 2024 167604
Webpage fetched successfully 2023 243302
Webpage fetched successfully 2024 138646
Webpage fetched successfully 2024 294792
Webpage fetched successfully 2024 325268
Webpage fetched successfully 2024 406209
Webpage fetched successfully 2024 195460
Webpage fetched successfully 2023 473165
Webpage fetched successfully 2023 250616
Webpage fetched successfully 2023 129663
Webpage fetched successfully 2023 320592
Webpage fetched successfully 2024 196263
Webpage fetched successfully 2024 132201
Webpage fetched successfully 2024 320369
Webpage fetched successfully 2024 191529
Webpage fetched su

# CTI-VSP

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup


links = pd.read_csv("cti_rcm_generated.tsv", sep="\t", encoding='latin1')['URL']

with open('cti_vsp_generated.tsv', 'a') as file:
        file.write('URL'+'\t'+'Description'+'\t'+'Prompt'+'\t'+'GT'+'\n')
        for link in links:
            # Fetch the webpage
            response = requests.get(link)
            
            # Check if the request was successful
            if response.status_code == 200:
                webpage_content = response.text
                print('Webpage fetched successfully', link)
            
                # Parse the HTML content using BeautifulSoup
                soup = BeautifulSoup(webpage_content, 'html.parser')
        
                vuln_description = soup.find('p', {'data-testid': 'vuln-description'})
        
                gt = soup.find('span', {'data-testid': 'vuln-cvss3-nist-vector'})
                
                if gt is None:
                    gt = soup.find('span', {'data-testid': 'vuln-cvss3-cna-vector'})
        
                if vuln_description and gt:
                    description_text = vuln_description.get_text()
                    gt_text = gt.get_text()
                    #print(description_text, cwes_text)
                    prompt = "Analyze the following CVE description and calculate the CVSS v3.1 Base Score. Determine the values for each base metric: AV, AC, PR, UI, S, C, I, and A. Summarize each metric's value and provide the final CVSS v3.1 vector string.   Valid options for each metric are as follows: - **Attack Vector (AV)**: Network (N), Adjacent (A), Local (L), Physical (P) - **Attack Complexity (AC)**: Low (L), High (H) - **Privileges Required (PR)**: None (N), Low (L), High (H) - **User Interaction (UI)**: None (N), Required (R) - **Scope (S)**: Unchanged (U), Changed (C) - **Confidentiality (C)**: None (N), Low (L), High (H) - **Integrity (I)**: None (N), Low (L), High (H) - **Availability (A)**: None (N), Low (L), High (H)  Summarize each metric's value and provide the final CVSS v3.1 vector string. Ensure the final line of your response contains only the CVSS v3 Vector String in the following format:  Example format: CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H  CVE Description: " + description_text
                    file.write(link + '\t' + description_text.replace('\n', '') + '\t' + prompt.replace('\n', '') + '\t' + gt_text.replace('\n', '') + '\n')
                    #print(link + '\t' + description_text.replace('\n', '') + '\t' + prompt.replace('\n', '') + '\t' + gt_text.replace('\n', '') + '\n')
            else:
                print(link, response.status_code)
                
            
    
        

Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-3027/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2024-32462/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2024-39821/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-7133/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-39544/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-38905/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-42299/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-26032/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2024-26173/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-2669/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-6560/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-2023-3938/
Webpage fetched successfully https://nvd.nist.gov/vuln/detail/CVE-202

UnicodeEncodeError: 'charmap' codec can't encode characters in position 116-119: character maps to <undefined>