In [5]:
import csv
import json

# Function to parse CSV and convert it to JSON-like structure
def parse_csv_to_json(csv_file_path, json_file_path):
    nodes = []
    
    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            print(row)
            node = {
                "id": f"CAPEC-{row['ID']}",
                "type": "CAPEC",
                "features": {
                    "description": row["Description"],
                }
            }
            nodes.append(node)
    
    # Write the nodes list to a JSON file
    with open(json_file_path, mode='w', encoding='utf-8') as json_file:
        json.dump({"nodes": nodes}, json_file, indent=4)

# Helper functions to parse specific fields
def parse_related_attack_patterns(patterns_str):
    if not patterns_str:
        return []
    patterns = patterns_str.split("::")
    result = []
    for pattern in patterns:
        if "CAPEC ID" in pattern:
            parts = pattern.split(":")
            result.append({"nature": parts[1], "capec_id": parts[-1]})
    return result

def parse_execution_flow(flow_str):
    if not flow_str:
        return []
    steps = flow_str.split("::STEP:")
    result = []
    for step in steps[1:]:
        step_parts = step.split(":")
        step_number = step_parts[0]
        phase = extract_between(step, "PHASE:", ":")
        description = extract_between(step, "DESCRIPTION:", ":TECHNIQUE:")
        techniques = step.split(":TECHNIQUE:")[1:] if ":TECHNIQUE:" in step else []
        techniques = [t.split(":")[0] for t in techniques]
        result.append({"step": int(step_number), "phase": phase, "description": description, "techniques": techniques})
    return result

def parse_skills_required(skills_str):
    if not skills_str:
        return []
    skills = skills_str.split("::")
    result = []
    for skill in skills:
        if "SKILL:" in skill:
            skill_level = extract_between(skill, "LEVEL:", "::")
            result.append({"skill": skill.split(":")[1], "level": skill_level})
    return result

def parse_consequences(consequences_str):
    if not consequences_str:
        return []
    consequences = consequences_str.split("::")
    result = []
    for consequence in consequences:
        if "SCOPE:" in consequence:
            parts = consequence.split(":")
            result.append({"scope": parts[1], "impact": parts[-1]})
    return result

def parse_taxonomy_mappings(taxonomy_str):
    if not taxonomy_str:
        return []
    mappings = taxonomy_str.split("::")
    result = []
    for mapping in mappings:
        if "TAXONOMY NAME:" in mapping:
            parts = mapping.split(":")
            result.append({"taxonomy_name": parts[1], "entry_id": parts[3], "entry_name": parts[5]})
    return result

def extract_between(text, start, end):
    if start in text and end in text:
        return text.split(start)[1].split(end)[0]
    return ""

# Example usage
if __name__ == "__main__":
    parse_csv_to_json("CWE_to_technique.csv", "output.json")


{'ID': '1', 'Name': 'Accessing Functionality Not Properly Constrained by ACLs', 'Abstraction': 'Standard', 'Status': 'Draft', 'Description': "In applications, particularly web applications, access to functionality is mitigated by an authorization framework. This framework maps Access Control Lists (ACLs) to elements of the application's functionality; particularly URL's for web apps. In the case that the administrator failed to specify an ACL for a particular element, an attacker may be able to access it with impunity. An attacker with the ability to access functionality not properly constrained by ACLs can obtain sensitive information and possibly compromise the entire application. Such an attacker can access resources that must be available only to users at a higher privilege level, can access management sections of the application, or can run queries for data that they otherwise not supposed to.", 'Alternate Terms': '', 'Likelihood Of Attack': 'High', 'Typical Severity': 'High', 'Re

In [2]:
import json
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to create BERT embedding for a given text
def create_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the embeddings from the last hidden state
    last_hidden_state = outputs.last_hidden_state
    # Average the token embeddings to get a single embedding for the text
    embedding = torch.mean(last_hidden_state, dim=1).squeeze()
    return embedding.tolist()

# Load the JSON file
with open('capec_descr.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Iterate over the nodes and replace the description with its BERT embedding
for node in data['nodes']:
    description = node['features']['description']
    embedding = create_bert_embedding(description)
    node['features']['description_embedding'] = embedding
    # Remove the original description if needed
    del node['features']['description']

# Write the updated JSON to a file
with open('output_with_embeddings.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)