# In [None]:
import os
import json
import random
import re

# Input and output paths (both are relative, so they resolve against the current working directory)
output_file = "masked_descriptions.json"  # JSON file the masked descriptions are written to
cve_folder = "cve_list"  # Folder containing JSON files with CVE data

# Function to mask part of the description based on sentences or commas
def mask_description(description):
    """Return *description* with one part removed ("masked"), or None.

    The text is split at each whitespace character that directly follows a
    period. What gets dropped depends on how many pieces that yields:

    * one piece containing a comma: the text is cut at the first comma and
      either the part before it or the part after it is returned;
    * one piece without a comma: nothing can be masked, None is returned;
    * two or more pieces: either the first sentence or one randomly chosen
      later sentence is dropped, and the remaining sentences are returned
      joined by single spaces.

    Choices are made with the module-level ``random`` generator, so results
    are only reproducible if the caller seeds it.

    Args:
        description: The full description text.

    Returns:
        The remaining text with surrounding whitespace stripped, or None
        when the description has neither a sentence boundary nor a comma.
    """
    # The look-behind keeps the period attached to the sentence it ends.
    sentences = re.split(r"(?<=\.)\s", description)

    if len(sentences) == 1:
        # No sentence boundary (no ". "): fall back to the first comma.
        if "," not in description:
            return None
        before, after = description.split(",", 1)
        kept = after if random.choice([True, False]) else before
        # Strip so the kept half does not carry the space next to the comma.
        return kept.strip()

    # Pick the sentence to mask: the first one, or one of the later ones.
    if random.choice([True, False]):
        masked_index = 0
    else:
        masked_index = random.randrange(1, len(sentences))

    # Previously the "later sentence" case returned every sentence (and glued
    # the first two together without a space), so nothing was masked at all.
    remaining = sentences[:masked_index] + sentences[masked_index + 1:]
    return " ".join(remaining).strip()

# Collect every usable description from the JSON files in cve_folder
all_descriptions = []

# os.listdir returns names in arbitrary order; sorting keeps the sequence of
# descriptions (and therefore the random choices made for them) stable when
# the random module has been seeded.
for filename in sorted(os.listdir(cve_folder)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(cve_folder, filename)
    with open(file_path, "r", encoding="utf-8") as cve_file:
        cve_data = json.load(cve_file)
    # Each file is expected to hold a list of dict entries -- TODO confirm schema
    for entry in cve_data:
        # "or ''" also covers entries whose "Description" is present but null,
        # which would otherwise raise AttributeError on .strip()
        description = (entry.get("Description") or "").strip()
        if len(description) >= 5:  # Skip descriptions shorter than 5 characters
            all_descriptions.append(description)

# Apply masking; a set drops duplicate results, and None means the
# description could not be masked
unique_masked = set()
for description in all_descriptions:
    masked_desc = mask_description(description)
    if masked_desc is not None:
        unique_masked.add(masked_desc)

# JSON needs a list; sorting gives a stable order, since set iteration order
# can differ between runs
masked_descriptions = sorted(unique_masked)

# Print the number of unique masked descriptions generated
print(f"Total unique masked descriptions generated: {len(masked_descriptions)}")

# Print the first 10 masked descriptions for inspection
print("First 10 unique masked descriptions:")
for masked_desc in masked_descriptions[:10]:
    print(f"Masked: {masked_desc}")
    print("-" * 40)

# Save to output file
with open(output_file, "w", encoding="utf-8") as out_file:
    json.dump(masked_descriptions, out_file, indent=2)

print(f"Processed and saved {len(masked_descriptions)} unique masked descriptions to {output_file}")
