In [1]:
# Import libraries
import pandas as pd
import random
from datetime import datetime, timedelta

In [2]:

# OWASP WSTG categories: category name -> reference code
# (updated with the categorisation specified for this project).
OWASP_WSTG_CATEGORIES = dict(
    InformationGathering="WSTG-IGV-01",
    ConfigurationDeploymentManagement="WSTG-CONF-02",
    IdentityManagement="WSTG-IDM-03",
    AuthenticationTesting="WSTG-ATHN-04",
    AuthorizationTesting="WSTG-ATHZ-05",
    SessionManagementTesting="WSTG-SESS-06",
    DataValidationTesting="WSTG-DV-07",
    ErrorHandling="WSTG-ERRH-08",
    CryptographyTesting="WSTG-CRYP-09",
    BusinessLogicTesting="WSTG-BUSL-10",
    ClientSideTesting="WSTG-CLNT-11",
    APITesting="WSTG-API-12",
)


In [4]:
# MITRE ATT&CK techniques: technique ID -> technique name.
# Only a subset selected for relevance is listed here, not the full matrix.
MITRE_TECHNIQUES = dict([
    # Targets OWASP Injection, XSS, and RCE vulnerabilities
    ("T1190", "Exploit Public-Facing Application"),
    # Covers credential-based attacks like brute force and credential stuffing
    ("T1078", "Valid Accounts"),
    # Exploits misconfigured VPN, RDP, or SSH, aligning with OWASP Config Testing
    ("T1133", "External Remote Services"),
    # Tests script execution vulnerabilities (JS, Python, Bash)
    ("T1059", "Command and Scripting Interpreter"),
    # Focuses on enumerating user accounts and roles, relevant for Identity Testing
    ("T1087", "Account Discovery"),
    # Checks password strength, rate-limiting, and credential stuffing attacks
    ("T1110", "Brute Force"),
    # Simulates attacker reconnaissance to identify open ports, services, and vulnerabilities
    ("T1595", "Active Scanning"),
    # Targets dependencies, 3rd-party software, and package vulnerabilities
    ("T1195", "Supply Chain Compromise"),
    # Examines drive-by downloads, malicious script execution
    ("T1203", "Exploitation for Client Execution"),
    # Tests social engineering scenarios related to email-based attacks
    ("T1566", "Phishing"),
])

In [None]:
# Compliance frameworks a test case can be tagged with
# (random selection for Vantage Point Security relevance).
COMPLIANCE_FRAMEWORKS = [
    # Recognized industry standard for security testing.
    "CREST Penetration Testing",
    # A key framework for security and risk management.
    "NIST 800-53",
    # Critical for financial institutions in Singapore.
    "MAS TRM (Technology Risk Management)",
]



In [None]:
# CVSS entry per severity label, formatted "<base score> - <CVSS 3.1 vector>".
#
# How to read a vector, using the Critical entry as the example:
#   CVSS:3.1 = CVSS version / AV:N = Attack Vector: Network /
#   AC:L = Attack Complexity: Low / PR:N = Privileges Required: None /
#   UI:N = User Interaction: None / S:U = Scope: Unchanged /
#   C:H = Confidentiality Impact: High / I:H = Integrity Impact: High /
#   A:H = Availability Impact: High
#
# NOTE(review): the score prefixes for High, Medium and Low do not appear to be
# the base scores of the vectors they are paired with (e.g. the High vector
# seems to evaluate to 8.8 in the CVSS 3.1 calculator, not 7.5) - confirm
# whether score and vector are meant to agree before relying on both.
CVSS_SCORES = dict(
    Critical="9.8 - CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
    High="7.5 - CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:H/A:H",
    Medium="5.4 - CVSS:3.1/AV:N/AC:H/PR:N/UI:R/S:U/C:L/I:L/A:L",
    Low="3.2 - CVSS:3.1/AV:L/AC:H/PR:N/UI:N/S:U/C:L/I:N/A:N",
)

In [8]:
# Main Function to Generate Test Cases
def generate_test_cases(num_cases=100,
                        project_ids=None,
                        user_ids=None,
                        seed=None,
                        reference_date=None):
    """
    Generate synthetic application security test cases as a DataFrame.

    Every row gets a sequential TestID plus randomly drawn values: a project,
    a tester and a requesting user from the supplied pools, an OWASP WSTG
    category, a MITRE ATT&CK technique ID, a compliance framework, a
    prerequisite, a severity with its CVSS string, two status fields and a
    CreatedOn/ModifiedOn pair.

    Parameters
    ----------
    num_cases : int
        Number of rows to generate. Values below 1 give an empty DataFrame.
    project_ids : iterable, optional
        Pool of ProjectIDs to draw from. When None or empty, the dummy IDs
        PRJ-001 .. PRJ-010 are used. Any iterable is accepted (list, tuple,
        pandas Series, generator).
    user_ids : iterable, optional
        Pool of UserIDs to draw from. When None or empty, the dummy IDs
        USER-001 .. USER-020 are used.
    seed : int, optional
        When given, Python's global ``random`` generator is reseeded with it
        before any value is drawn, making the random draws repeatable.
    reference_date : datetime, optional
        The moment treated as "now" for the timeline. CreatedOn falls 0-30
        days before it and ModifiedOn 0-10 days after CreatedOn. Defaults to
        ``datetime.now()``; pass a fixed value together with ``seed`` to get
        identical output regardless of the day the code is run.

    Returns
    -------
    pandas.DataFrame
        One row per test case with the columns TestID, ProjectID, TesterID,
        UserID, DaysAllocated, OWASPCategory, OWASPReference,
        MITRETechniqueID, ComplianceFramework, TestPrerequisites,
        FindingSeverity, CVSSScore, RemediationStatus, Status, CreatedOn and
        ModifiedOn (dates formatted as YYYY-MM-DD strings).
    """
    if seed is not None:
        random.seed(seed)

    # Materialise the pools first: truth-testing a pandas Series/array raises,
    # and a generator would be exhausted after a single pass.
    project_pool = list(project_ids) if project_ids is not None else []
    if not project_pool:
        project_pool = [f"PRJ-{i:03}" for i in range(1, 11)]  # 10 dummy projects

    user_pool = list(user_ids) if user_ids is not None else []
    if not user_pool:
        user_pool = [f"USER-{i:03}" for i in range(1, 21)]    # 20 dummy users

    # Loop-invariant choice pools, built once instead of once per row.
    owasp_categories = list(OWASP_WSTG_CATEGORIES.keys())
    mitre_ids = list(MITRE_TECHNIQUES.keys())
    severity_labels = list(CVSS_SCORES.keys())
    prerequisites = ["ValidUserCredentials", "VPNAccess", "AdminPrivilegesRequired"]
    remediation_states = ["Open", "InProgress", "VerifiedFixed"]
    workflow_states = ["Pending", "InProgress", "Completed"]

    # A single "now" for the whole batch keeps every row on the same timeline
    # (and makes the dates reproducible when reference_date is supplied).
    now = reference_date if reference_date is not None else datetime.now()

    test_cases = []
    for i in range(1, num_cases + 1):
        # NOTE: the order of the random draws below is kept stable so a given
        # seed keeps producing the same non-date columns.
        project_id = random.choice(project_pool)
        tester_id = random.choice(user_pool)
        user_id = random.choice(user_pool)

        days_allocated = random.randint(1, 30)

        owasp_category = random.choice(owasp_categories)
        mitre_id = random.choice(mitre_ids)
        compliance = random.choice(COMPLIANCE_FRAMEWORKS)
        test_prereq = random.choice(prerequisites)
        severity_label = random.choice(severity_labels)
        remediation_status = random.choice(remediation_states)
        status = random.choice(workflow_states)

        # Timeline: created up to 30 days in the past, modified up to 10 days later.
        created_on_dt = now - timedelta(days=random.randint(0, 30))
        modified_on_dt = created_on_dt + timedelta(days=random.randint(0, 10))

        test_cases.append({
            'TestID':              f"APPSEC-{i:03}",
            'ProjectID':           project_id,
            'TesterID':            tester_id,   # The security engineer or QA person
            'UserID':              user_id,     # Possibly the user who requested the test
            'DaysAllocated':       days_allocated,
            'OWASPCategory':       owasp_category,
            'OWASPReference':      OWASP_WSTG_CATEGORIES[owasp_category],
            'MITRETechniqueID':    mitre_id,
            'ComplianceFramework': compliance,
            'TestPrerequisites':   test_prereq,
            'FindingSeverity':     severity_label,
            'CVSSScore':           CVSS_SCORES[severity_label],
            'RemediationStatus':   remediation_status,
            'Status':              status,
            'CreatedOn':           created_on_dt.strftime('%Y-%m-%d'),
            'ModifiedOn':          modified_on_dt.strftime('%Y-%m-%d')
        })

    return pd.DataFrame(test_cases)

In [9]:
def _parse_created_on(value):
    """Return a CreatedOn cell as a datetime at midnight, or None if it is not a YYYY-MM-DD date."""
    try:
        return datetime.strptime(str(value)[:10], '%Y-%m-%d')
    except ValueError:
        return None


def generate_test_execution_logs(test_cases_df, num_logs=200, seed=None, reference_date=None):
    """
    Generate synthetic test execution logs that reference existing test cases.

    Each log picks one row of ``test_cases_df`` at random and copies its
    TestID and ProjectID, then draws an execution window, an outcome and the
    defect / escalation / notification fields derived from that outcome.

    Parameters
    ----------
    test_cases_df : pandas.DataFrame
        Must contain the columns 'TestID' and 'ProjectID'. When a 'CreatedOn'
        column with YYYY-MM-DD values is present, each execution is kept on or
        after the CreatedOn date of the test it references.
    num_logs : int
        Number of log rows to generate. Values below 1 give an empty DataFrame.
    seed : int, optional
        When given, Python's global ``random`` generator is reseeded with it
        before any value is drawn.
    reference_date : datetime, optional
        The moment treated as "now"; executions start at most 20 days and 23
        hours before it. Defaults to ``datetime.now()``. Pass a fixed naive
        datetime together with ``seed`` for fully reproducible output.

    Returns
    -------
    pandas.DataFrame
        One row per log with the columns LogID, TestID, ProjectID,
        ExecutionStart, ExecutionEnd, Outcome, DefectsFound,
        CriticalDefectFlag, InitialFindingsSubmitted, EscalationRequired and
        NotificationTimeMins. Timestamps are 'YYYY-MM-DD HH:MM:SS' strings;
        InitialFindingsSubmitted is an empty string for passing runs.

    Raises
    ------
    KeyError
        If 'TestID' or 'ProjectID' is missing from ``test_cases_df``.
    ValueError
        If logs are requested but ``test_cases_df`` has no rows.
    """
    if seed is not None:
        random.seed(seed)

    all_test_ids = test_cases_df['TestID'].tolist()
    all_project_ids = test_cases_df['ProjectID'].tolist()

    # Without this guard an empty frame surfaces as an opaque "empty range"
    # error from random.randint inside the loop.
    if num_logs > 0 and not all_test_ids:
        raise ValueError("test_cases_df contains no test cases to generate logs for")

    # CreatedOn is optional; rows whose value cannot be parsed get None and
    # are simply not constrained.
    if 'CreatedOn' in test_cases_df.columns:
        all_created_on = [_parse_created_on(v) for v in test_cases_df['CreatedOn'].tolist()]
    else:
        all_created_on = [None] * len(all_test_ids)

    now = reference_date if reference_date is not None else datetime.now()

    logs = []
    for i in range(1, num_logs + 1):
        log_id = f"LOG-{i:03}"
        test_id_idx = random.randint(0, len(all_test_ids) - 1)
        test_id = all_test_ids[test_id_idx]
        proj_id = all_project_ids[test_id_idx]
        created_on_dt = all_created_on[test_id_idx]

        # Random date logic: execution takes place on or after the test's
        # CreatedOn. Look back at most 20 days, and never further than the
        # age of the test itself.
        max_days_back = 20
        if created_on_dt is not None:
            max_days_back = min(20, max(0, (now - created_on_dt).days))
        exec_start_dt = now - timedelta(days=random.randint(0, max_days_back),
                                        hours=random.randint(0, 23))
        # The hour offset can still land before CreatedOn; pin those to it.
        if created_on_dt is not None and exec_start_dt < created_on_dt:
            exec_start_dt = created_on_dt
        exec_end_dt = exec_start_dt + timedelta(minutes=random.randint(30, 180))  # 0.5 - 3 hours

        outcome = random.choice(["Pass", "Fail"])
        defects_found = random.randint(0, 5) if outcome == "Fail" else 0
        # Only runs with defects can be critical (30% chance).
        critical_defect_flag = "Yes" if defects_found > 0 and random.random() < 0.3 else "No"

        # If a critical defect is found, escalate half of the time.
        escalate = "Yes" if critical_defect_flag == "Yes" and random.random() < 0.5 else "No"

        # Time to notify stakeholders (random between 10 - 120 min if critical)
        notif_time = random.randint(10, 120) if critical_defect_flag == "Yes" else 0

        # Failing runs get an initial findings submission 1-8 hours after the run ends.
        initial_findings_dt = exec_end_dt + timedelta(hours=random.randint(1, 8))
        initial_findings_submitted = (
            initial_findings_dt.strftime('%Y-%m-%d %H:%M:%S')
            if outcome == "Fail" else ""
        )

        logs.append({
            "LogID":                 log_id,
            "TestID":                test_id,
            "ProjectID":             proj_id,
            "ExecutionStart":        exec_start_dt.strftime('%Y-%m-%d %H:%M:%S'),
            "ExecutionEnd":          exec_end_dt.strftime('%Y-%m-%d %H:%M:%S'),
            "Outcome":               outcome,
            "DefectsFound":          defects_found,
            "CriticalDefectFlag":    critical_defect_flag,
            "InitialFindingsSubmitted": initial_findings_submitted,
            "EscalationRequired":    escalate,
            "NotificationTimeMins":  notif_time
        })

    return pd.DataFrame(logs)

In [None]:
if __name__ == "__main__":
    # Prefer the real ProjectIDs exported by the CRM generation script; any
    # problem with that file falls back to the generator's dummy ProjectIDs.
    try:
        crm_projects = pd.read_csv("projects_data.csv")  # from your CRM generation script
        # dropna(): blank ProjectID cells would otherwise be drawn as NaN projects.
        existing_project_ids = crm_projects["ProjectID"].dropna().unique().tolist()
    except FileNotFoundError:
        print("No CRM project file found; falling back to dummy ProjectIDs.")
        existing_project_ids = None
    except (KeyError, pd.errors.EmptyDataError):
        # File exists but is empty or has no 'ProjectID' column.
        print("CRM project file has no usable 'ProjectID' column; falling back to dummy ProjectIDs.")
        existing_project_ids = None

    # Generate test cases, referencing real project IDs if available
    test_cases_df = generate_test_cases(
        num_cases=100,
        project_ids=existing_project_ids,  # None or empty -> dummy ones
        user_ids=None,                     # or your custom user pool
        seed=42
    )
    print("\n--- Sample Test Cases ---")
    print(test_cases_df.head())

    # Save to CSV
    test_cases_df.to_csv("updated_security_test_cases.csv", index=False)
    print("\nSaved 'updated_security_test_cases.csv'.")


--- Sample Test Cases ---
       TestID ProjectID  TesterID    UserID  DaysAllocated  \
0  APPSEC-001   PRJ-082  USER-004  USER-001             24   
1  APPSEC-002   PRJ-012  USER-019  USER-014              2   
2  APPSEC-003   PRJ-090  USER-018  USER-014              8   
3  APPSEC-004   PRJ-020  USER-007  USER-011              4   
4  APPSEC-005   PRJ-094  USER-015  USER-018              4   

                       OWASPCategory OWASPReference MITRETechniqueID  \
0               AuthorizationTesting   WSTG-ATHZ-05            T1059   
1               InformationGathering    WSTG-IGV-01            T1078   
2                      ErrorHandling   WSTG-ERRH-08            T1566   
3  ConfigurationDeploymentManagement   WSTG-CONF-02            T1595   
4              DataValidationTesting     WSTG-DV-07            T1078   

                    ComplianceFramework     TestPrerequisites FindingSeverity  \
0             CREST Penetration Testing  ValidUserCredentials        Critical   
1    

In [None]:
# Use of a matching function (match_logs_to_tests), e.g., to match and link the testing dataset to the logs

# A new function match_logs_to_tests() will be added to process the log data and match each log entry to its corresponding test case based on keywords in the log event and test case categories.
# Matching Logic: The function looks for matches between log events (like "login", "SQL injection") and test case categories (like "Authentication Testing" or MITRE attack techniques like "T1190")

# Further additions will be required after liaising with Team 2
# Testing requirements: 