# CASTLE Benchmark

## Preparations

In [None]:
import os
import json
from math import pi
from collections import defaultdict

import yaml
import adjustText
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib_venn import venn3, venn3_circles
import squarify
import scipy.stats as stats
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

In [None]:
## Settings

# Input locations: the dataset file and the directory holding one
# '<tool><report_suffix>' JSON report per evaluated tool.
dataset_file = 'dataset.json'
reports_directory = 'reports'
report_suffix = '-report.json'
report_version = '1.2' # the charts will only load the reports with this version!
# Manually entered reference entry (not read from a report file).
# NOTE(review): presumably the result of a human developer study - confirm.
human_results = {
    'name': 'Developers',
    'type': 'human',
    'tp': 11,
    'fp': 10,
}
# Points awarded per outcome; castle_combination() multiplies the outcome
# counts by these weights.
castle_scoring = {
    'tp': 5,
    'tn': 2,
    'fp': -1,
    'fn': 0,
}
# Passed to castle() as `toplist_bonus`: it is both the largest bonus and the
# bucket size (the bonus drops by one for every `castle_toplist_bonus` ranks).
castle_toplist_bonus = 5
# Severities kept by filter_findings(); the comparison is done on the
# lower-cased severity string of each finding.
relevant_findings = [
    '', 'none', # systems that do not include severity we treat them as equal
    'warning', 'error', # systems using error type
    'medium', 'high', 'critical', # systems using severity type
    'portability' # portability-related findings often cause unexpected behavior
]
# Legend colors/labels per tool category (see also tool_color_map below,
# which uses a different category set and a different LLM color).
color_map = {
    'sast': { 'color': '#648fff', 'label': 'Static Analyzer' },
    'llm': { 'color': '#ffb347', 'label': 'LLM' },
    'fv': { 'color': '#5bb9a9', 'label': 'Formal Verification' },
}
# https://www.color-hex.com/color-palette/1046567

# NOTE(review): not referenced in the visible part of the notebook.
test_per_cwe = 10
# 2024 (https://cwe.mitre.org/top25/archive/2024/2024_cwe_top25.html)
# Order matters: the list index is used as the rank when computing bonuses.
top_25_cwes = [ 79, 787, 89, 352, 22, 125, 78, 416, 862, 434, 94, 20, 77, 287, 269, 502, 200, 863, 918, 119, 476, 798, 190, 400, 306 ]

# Display names
# Maps a report name (the report file name without `report_suffix`) to the
# label printed in charts and LaTeX tables.
display_names = {
    # Tools
    'aikido': 'Aikido',
    'clang-analyzer': 'Clang Analyzer',
    'codeql': 'CodeQL',
    'codethreat': 'CodeThreat',
    'cppcheck': 'Cppcheck',
    'gcc-fanalyzer': 'GCC Fanalyzer',
    'gitlab-sast': 'GitLab SAST',
    'jit': 'Jit',
    'semgrep-code': 'Semgrep Code',
    'snyk': 'Snyk',
    'sonarqube': 'SonarQube',
    'splint': 'Splint',
    'coverity': 'Coverity',
    
    # BMCs
    'esbmc': 'ESBMC',
    'cbmc': 'CBMC',
    
    # LLMs
    'falcon3-7b': 'Falcon 3 (7B)',
    'gemma-2-9b': 'Gemma 2 (9B)',
    'gpt-4o-mini': 'GPT-4o Mini',
    'gpt-4o': 'GPT-4o',
    'o1': 'GPT-o1',
    'o3-mini': 'GPT-o3 Mini',
    'qwen-2.5-coder-32b-instruct': 'QWEN 2.5CI (32B)',
    'deepseek-reasoner': 'DeepSeek R1',
    'llama-3.1-8b': 'LLAMA 3.1 (8B)',
    'mistral-7b-ins': 'Mistral Ins. (7B)',
}
# Bar/marker color per tool category. The keys are the category values used
# in tool_type_map, plus 'combination'.
tool_color_map = {
    'sast': '#648fff',
    'gca': '#18795b',
    'fv': '#5bb9a9',
    'llm': '#ffcc00',
    'combination': '#ffb347'
}

# Category of every report name; the values are keys of tool_color_map
# ('sast', 'gca', 'fv', 'llm'). The scores cell looks every loaded report up
# here, so a report whose name is missing raises a KeyError.
tool_type_map = {
    # Tools
    'aikido': 'sast',
    'clang-analyzer': 'gca',
    'codeql': 'sast',
    'codethreat': 'sast',
    'cppcheck': 'gca',
    'gcc-fanalyzer': 'gca',
    'gitlab-sast': 'sast',
    'jit': 'sast',
    'semgrep-code': 'sast',
    'snyk': 'sast',
    'sonarqube': 'sast',
    'splint': 'sast',
    'coverity': 'sast',
    
    # BMCs
    'esbmc': 'fv',
    'cbmc': 'fv',
    
    # LLMs
    'falcon3-7b': 'llm',
    'gemma-2-9b': 'llm',
    'gpt-4o-mini': 'llm',
    'gpt-4o': 'llm',
    'o1': 'llm',
    'o3-mini': 'llm',
    'qwen-2.5-coder-32b-instruct': 'llm',
    'deepseek-reasoner': 'llm',
    'llama-3.1-8b': 'llm',
    'mistral-7b-ins': 'llm',
}

# Hand-maintained version string per report name, printed in the LaTeX
# toplist; '-' is used where no version is given.
manual_version_map = {
    # Tools
    'aikido': '-',
    'clang-analyzer': '18.1.3',
    'codeql': '2.20.1',
    'codethreat': '-',
    'cppcheck': '2.13.0',
    'gcc-fanalyzer': '13.3.0',
    'gitlab-sast': '15.2.1',
    'jit': '-',
    'semgrep-code': '1.110.0',
    'snyk': '1.1295.4',
    'sonarqube': '25.3.0',
    'splint': '3.1.2',
    'coverity': '2024.12.1',
    
    # BMCs
    'esbmc': '7.8.1',
    'cbmc': '5.95.1',
    
    # LLMs
    'falcon3-7b': '-',
    'gemma-2-9b': '-',
    'gpt-4o-mini': '-',
    'gpt-4o': '-',
    'o1': '-',
    'o3-mini': '-',
    'qwen-2.5-coder-32b-instruct': '-',
    'deepseek-reasoner': '-',
    'llama-3.1-8b': '-',
    'mistral-7b-ins': '-',
}

## Data Loading

In [None]:
## Load data

# Load dataset
with open(dataset_file, encoding='utf-8') as f:
    dataset = json.load(f)
print(f"Loaded dataset with {len(dataset['tests'])} tests.")


# Load all reports
# os.listdir() returns entries in arbitrary order. Sorting makes the order of
# `reports` (and of every table row / chart axis derived from it) reproducible.
names = sorted(
    report.removesuffix(report_suffix)
    for report in os.listdir(reports_directory)
    if report.endswith(report_suffix)
)

reports = {}
for name in names:
    with open(os.path.join(reports_directory, f'{name}{report_suffix}'), encoding='utf-8') as f:
        report = json.load(f)
    # A report without a 'version' field is skipped like any other version
    # mismatch instead of aborting the whole load with a KeyError.
    found_version = report.get('version')
    if found_version != report_version:
        print(f"[!!!!] Skipping report {name} with version {found_version} (expected {report_version})")
        continue
    reports[name] = report
print(f"Loaded {len(reports)} reports.")


# Load CWE dictionary
# NOTE(review): FullLoader can construct more than plain data types; if the
# collection file only holds mappings/lists/scalars, yaml.safe_load is enough.
with open('cwe-collection.yaml', encoding='utf-8') as f:
    cwes = yaml.load(f, Loader=yaml.FullLoader)
print(f"Loaded {len(cwes)} CWEs.")


In [None]:
## Calculate general stats
# Number of dataset tests flagged vulnerable, and the remainder.
total_vulnerable = sum(1 for t in dataset['tests'] if t['vulnerable'])
total_not_vulnerable = len(dataset['tests']) - total_vulnerable
# Distinct CWE ids occurring in the dataset; id 0 is left out.
present_cwes = list({ t['cwe'] for t in dataset['tests'] if t['cwe'] != 0 })

total_vulnerable, total_not_vulnerable, len(cwes), len(present_cwes)

## Global Helper Functions & Resources

In [None]:
## Helpers

# Define the normalized CASTLE score
# legacy V1
#def castle_norm(tp, tn, fp, fn, w=5):
#    return ( tp - (fp/w) ) / ( tp + fp + fn )

def castle_combination(tp, tn, fp, fn, bonus):
    """Return the weighted CASTLE score for the given outcome counts.

    Every count is multiplied by its weight from ``castle_scoring`` and the
    products are added up; ``bonus`` is then added on top.
    """
    counts = {'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn}
    weighted = sum(counts[kind] * castle_scoring[kind] for kind in ('tp', 'tn', 'fp', 'fn'))
    return weighted + bonus

def castle(cwe_toplist, toplist_bonus, results):
    """Aggregate per-test outcomes into a CASTLE score.

    Args:
        cwe_toplist: Ranked list of CWE ids (best rank first). A test whose
            first expected CWE is in this list can earn a bonus.
        toplist_bonus: Largest bonus per test; the bonus shrinks by one for
            every ``toplist_bonus`` ranks (``toplist_bonus - rank // toplist_bonus``).
        results: Iterable of per-test records with ``result`` (``tp``, ``tn``,
            ``fp``, ``fn``) and ``expected['cwe']`` (non-empty list whose first
            entry is the test's own CWE id).

    Returns:
        Tuple ``(score, tps, tns, fps, fns, bonus)``.

    Raises:
        AssertionError: If a record has no expected CWE or its first CWE is
            not positive.
    """
    tps = 0
    tns = 0
    fps = 0
    fns = 0
    bonus = 0
    for res in results:
        outcome = res['result']
        tps += outcome['tp']
        tns += outcome['tn']
        fps += outcome['fp']
        fns += outcome['fn']
        assert len(res['expected']['cwe']) > 0, res
        cwe = res['expected']['cwe'][0]
        assert cwe > 0
        # The bonus rewards a true positive on THIS test. The running total
        # `tps` must not be used here: it stays positive after the first hit,
        # which would grant the bonus to every later test (including missed
        # ones) and make the score depend on the order of `results`.
        if outcome['tp'] > 0 and cwe in cwe_toplist:
            bonus += toplist_bonus - cwe_toplist.index(cwe) // toplist_bonus
    return castle_combination(tps, tns, fps, fns, bonus), tps, tns, fps, fns, bonus

In [None]:
# Reference score of a hypothetical perfect tool: a true positive on every
# vulnerable test and a true negative on every non-vulnerable test.
all_cwes = list(set([test['cwe'] for test in dataset['tests']]))

# Build one perfect outcome per actual dataset test instead of assuming that
# tests are spread evenly over the CWEs (integer division silently dropped
# tests whenever the totals were not a multiple of the CWE count).
all_perfect_tests = [
    {
        'result': { 'tp': 1, 'tn': 0, 'fp': 0, 'fn': 0 },
        'expected': { 'cwe': [test['cwe']] }
    }
    for test in dataset['tests'] if test['vulnerable']
] + [
    {
        'result': { 'tp': 0, 'tn': 1, 'fp': 0, 'fn': 0 },
        'expected': { 'cwe': [test['cwe']] }
    }
    for test in dataset['tests'] if not test['vulnerable']
]
score, tp, tn, fp, fn, bonus = castle(top_25_cwes, castle_toplist_bonus, all_perfect_tests)

print(score, tp, tn, fp, fn, bonus)
perfect_castle_score = score

## Report Parsing

In [None]:
## Parse reports

# Parse CWEs
def cwe_collection_to_dict(cwe_collection: dict[str, dict]) -> dict[int, list[int]]:
    """Build the list of accepted CWE ids for every CWE in the collection.

    Args:
        cwe_collection: Mapping of CWE id to an entry with ``children`` and
            ``parents`` lists. Each list item is a mapping whose first key is
            the related CWE id.

    Returns:
        Mapping of integer CWE id to ``[own id, *children ids, *parent ids]``.
        Id ``0`` is always present and maps to an empty list.
    """
    def related_ids(relatives):
        # Each relative is a mapping keyed by its CWE id; only the first key is used.
        return [ int(list(relative)[0]) for relative in relatives ]

    cwe_dict: dict[int, list[int]] = { 0: [] }
    for cwe_id, entry in cwe_collection.items():
        own_id = int(cwe_id)
        cwe_dict[own_id] = [ own_id ] + related_ids(entry['children']) + related_ids(entry['parents'])
    return cwe_dict
accepted_cwe_lists = cwe_collection_to_dict(dataset['cwes'])

# Parser functions
def filter_findings(parsed: list[dict]):
    """Return the findings whose lower-cased severity is in ``relevant_findings``."""
    kept = []
    for finding in parsed:
        if finding['severity'].lower() in relevant_findings:
            kept.append(finding)
    return kept


def is_cwe_match(collection, correct: int, reported: int):
    """Tell whether ``reported`` is one of the CWE ids accepted for ``correct``.

    ``collection[correct]`` is the list of accepted ids (as produced by
    ``cwe_collection_to_dict``: the CWE itself plus its listed children and
    parents).
    """
    accepted = collection[correct]
    return reported in accepted

def is_line_match(corrects: list[int], reported: int):
    """Tell whether ``reported`` is one of the expected lines.

    A reported line of 0 never matches, even if 0 is listed in ``corrects``.
    """
    return reported != 0 and reported in corrects

def validate_findings(test: dict, findings: list[dict]):
    """Score one tool's (already filtered) findings against a single test.

    Args:
        test: Dataset test entry; reads ``id``, ``vulnerable``, ``lines``,
            ``cwe`` and ``description``.
        findings: Findings reported for this test. A finding carrying an
            ``override`` key is taken at face value (``True`` = counts as a
            match, anything else = does not); otherwise it matches when its
            ``line`` is one of the expected lines or its ``cwe`` is accepted
            for the test's CWE.

    Returns:
        A dict with ``id``, ``actual`` (the first matching finding, or an
        empty placeholder), ``expected`` and ``result`` (``tp``/``fp``/``tn``/
        ``fn`` counts). ``tp``, ``tn`` and ``fn`` are 0 or 1; ``fp`` is the
        number of findings that did not match.
    """
    expected = {
        "vulnerable": test['vulnerable'],
        "line": test['lines'],
        "cwe": accepted_cwe_lists[test['cwe']],
        "message": test['description'],
    }

    def outcome(tp=0, fp=0, tn=0, fn=0, hit=None):
        # Assemble the result record; `hit` is the finding shown as "actual".
        if hit is None:
            actual = { "vulnerable": False, "line": 0, "cwe": 0, "message": '' }
        else:
            actual = {
                "vulnerable": True,
                "line": hit['line'],
                "cwe": hit['cwe'],
                "message": hit['message'],
            }
        return {
            "id": test['id'],
            "actual": actual,
            "expected": expected,
            "result": { 'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn },
        }

    if not test['vulnerable']:
        # True negative: nothing reported on a non-vulnerable test.
        if len(findings) == 0:
            return outcome(tn=1)
        # False positive: there is nothing to find in a non-vulnerable test,
        # so every finding is a false positive - also one that names the
        # test's CWE or line. Such findings must never be credited as a true
        # positive.
        return outcome(fp=len(findings))

    # Filter true findings
    matches = []
    for f in findings:
        # Skip overrides to false and include overrides to true
        if 'override' in f:
            if f['override'] == True:
                matches.append(f)
            continue

        # Include if line or CWE matches
        if is_line_match(test['lines'], f['line']) or is_cwe_match(accepted_cwe_lists, test['cwe'], f['cwe']):
            matches.append(f)
    fp_count = len(findings) - len(matches)

    # False negative
    if not matches:
        return outcome(fp=fp_count, fn=1)

    # True positive
    # If two findings are reported on the same issue, we count them as one, so
    # we are not subtracting points. Some systems may report on an issue in
    # the given line multiple times, because they separate out the underlying
    # cause and its effects. We are not deducting points for that.
    return outcome(tp=1, fp=fp_count, hit=matches[0])
    
print(f'Processing {len(reports)} reports...')
results = {}
report_total = len(reports)
for position, name in enumerate(reports, start=1):
    print(f'   [{position}/{report_total}] {name}')
    # For every reported test: drop low level / irrelevant findings, then
    # score the rest against the dataset test at the same index.
    results[name] = [
        validate_findings(dataset['tests'][test_index], filter_findings(rep['findings']))
        for test_index, rep in enumerate(reports[name]['tests'])
    ]

In [None]:
## CWE toplist standings
# For every CWE: the best (lowest) index in top_25_cwes reached by any id in
# its accepted list, or -1 when none of them is in the Top 25.
cwe_toplist = {
    cwe: min(
        (top_25_cwes.index(c) for c in accepted if c in top_25_cwes),
        default=-1,
    )
    for cwe, accepted in accepted_cwe_lists.items()
}

cwe_toplist

## Calculating Metrics

In [None]:
# CASTLE
# Per-tool outcome totals, CASTLE score and derived metrics.
scores = {}
for name in results:
    castle_score, tps, tns, fps, fns, bonus = castle(top_25_cwes, castle_toplist_bonus, results[name])
    outcome_total = tps + tns + fps + fns
    scores[name] = {
        'tp': tps,
        'tn': tns,
        'fp': fps,
        'fn': fns,
        'castle': castle_score,
        'precision': tps / (tps + fps) if tps + fps > 0 else 0, # avoid division by zero
        'recall': tps / (tps + fns) if tps + fns > 0 else 0, # recall is the same thing as 'coverage'
        # share of correct outcomes; guarded like precision/recall so a report
        # without any test does not abort the cell
        'accuracy': (tps + tns) / outcome_total if outcome_total > 0 else 0,
        'type': tool_type_map[name],
    }

# Preview one entry. .get() keeps the cell working when that particular
# report was not loaded (e.g. skipped because of its version).
scores.get('aikido')

In [None]:
# Count of true positives for each test. CASTLE-1, CASTLE-2, etc.
# For every test id: how many tools scored a true positive on it.
tp_counts = defaultdict(int)
for tool_results in results.values():
    for outcome in tool_results:
        if outcome['result']['tp'] == 1:
            tp_counts[ outcome['id'] ] += 1

# Most frequently detected tests first.
sorted_tp_counts = sorted(tp_counts.items(), key=lambda pair: pair[1], reverse=True)
sorted_tp_counts[:10], sorted_tp_counts[-10:]

In [None]:
# Count the dataset tests per CWE and print one LaTeX table row per CWE.
for i in cwes:
    cwes[i]['test_count'] = 0

for test in dataset['tests']:
    cwes[ str(test['cwe']) ]['test_count'] += 1

# Same entries, ordered by numeric CWE id.
ordered_cwes = { k: cwes[k] for k in sorted(cwes, key=int) }

for i in ordered_cwes:
    top_index = cwe_toplist[int(i)]
    # 1-based Top 25 rank, or '-' when the CWE is not ranked. Computed outside
    # the f-string: reusing the f-string's own quote character inside a
    # replacement field is a SyntaxError before Python 3.12.
    rank = top_index + 1 if top_index != -1 else '-'
    name = ordered_cwes[i]['name'].split("('")[0]
    print(f'CWE-{i} & {rank} & {name} \\\\')

In [None]:
# Combination score
def combination_score(results, a, b):
    """Score tools ``a`` and ``b`` as if their findings were merged.

    Per test, the merged outcome is:
      * ``tp``: 1 if either tool has a true positive,
      * ``tn``: 1 only if both tools have a true negative,
      * ``fp``: the sum of both tools' false positives,
      * ``fn``: 1 only if both tools missed the vulnerability.

    Args:
        results: Mapping of tool name to its list of per-test result records.
        a, b: Names of the two tools to combine.

    Returns:
        The tuple returned by ``castle`` for the merged outcomes.

    Raises:
        AssertionError: If the two result lists differ in length or their
            test ids do not line up.
    """
    resa = results[a]
    resb = results[b]
    assert len(resa) == len(resb), f"Test count mismatch: {len(resa)} != {len(resb)}, {a}:{b}"

    combined = []
    for i, (ra, rb) in enumerate(zip(resa, resb)):
        assert ra['id'] == rb['id'], f"Test ID mismatch at index {i}: {ra['id']} != {rb['id']}, {a}:{b}"
        out_a = ra['result']
        out_b = rb['result']
        combined.append({
            'name': ra['id'],
            'result': {
                'tp': int(out_a['tp'] == 1 or out_b['tp'] == 1),
                'tn': int(out_a['tn'] == 1 and out_b['tn'] == 1),
                'fp': out_a['fp'] + out_b['fp'],
                # A combination only misses a vulnerability when every member
                # missed it; with `or`, tests found by one tool were reported
                # as both tp and fn.
                'fn': int(out_a['fn'] == 1 and out_b['fn'] == 1),
            },
            'expected': {
                'cwe': ra['expected']['cwe'],
            },
        })

    return castle(top_25_cwes, castle_toplist_bonus, combined)

combination_score(results, 'codeql', 'aikido')

In [None]:
# Run all matrix combinations
# Combined CASTLE score of every ordered pair of distinct tools (each
# unordered pair therefore appears twice, once per order).
combinations = [
    { 'a': a, 'b': b, 'score': combination_score(results, a, b)[0] }
    for a in results
    for b in results
    if a != b
]

len(combinations), combinations[:15]

## Generating Charts

In [None]:
# Number of tests per CWE
# Bar chart: one bar per CWE (in ordered_cwes order) showing its test count.
x = list(ordered_cwes)
y = [ entry['test_count'] for entry in ordered_cwes.values() ]

plt.bar(x, y)
plt.title('Number of Tests per CWE')
plt.xlabel('CWE')
plt.ylabel('Number of Tests')

# set plot size (width, height in inches)
fig = plt.gcf()
fig.set_size_inches(12, 3)

plt.show()

In [None]:
# Chart of color-coded castle scores (lowest score first)
scores_sorted = sorted(scores.items(), key=lambda item: item[1]['castle'], reverse=False)

x = [ display_names[ s[0] ] for s in scores_sorted ]
y = [ s[1]['castle'] for s in scores_sorted ]
bar_types = [ tool_type_map[s[0]] for s in scores_sorted ]
colors = [ tool_color_map[t] for t in bar_types ]

fig, ax = plt.subplots()
bar = ax.bar(x, y, color=colors)
ax.set_ylabel('CASTLE Score')

ax.set_xticks(range(len(x)), labels=x, rotation=45, ha="right", rotation_mode="anchor")

# Print each score at the tip of its bar (below the tip for negative scores).
for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'top' if height < 0 else 'bottom'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)

# Legend
# Built from tool_color_map - the palette the bars are actually drawn with -
# and limited to the categories present in the chart. The previous legend
# used color_map, whose LLM color differs from the bars and which has no
# entry for the 'gca' category.
legend_labels = {
    'sast': 'Static Application Security Tester',
    'gca': 'Generic Code Analyzer',
    'fv': 'Formal Verification',
    'llm': 'LLM',
}
legend_elements = [
    Patch(facecolor=tool_color_map[t], label=legend_labels.get(t, t))
    for t in tool_color_map
    if t in bar_types
]
ax.legend(handles=legend_elements, loc='lower center', bbox_to_anchor=(0.5, 1.0), ncol=2)

plt.savefig('assets/castle-scores.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Chart of castle scores divided between SCAs and LLMs
# "SCA" here means every non-LLM tool. The tool categories in use are 'sast',
# 'gca' and 'fv'; filtering on [ 'sast', 'bmc' ] silently dropped all 'gca'
# and 'fv' tools because no tool has the type 'bmc'.
scores_sca = { k: v for k, v in scores.items() if v['type'] != 'llm' }
scores_llm = { k: v for k, v in scores.items() if v['type'] == 'llm' }
print(f"SCAs: {len(scores_sca)}, LLMs: {len(scores_llm)}")

scores_sca_sorted = sorted(scores_sca.items(), key=lambda item: item[1]['castle'], reverse=False)
scores_llm_sorted = sorted(scores_llm.items(), key=lambda item: item[1]['castle'], reverse=False)

x_sca = [ display_names[ s[0] ] for s in scores_sca_sorted ]
y_sca = [ s[1]['castle'] for s in scores_sca_sorted ]

x_llm = [ display_names[ s[0] ] for s in scores_llm_sorted ]
y_llm = [ s[1]['castle'] for s in scores_llm_sorted ]


def _draw_score_bars(ax, labels, values):
    """Draw one bar per tool on `ax` and print each score at the tip of its bar."""
    bars = ax.bar(labels, values)
    ax.set_xticks(range(len(labels)), labels=labels, rotation=45, ha="right", rotation_mode="anchor")
    for rect in bars:
        height = rect.get_height()
        vertical_anchor = 'top' if height < 0 else 'bottom'
        ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)
    return bars


# Panel widths follow the number of bars; max(..., 1) keeps the ratio valid
# when one group is empty.
fig, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw={'width_ratios': [max(len(scores_sca), 1), max(len(scores_llm), 1)]})

## 1: SCAs
# Draw on the axes created above; requesting the panels again through
# plt.subplot() does not reuse the width_ratios layout.
ax = axs[0]
bar = _draw_score_bars(ax, x_sca, y_sca)
ax.set_ylim(-800, 1000)
ax.set_ylabel('CASTLE Score')

## 2: LLMs

ax = axs[1]
bar = _draw_score_bars(ax, x_llm, y_llm)
ax.yaxis.set_label_position("right")
ax.yaxis.tick_right()

plt.savefig('assets/castle-scores-separate.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Scatter plot of false positives (x, log scale) vs true positives (y) for
# all non-LLM tools
import numpy as np

# Data
# A false positive count of 0 is plotted at 0.1 so it can be shown on the log axis.
x = [ scores[e]['fp'] if scores[e]['fp'] != 0 else 0.1 for e in scores if scores[e]['type'] != 'llm' ] # avoid log(0) error
y = [ scores[e]['tp'] for e in scores if scores[e]['type'] != 'llm' ]
colors = [ tool_color_map[ tool_type_map[e] ] for e in scores if scores[e]['type'] != 'llm' ]
names = [ e for e in scores if scores[e]['type'] != 'llm' ]

# Base plot
fig, ax = plt.subplots(figsize=(5, 3))
ax.scatter(x, y,
    c=colors,
    marker='o', # https://matplotlib.org/stable/api/markers_api.html#module-matplotlib.markers
)


# Labels
ax.set_xlabel('False Positives (lower is better)')
ax.set_ylabel('True Positives (higher is better)')
ax.set_xscale("log")
labels = []
# NOTE: this replaces the figsize=(5, 3) requested when the figure was created.
fig.figure.set_size_inches(8, 4)
# One text label per tool, placed at its point and then moved by adjustText
# to reduce overlaps.
for i,n in enumerate(names):
    display_name = display_names[n]
    labels += [ ax.text(x[i], y[i], display_name) ]

adjustText.adjust_text(labels, x=x, y=y, ax=ax, expand_points=(1.2, 1.2), expand_text=(1.2, 1.2), force_text=(0.5, 0.5), force_points=(0.5, 0.5))

# Remember the automatically chosen limits so the reference line drawn below
# does not rescale the axes.
x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()

# Generate x values in logarithmic space
x_values = np.logspace(np.log10(x_min), np.log10(x_max), 100)
#x_values = np.linspace(x_min, x_max, 100)

# Corresponding y values (1:1 ratio)
y_values = np.clip(x_values, y_min, y_max)  # Ensures y-values do not exceed plot limits

# Plot the 1:1 reference line
ax.plot(x_values, y_values, linestyle="--", color="gray", label="1:1 Ratio Line")
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

# Legend
# Hand-built entries: one patch per non-LLM tool category plus the reference line.
legend_elements = [
    Patch(facecolor=tool_color_map['sast'], label='Static Application Security Tester'),
    Patch(facecolor=tool_color_map['gca'], label='Generic Code Analyzer'),
    Patch(facecolor=tool_color_map['fv'], label='Formal Verification'),
    #Patch(facecolor=tool_color_map['combination'], label='Tool Combination'),
    Patch(facecolor='none', edgecolor='gray', linestyle='--', label="1:1 Ratio Line"),
]
ax.legend(handles=legend_elements, bbox_to_anchor=(0.85, 1.25), ncol=2)

plt.savefig('assets/tp-fp-scatter.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# CASTLE Scores of tool combinations (/10)
# Both axes list every tool, in the iteration order of `results`.
x = results.keys()
y = results.keys()

# data[row][col] is the combined score of tools (y[row], x[col]) divided by 10
# and truncated to an integer.
data = np.array([ [ int(combination_score(results, a, b)[0]/10) for a in x ] for b in y ])
# Filter out combinations of the same tools and ensure each combination is only
# shown once: the mask covers the diagonal and everything above it.
mask = np.tri(data.shape[0], k=0).T
data = np.ma.array(data, mask=mask)

fig, ax = plt.subplots()
im = ax.imshow(data)

# Show all ticks and label them with the respective list entries
# (raw report names are used here, not display_names)
ax.set_xticks(range(len(y)), labels=y,
              rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(range(len(x)), labels=x)

# Loop over data dimensions and create text annotations.
for i in range(len(x)):
    for j in range(len(y)):
        if i > j: # Only show labels on lower triangle
            text = ax.text(j, i, data[i, j], ha="center", va="center", color="w")

ax.set_title("CASTLE Score of Tool Combinations")
fig.tight_layout()
fig.figure.set_size_inches(10, 10)
plt.show()

In [None]:
# Create a midpoint normalization generator for matplotlib heatmaps
class MidpointNormalize(matplotlib.colors.Normalize):
    """Normalization that pins ``midpoint`` to 0.5 and scales each side on its own.

    Values in ``[midpoint, vmax]`` are mapped to ``[0.5, 1]`` and values in
    ``[vmin, midpoint)`` to ``[0, 0.5)``. On each side the linear fraction is
    raised to the power ``gamma`` (``gamma < 1`` stretches values close to the
    midpoint).

    Args:
        vmin: Value mapped to 0. ``None`` is filled in from the first data passed.
        vmax: Value mapped to 1. ``None`` is filled in from the first data passed.
        midpoint: Value mapped to 0.5.
        gamma: Exponent applied to the fraction on each side.
        clip: Whether values are clipped to ``[vmin, vmax]`` by default.
    """

    def __init__(self, vmin, vmax, midpoint=0, gamma=1.0, clip=False):
        self.midpoint = midpoint
        self.gamma = gamma
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Normalize ``value`` (scalar or array-like).

        Returns a float for scalar input, otherwise a masked array carrying
        the input's mask.
        """
        if clip is None:
            clip = self.clip

        result, is_scalar = self.process_value(value)
        self.autoscale_None(result)  # only fills in vmin/vmax that are still None
        vmin, vmax, midpoint, gamma = self.vmin, self.vmax, self.midpoint, self.gamma

        input_mask = np.ma.getmask(result)
        values = np.ma.getdata(result).astype(float)
        if clip:
            values = np.clip(values, vmin, vmax)

        # Start from NaN so that NaN input stays NaN instead of picking up
        # whatever an uninitialised buffer happens to contain.
        rescaled = np.full(values.shape, np.nan)

        # A side without extent (e.g. vmax == midpoint because the data has no
        # value above the midpoint) collapses onto 0.5 instead of evaluating 0/0.
        upper_span = vmax - midpoint
        lower_span = midpoint - vmin

        # Normalize values at or above the midpoint
        above = values >= midpoint
        if upper_span > 0:
            rescaled[above] = 0.5 + 0.5 * ((values[above] - midpoint) / upper_span)**gamma
        else:
            rescaled[above] = 0.5

        # Normalize values below the midpoint
        below = values < midpoint
        if lower_span > 0:
            rescaled[below] = 0.5 - 0.5 * ((midpoint - values[below]) / lower_span)**gamma
        else:
            rescaled[below] = 0.5

        normalized = np.ma.array(rescaled, mask=input_mask)
        # process_value() wraps a scalar in a one-element array; unwrap it
        # again, as the base class does.
        return normalized[0] if is_scalar else normalized

In [None]:
# Score delta from higher tool score
# For every pair of non-LLM tools: combined CASTLE score minus the better of
# the two individual scores.

x = [ r for r in results if scores[r]['type'] != 'llm' ]
y = list(x)
tick_labels = [ display_names[tool] for tool in x ]

data = np.array([
    [
        combination_score(results, a, b)[0] - max( scores[a]['castle'], scores[b]['castle'])
        for a in x
    ] for b in y
])

# Mask the upper right triangle (diagonal included) to not show duplicates
mask = np.tri(data.shape[0], k=0).T
data = np.ma.array(data, mask=mask)

# Work on a private copy of the colormap so that changing its "bad" (masked)
# color does not alter the globally registered RdYlGn instance.
cmap = plt.get_cmap("RdYlGn").copy()
cmap.set_bad(color='white')  # masked cells are drawn white

fig, ax = plt.subplots()
norm = MidpointNormalize(vmin=np.min(data), vmax=np.max(data), midpoint=0, gamma=0.5)
im = ax.imshow(data, cmap=cmap, norm=norm) # Diverging colormap centred on a delta of 0


# Show all ticks and label them with the respective list entries
ax.set_xticks(range(len(y)), labels=tick_labels, rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(range(len(x)), labels=tick_labels)

# Loop over data dimensions and create text annotations.
# The masked diagonal cells are annotated too; a masked value prints as '--'.
for i in range(len(x)):
    for j in range(len(y)):
        if i >= j:
            text = str(data[i, j])
            if data[i, j] > 0:
                text = f'+{text}'
            ax.text(j, i, text, ha="center", va="center", color='black', fontsize=8, rotation=45)

fig.tight_layout()
fig.figure.set_size_inches(6, 6)
ax.set_rasterization_zorder(1) # This solves an issue where the masked background was black when exported to eps

plt.savefig('assets/combination-delta.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
## Combination toplist
# (a, b, combined score) for every pairing of the tool lists `x` and `y`
# defined by the previous cell.
combination_scores = [
    (a, b, combination_score(results, a, b)[0])
    for b in y
    for a in x
]

# best combined score first
combination_scores.sort(key=lambda entry: entry[2], reverse=True)
# filter duplicate combinations (keeps one order per pair, drops a == b)
combination_scores = [ c for c in combination_scores if c[0] < c[1] ]
combination_scores[30:50]

In [None]:
# Toplist for Latex
# One row per tool: its report name plus all of its score fields.
toplist = [ { 'name': tool, **metrics } for tool, metrics in scores.items() ]
# Split into non-LLM tools and LLMs, each ordered by CASTLE score, best first.
tool_toplist = sorted(
    (row for row in toplist if tool_type_map[row['name']] != 'llm'),
    key=lambda row: row['castle'],
    reverse=True,
)
llm_toplist = sorted(
    (row for row in toplist if tool_type_map[row['name']] == 'llm'),
    key=lambda row: row['castle'],
    reverse=True,
)

norm = MidpointNormalize(vmin=-1000, vmax=1250, midpoint=0, gamma=0.5)
colormap = plt.get_cmap("RdYlGn")

def cellcolor(value):
    r"""Return a LaTeX ``\cellcolor[rgb]{r, g, b}`` command for ``value``.

    The color is looked up in the module-level ``colormap``; each channel is
    printed with four decimals.
    """
    red, green, blue = colormap(value)[:3]
    return f'\\cellcolor[rgb]{{{red:.4f}, {green:.4f}, {blue:.4f}}}'
    
def print_toplist(toplist):
    """Print one LaTeX table row per entry of ``toplist``.

    Columns: display name, version, tp, tn, fp, fn, precision, recall,
    accuracy (whole percentages) and the rounded CASTLE score followed by its
    cell color command.

    Args:
        toplist: Iterable of score rows with ``name``, ``tp``, ``tn``, ``fp``,
            ``fn``, ``precision``, ``recall``, ``accuracy`` and ``castle``.
    """
    for i in toplist:
        # Percentages are rounded once, by the ':.0f' format below. Rounding
        # to one decimal first and then to zero decimals can push a value such
        # as 49.45 up to 50.
        precision = i['precision'] * 100
        recall = i['recall'] * 100
        accuracy = i['accuracy'] * 100

        castle_score = round(i['castle'])
        # NOTE(review): 977 is a hard-coded reference score used to scale the
        # color - presumably the best score at the time of writing; confirm.
        color = cellcolor(castle_score/977)

        # Tools without a recorded version (missing or None) are shown as '-'.
        version = manual_version_map.get(i['name'])
        if version is None:
            version = '-'

        print(f'{ display_names[i["name"]] } & {version} & {i["tp"]} & {i["tn"]} & {i["fp"]} & {i["fn"]} & {precision:.0f}\\% & {recall:.0f}\\% & {accuracy:.0f}\\% & {castle_score} {color} \\\\')

print_toplist(tool_toplist)
print('\\hline \\hline')
print_toplist(llm_toplist)

In [None]:
# Cubic combinations

def cubic_combination_score(results, a, b, c):
    """Return the CASTLE score of tools ``a``, ``b`` and ``c`` with merged findings.

    Per test, the merged outcome is:
      * ``tp``: 1 if any tool has a true positive,
      * ``tn``: 1 only if all tools have a true negative,
      * ``fp``: the sum of all tools' false positives,
      * ``fn``: 1 only if all tools missed the vulnerability.

    Args:
        results: Mapping of tool name to its list of per-test result records.
        a, b, c: Names of the three tools to combine.

    Returns:
        The combined CASTLE score (first element of ``castle``'s result).

    Raises:
        AssertionError: If the result lists differ in length or their test
            ids do not line up.
    """
    resa = results[a]
    resb = results[b]
    resc = results[c]
    assert len(resa) == len(resb) == len(resc), f"Test count mismatch: {len(resa)} != {len(resb)} != {len(resc)}"

    combined = []
    for i, (ra, rb, rc) in enumerate(zip(resa, resb, resc)):
        # The message reports all three ids (it used to print the first one twice).
        assert ra['id'] == rb['id'] == rc['id'], f"Test ID mismatch at index {i}: {ra['id']} != {rb['id']} != {rc['id']}"
        outcomes = (ra['result'], rb['result'], rc['result'])
        combined.append({
            'name': ra['id'],
            'result': {
                'tp': int(any(o['tp'] == 1 for o in outcomes)),
                'tn': int(all(o['tn'] == 1 for o in outcomes)),
                'fp': outcomes[0]['fp'] + outcomes[1]['fp'] + outcomes[2]['fp'],
                # A combination only misses a vulnerability when every member
                # missed it; with `or`, tests found by one tool were reported
                # as both tp and fn.
                'fn': int(all(o['fn'] == 1 for o in outcomes)),
            },
            'expected': {
                'cwe': ra['expected']['cwe'],
            },
        })

    castle_score, tps, tns, fps, fns, bonus = castle(top_25_cwes, castle_toplist_bonus, combined)
    return castle_score

combination_score(results, 'codeql', 'aikido')[0]

# Imported as a module on purpose: the name `combinations` is already used
# for a list in this notebook.
import itertools

# Score every unordered triple of distinct tools once.
# itertools.combinations yields the triples in the iteration order of
# `results`, which is the same set and order the previous triple loop
# selected by rebuilding list(results.keys()) and calling .index() for every
# candidate.
cubic_combinations = []
for a, b, c in itertools.combinations(results, 3):
    cubic_combinations.append({
        'a': a,
        'b': b,
        'c': c,
        'best': max(scores[a]['castle'], scores[b]['castle'], scores[c]['castle']),
        'score': cubic_combination_score(results, a, b, c)
    })

# Filter out worse results
cubic_combinations = [ c for c in cubic_combinations if c['score'] > c['best'] ]
# Sort by biggest delta
cubic_combinations = sorted(cubic_combinations, key=lambda x: abs(x['score'] - x['best']), reverse=True)

print(len(cubic_combinations))
for i in cubic_combinations[:25]:
    print(f'{i["a"]} & {i["b"]} & {i["c"]} ||| best={i["best"]} togeher={i["score"]} (Δ {i["score"] - i["best"]}) ')

In [None]:
# Heatmap of true positives: rows are CWEs, columns are tools
cwe_list = list(ordered_cwes.keys())
tool_list = list(results.keys())

coverage_matrix = np.zeros((len(cwe_list), len(tool_list)))

# Count, per tool, the vulnerable tests it detected and add each one to the
# row of the test's own CWE (the first entry of its expected CWE list).
row_of_cwe = { int(cwe): row for row, cwe in enumerate(cwe_list) }
for col, tool in enumerate(tool_list):
    for test in results[tool]:
        if not (test['expected']['vulnerable'] and test['result']['tp'] == 1):
            continue
        row = row_of_cwe.get(test['expected']['cwe'][0])
        if row is not None:
            coverage_matrix[row, col] += 1

# Plot the heatmap
fig, ax = plt.subplots(figsize=(12, 8))
im = ax.imshow(coverage_matrix, cmap="YlGnBu")

# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(tool_list)), labels=[ display_names[name] for name in results ], rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(np.arange(len(cwe_list)), labels=[f'CWE-{cwe}' for cwe in cwe_list])

# Write the count into every cell; white text on the darker cells (count > 4).
for i in range(len(cwe_list)):
    for j in range(len(tool_list)):
        count = int(coverage_matrix[i, j])
        text_color = "white" if count > 4 else "black"
        ax.text(j, i, str(count), ha="center", va="center", color=text_color)

fig.tight_layout()

plt.savefig('assets/tp-heatmap.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Matrix for vulnerability coverage, separated by tool type
# The matrix construction and the drawing code are identical for both panels,
# so they live in two small helpers instead of being written out twice.

def _tp_coverage_matrix(cwe_list, tool_list):
    """Count, per (CWE, tool), the vulnerable tests of that CWE the tool detected."""
    matrix = np.zeros((len(cwe_list), len(tool_list)))
    for i, cwe in enumerate(cwe_list):
        for j, tool in enumerate(tool_list):
            for test in results[tool]:
                if test['expected']['cwe'][0] == int(cwe) and test['expected']['vulnerable'] and test['result']['tp'] == 1:
                    matrix[i, j] += 1
    return matrix


def _draw_coverage_panel(ax, matrix, tool_list, title):
    """Draw `matrix` as a heatmap on `ax` with tool tick labels and a count in every cell."""
    im = ax.imshow(matrix, cmap="YlGnBu")
    ax.set_xticks(np.arange(len(tool_list)), labels=[ display_names[name] for name in tool_list ], rotation=45, ha="right", rotation_mode="anchor")
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            count = int(matrix[i, j])
            # white text on the darker cells
            ax.text(j, i, str(count), ha="center", va="center", color="white" if count > 4 else "black")
    ax.title.set_text(title)
    return im


cwe_list = list(ordered_cwes.keys())
fig, axs = plt.subplots(1, 2, figsize=(12, 8))

## 1 Static analysis tools (every non-LLM tool); this panel carries the CWE labels
tool_list = [ r for r in results if scores[r]['type'] != 'llm'  ]
coverage_matrix = _tp_coverage_matrix(cwe_list, tool_list)
ax = axs[0]
im = _draw_coverage_panel(ax, coverage_matrix, tool_list, 'Static Analysis Tools')
ax.set_yticks(np.arange(len(cwe_list)), labels=[f'CWE-{cwe}' for cwe in cwe_list])

## 2 LLM; shares the row order of the first panel, so no y tick labels
tool_list = [ r for r in results if scores[r]['type'] == 'llm'  ]
coverage_matrix = _tp_coverage_matrix(cwe_list, tool_list)
ax = axs[1]
im = _draw_coverage_panel(ax, coverage_matrix, tool_list, 'Large Language Models')
ax.set_yticks([])

fig.tight_layout()
# Negative horizontal spacing pulls the two panels next to each other.
plt.subplots_adjust(hspace=0, wspace=-0.56)

plt.savefig('assets/tp-heatmap-separate.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Matrix for false positive rates on non-vulnerable tests
cwe_list = list(ordered_cwes.keys())
tool_list = list(results.keys())

# Cell color follows the rate; the printed number is the underlying count.
fp_rate_matrix = np.zeros((len(cwe_list), len(tool_list)))
fp_count_matrix = np.zeros((len(cwe_list), len(tool_list)), dtype=int)

# Fill the matrices with false positive data
for i, cwe in enumerate(cwe_list):
    cwe_id = int(cwe)
    # The number of non-vulnerable tests depends on the CWE only, so it is
    # computed once per row instead of once per cell.
    total_non_vulnerable_tests = sum(1 for test in dataset['tests'] if test['cwe'] == cwe_id and not test['vulnerable'])
    if total_non_vulnerable_tests == 0:
        continue
    for j, tool in enumerate(tool_list):
        # non-vulnerable tests of this CWE on which the tool reported at least one false positive
        false_positives = sum(1 for test in results[tool] if test['expected']['cwe'][0] == cwe_id and not test['expected']['vulnerable'] and test['result']['fp'] > 0)
        fp_count_matrix[i, j] = false_positives
        fp_rate_matrix[i, j] = false_positives / total_non_vulnerable_tests

# Plot the heatmap
fig, ax = plt.subplots(figsize=(12, 8))
im = ax.imshow(fp_rate_matrix, cmap="Reds")

# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(tool_list)), labels=[ display_names[name] for name in tool_list ], rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(np.arange(len(cwe_list)), labels=[f'CWE-{cwe}' for cwe in cwe_list])

# Loop over data dimensions and create text annotations.
# The label is the stored count (it used to be rebuilt as int(rate * 4), which
# is only right when a CWE has exactly four non-vulnerable tests). The text
# color is chosen from the rate, which is what the 0.4 threshold refers to;
# comparing the count with 0.4 turned every non-zero cell white, including
# the lightly shaded ones.
for i in range(len(cwe_list)):
    for j in range(len(tool_list)):
        text = str(fp_count_matrix[i, j])
        if fp_rate_matrix[i, j] > 0.4:
            ax.text(j, i, text, ha="center", va="center", color="white")
        else:
            ax.text(j, i, text, ha="center", va="center", color="black")

fig.tight_layout()

plt.savefig('assets/fp-heatmap.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Radar charts for precision, recall and accuracy for all tools (DISABLED TO SAVE SPACE)
def create_radar_chart(data, categories, title):
    """Draw a single-series radar (spider) chart on a polar subplot.

    data: mapping whose values are plotted in iteration order, one per spoke
    categories: spoke labels; their count fixes the number of spokes
    title: text placed above the chart
    """
    spoke_count = len(categories)

    # Spread the spokes evenly around the full circle
    spoke_angles = [k / float(spoke_count) * 2 * pi for k in range(spoke_count)]
    # Repeat the first angle so the drawn outline closes on itself
    closed_angles = spoke_angles + spoke_angles[:1]

    # Polar axes that hosts the chart
    ax = plt.subplot(111, polar=True)

    # One labelled spoke per category
    plt.xticks(spoke_angles, categories)

    # Radial grid: fixed 0..1 range with grey tick labels
    ax.set_rlabel_position(0)
    radial_ticks = [0.2, 0.4, 0.6, 0.8]
    plt.yticks(radial_ticks, ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
    plt.ylim(0, 1)

    # Close the value polygon the same way as the angles
    outline = list(data.values())
    outline = outline + outline[:1]
    ax.plot(closed_angles, outline, linewidth=1, linestyle='solid')
    ax.fill(closed_angles, outline, 'b', alpha=0.1)

    plt.title(title, size=20, color='b', y=1.1)

# Metrics shown on every per-tool radar chart
categories = ['precision', 'recall', 'accuracy']

# Render one radar chart per tool; the figure is cleared right away because
# the plt.show() call is intentionally commented out
for tool in scores:
    tool_scores = scores[tool]
    data = { metric: tool_scores[metric] for metric in ('precision', 'recall', 'accuracy') }
    create_radar_chart(data, categories, tool)
    #plt.show()
    plt.clf()

In [None]:
# Radar charts for categorically combined tools
# Function to create radar chart with multiple datasets
def create_stacked_radar_chart(ax, datasets, categories, title):
    """Overlay several datasets as radar (spider) polygons on the given polar Axes.

    Ticks, limits, title and legend are applied to ``ax`` itself rather than
    to pyplot's implicit current Axes, so the chart lands on the requested
    subplot even when another Axes is current.

    ax: polar Axes to draw on
    datasets: iterable of dicts with a 'label' (legend entry) and 'values'
        (mapping whose values are plotted in iteration order, one per spoke)
    categories: spoke labels; their count fixes the number of spokes
    title: text placed above the chart
    """
    N = len(categories)

    # Spread the spokes evenly around the circle, then repeat the first angle
    # so every polygon closes on itself
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    # One labelled spoke per category
    ax.set_xticks(angles[:-1], categories)

    # Radial grid: fixed 0..1 range with grey tick labels
    ax.set_rlabel_position(0)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8], ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
    ax.set_ylim(0, 1)

    # Plot each dataset as an outlined, lightly filled polygon
    for data in datasets:
        values = list(data['values'].values())
        values += values[:1]
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=data['label'])
        ax.fill(angles, values, alpha=0.1)

    ax.set_title(title, size=14, color='black', y=1.1)

    # Legend sits below the chart
    ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.45), ncol=3)

# Metrics shown on both radar charts
categories = ['precision', 'recall', 'accuracy']

# Two polar panels side by side, created without any ticks
fig, axs = plt.subplots(1, 2, figsize=(10, 5), subplot_kw=dict(polar=True))
for panel in (axs[0], axs[1]):
    panel.set_xticks([])
    panel.set_yticks([])

## ------- CHART 1: every LLM

# selected_tools = ['aikido', 'cppcheck', 'codeql']
selected_tools = [ t for t in scores if scores[t]['type'] == 'llm' ]
datasets = [
    {
        'label': display_names[tool],
        'values': { metric: scores[tool][metric] for metric in ('precision', 'recall', 'accuracy') },
    }
    for tool in selected_tools
]

ax = plt.subplot(1, 2, 1, polar=True)
create_stacked_radar_chart(ax, datasets, categories, "Large Language Models")

## ------- CHART 2: everything that is not an LLM

# selected_tools = ['aikido', 'cppcheck', 'codeql']
selected_tools = [ t for t in scores if scores[t]['type'] != 'llm' ]
datasets = [
    {
        'label': display_names[tool],
        'values': { metric: scores[tool][metric] for metric in ('precision', 'recall', 'accuracy') },
    }
    for tool in selected_tools
]

ax = plt.subplot(1, 2, 2, polar=True)
create_stacked_radar_chart(ax, datasets, categories, "Static Analyzers")

plt.tight_layout()
plt.savefig('assets/radars.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Venn diagram for detected vulnerabilities for tool combinations

# Function to get the set of detected vulnerabilities for a tool
def get_detected_vulnerabilities(tool_name):
    """Return the ids of the tests on which ``tool_name`` recorded exactly one true positive.

    tool_name: key into the global ``results`` mapping
    Returns a set of test ids.
    """
    return { test['id'] for test in results[tool_name] if test['result']['tp'] == 1 }

# The three tools compared in the Venn diagram
tool_a, tool_b, tool_c = 'aikido', 'clang-analyzer', 'esbmc'

# Detected-vulnerability id sets, one per tool
set_a = get_detected_vulnerabilities(tool_a)
set_b = get_detected_vulnerabilities(tool_b)
set_c = get_detected_vulnerabilities(tool_c)
detected_sets = [set_a, set_b, set_c]

# Filled Venn regions first, dashed circle outlines on top
plt.figure(figsize=(10, 7))
venn = venn3(detected_sets, (tool_a, tool_b, tool_c))
venn3_circles(detected_sets, linestyle='dashed')

plt.show()

In [None]:
# Square treemap of score components

# Component name -> value; the values drive the tile areas
score_components = {
    'True Positives': tps,
    'True Negatives': tns,
    'False Positives': fps,
    'False Negatives': fns,
    'Toplist Bonus': bonus
}

# Tile areas, plus a two-line caption (name over value) for each tile
sizes = [ value for value in score_components.values() ]
labels = [ f'{name}\n{score_components[name]}' for name in score_components ]

# Draw the treemap on its own square figure
fig, ax = plt.subplots(figsize=(5, 5))
squarify.plot(sizes=sizes, label=labels, alpha=0.7, ax=ax)
plt.title('Treemap of Score Components')
plt.axis('off')
plt.show()

In [None]:
# Box-and-whisker plots for score components

# Per-tool values of each confusion-matrix component, in `scores` iteration order
tp_scores, tn_scores, fp_scores, fn_scores = (
    [ scores[tool][component] for tool in scores ]
    for component in ('tp', 'tn', 'fp', 'fn')
)
# The bonus is taken from element 5 of a fresh castle() evaluation for each tool
bonus_scores = [ castle(top_25_cwes, castle_toplist_bonus, results[tool])[5] for tool in scores ]

# Panel title -> data, in display order
score_data = {
    'True Positives': tp_scores,
    'True Negatives': tn_scores,
    'False Positives': fp_scores,
    'False Negatives': fn_scores,
    'Toplist Bonus': bonus_scores
}

# One box plot per component, side by side
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(20, 5))
fig.suptitle('Box-and-Whisker Plots for Score Components')

for ax, label in zip(axes, score_data):
    ax.boxplot(score_data[label])
    ax.set_title(label)
    ax.set_ylabel('Scores')

plt.tight_layout()
plt.show()

In [None]:
# Single, matrix and cubic combinations
# Each entry is (tool names, combined CASTLE score, best single-tool CASTLE score among them)
all_combinations = []
all_combinations += [ ( [name], scores[name]['castle'], scores[name]['castle'] ) for name in scores ] # 1
all_combinations += [ ( [cs[0], cs[1]], cs[2], max(scores[cs[0]]['castle'], scores[cs[1]]['castle']) ) for cs in combination_scores ] # 2
#all_combinations += [ ( [cc['a'], cc['b'], cc['c']], cc['score'], max(scores[cc['a']]['castle'], scores[cc['b']]['castle'], scores[cc['c']]['castle']) ) for cc in cubic_combinations ] # 3

# Remove every entry that involves an LLM
all_combinations = [ c for c in all_combinations if all(scores[tool]['type'] != 'llm' for tool in c[0]) ]

# Best combined score first; the chart shows the top 20
all_combinations = sorted(all_combinations, key=lambda x: x[1], reverse=True)
all_combinations_toplist = all_combinations[:20]

x = [ " + ".join(c) for c, s, m in all_combinations_toplist ]
# Lower bar segment: the best single tool of the entry
singles = [ m for c, s, m in all_combinations_toplist ]
# Upper bar segment: what the combination adds on top, clamped at 0 when it scores worse
combinations = [ max(0, s - m) for c, s, m in all_combinations_toplist ]

fig, ax = plt.subplots(figsize=(8, 6))
single_bar = ax.bar(x, singles, label='Group 1', color='blue')
combi_bar = ax.bar(x, combinations, bottom=singles, label='Group 2', color='green')

# Rotation is set here once; a separate plt.xticks(rotation=45) call would only repeat it
ax.set_xticks(range(len(x)), labels=x, rotation=45, ha="right", rotation_mode="anchor")

# The per-bar value labels were disabled by an unconditional `continue`,
# so that unreachable annotation loop has been dropped.

plt.title('Top 20 Combinations')
plt.ylabel('CASTLE Score')
plt.show()

In [None]:
# Top-list entries ranked by how much the combined score exceeds the best single tool
biggest_increase = sorted(
    ( (c, s, m, s - m) for c, s, m in all_combinations_toplist ),
    key=lambda entry: entry[3],
    reverse=True,
)
biggest_increase[:10]

In [None]:
## Chart of castle scores with combinations
# all_combinations also holds single-tool entries. If one of them reached the
# top 5, its label (already a display name) would be looked up in
# display_names again below and raise a KeyError, so only genuine multi-tool
# entries are considered here.
multi_tool_combinations = [ c for c in all_combinations if len(c[0]) > 1 ]
top_combination_score_items = sorted(multi_tool_combinations, key=lambda x: x[1], reverse=True)[:5]
top_combination_score_items = [ (" + ".join(display_names[n] for n in top[0]), top[1], tool_color_map['llm']) for top in top_combination_score_items ]
all_tool_items = [ (n, scores[n]['castle'], tool_color_map[scores[n]['type']]) for n in scores if scores[n]['type'] != 'llm' ]
# Ascending by score so the best entry ends up on the right
scores_sorted = sorted(all_tool_items + top_combination_score_items, key=lambda item: item[1], reverse=False)

# Tool entries still carry their id; combination labels are already display text
x = [ display_names[ s[0] ] if '+' not in s[0] else s[0] for s in scores_sorted ]
y = [ s[1] for s in scores_sorted ]
colors = [s[2] for s in scores_sorted]

fig, ax = plt.subplots()
bar = ax.bar(x, y, color=colors)
plt.ylabel('CASTLE Score')

ax.set_xticks(range(len(x)), labels=x, rotation=45, ha="right", rotation_mode="anchor")

# Print each score at the tip of its bar (below the tip for negative bars)
for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'top' if height < 0 else 'bottom'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)

# Legend (combinations reuse the 'llm' colour slot)
legend_elements = [
    Patch(facecolor=color_map['sast']['color'] , label=color_map['sast']['label']),
    Patch(facecolor=color_map['fv']['color'], label=color_map['fv']['label']),
    Patch(facecolor=color_map['llm']['color'], label='Combination'),
]
ax.legend(handles=legend_elements, bbox_to_anchor=(0.9, 1.15), ncol=3)

# set plot size
fig = plt.gcf()
fig.set_size_inches(8, 4)
plt.savefig('assets/tool-combo-castle-scores.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
## Chart of castle scores divided between SCAs and LLMs

## 1: SCAs
# all_combinations also holds single-tool entries. If one of them reached the
# top 5, its label (already a display name) would be used as a key into
# display_names / tool_type_map below and raise a KeyError, so only genuine
# multi-tool entries are considered here.
multi_tool_combinations = [ c for c in all_combinations if len(c[0]) > 1 ]
top_combination_score_items = sorted(multi_tool_combinations, key=lambda x: x[1], reverse=True)
top_combination_score_items = top_combination_score_items[:5]
top_combination_score_items = [ (" + ".join(display_names[n] for n in top[0]), top[1], tool_color_map['llm']) for top in top_combination_score_items ]
all_tool_items = [ (n, scores[n]['castle'], tool_color_map[scores[n]['type']]) for n in scores if scores[n]['type'] != 'llm' ]
# Ascending by score so the best entry ends up on the right
scores_sorted = sorted(all_tool_items + top_combination_score_items, key=lambda item: item[1], reverse=False)

# Tool entries still carry their id; combination labels are already display text
x_sca = [ display_names[ s[0] ] if '+' not in s[0] else s[0] for s in scores_sorted ]
y_sca = [ s[1] for s in scores_sorted ]
colors_sca = [ tool_color_map[ tool_type_map[s[0]] ] if '+' not in s[0] else tool_color_map['combination'] for s in scores_sorted]

## 2: LLMs
scores_llm = { k: v for k, v in scores.items() if v['type'] == 'llm' }
scores_llm_sorted = sorted(scores_llm.items(), key=lambda item: item[1]['castle'], reverse=False)
x_llm = [ display_names[ s[0] ] for s in scores_llm_sorted ]
y_llm = [ s[1]['castle'] for s in scores_llm_sorted ]
colors_llm = [ tool_color_map['llm'] ] * len(x_llm)

## 3: SCAs plot
# Panel widths are proportional to the number of bars in each panel
fig, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw={'width_ratios': [len(x_sca), len(x_llm)]})
ax = plt.subplot(1, 2, 1)

bar = ax.bar(x_sca, y_sca, color=colors_sca)
plt.ylim(-800, 1000)
plt.ylabel('CASTLE Score (C@250)')

ax.set_xticks(range(len(x_sca)), labels=x_sca, rotation=45, ha="right", rotation_mode="anchor")

# Print each score at the tip of its bar (below the tip for negative bars)
for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'top' if height < 0 else 'bottom'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)

ax.title.set_text('Static Code Analyzers and Tool Combinations')

## 4: LLMs plot

ax = plt.subplot(1, 2, 2)
bar = ax.bar(x_llm, y_llm, color=colors_llm)
ax.yaxis.set_label_position("right")
ax.yaxis.tick_right()

ax.set_xticks(range(len(x_llm)), labels=x_llm, rotation=45, ha="right", rotation_mode="anchor")

for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'top' if height < 0 else 'bottom'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)
ax.title.set_text('Large Language Models')

# Legend
legend_elements = [
    Patch(facecolor=tool_color_map['sast'], label='Static Application Security Tester'),
    Patch(facecolor=tool_color_map['gca'], label='Generic Code Analyzer'),
    Patch(facecolor=tool_color_map['fv'], label='Formal Verification'),
    Patch(facecolor=tool_color_map['combination'], label='Tool Combination'),
    Patch(facecolor=tool_color_map['llm'], label='Large Language Model'),
]
ax.legend(handles=legend_elements, bbox_to_anchor=(1.2, 1.3), ncol=3)

plt.subplots_adjust(hspace=0, wspace=0.05)
# NOTE(review): the "SCAs and Combinations" chart further down saves to this
# same path, so whichever cell runs last overwrites the other's figure.
plt.savefig('assets/castle-scores-separate.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
## Chart of castle scores divided between SCAs and Combinations

## 1: SCAs
# NOTE(review): all_combinations also holds single-tool entries, so a single
# tool can show up in the "Combinations" panel if it ranks in the top 5.
top_combination_score_items = sorted(all_combinations, key=lambda x: x[1], reverse=True)
top_combination_score_items = top_combination_score_items[:5]
top_combination_score_items = [ (" + ".join(display_names[n] for n in top[0]), top[1], color_map['llm']['color']) for top in top_combination_score_items ]
all_tool_items = [ (n, scores[n]['castle'], tool_color_map[scores[n]['type']]) for n in scores if scores[n]['type'] != 'llm' ]
# Both panels are ordered ascending by score
tool_scores_sorted = sorted(all_tool_items, key=lambda item: item[1], reverse=False)
combination_scores_sorted = sorted(top_combination_score_items, key=lambda item: item[1], reverse=False)

x_sca = [ display_names[ s[0] ] if '+' not in s[0] else s[0] for s in tool_scores_sorted ]
y_sca = [ s[1] for s in tool_scores_sorted ]
colors_sca = [ tool_color_map[ tool_type_map[s[0]] ] if '+' not in s[0] else tool_color_map['combination'] for s in tool_scores_sorted]

## 2: Combinations (the *_llm names are kept from the SCA/LLM chart this cell was derived from)
x_llm = [ s[0] for s in combination_scores_sorted ]
y_llm = [ s[1] for s in combination_scores_sorted ]
colors_llm = [ tool_color_map['combination'] ] * len(combination_scores_sorted)

## 3: SCAs plot
# Panel widths are proportional to the number of bars in each panel
fig, axs = plt.subplots(1, 2, figsize=(8, 3), gridspec_kw={'width_ratios': [len(x_sca), len(x_llm)]})
ax = plt.subplot(1, 2, 1)

bar = ax.bar(x_sca, y_sca, color=colors_sca)
plt.ylabel('CASTLE Score (C@250)')

ax.set_xticks(range(len(x_sca)), labels=x_sca, rotation=45, ha="right", rotation_mode="anchor")

# The y-axis starts at 0, so a negative bar is not visible: its (red) value is
# printed on the baseline instead of at the bar tip.
for rect in bar:
    height = rect.get_height()
    label_style = {'color': 'red'} if height < 0 else {}
    ax.text(rect.get_x() + rect.get_width()/2., max(height, 0), f'{height}', ha='center', va='bottom', size=8, **label_style)

ax.title.set_text('Tools')

# Dashed reference lines at the 200 mark and at the perfect score
ax.axhline(y=200, color='gray', linestyle='--', linewidth=1)
ax.axhline(y=perfect_castle_score, color='green', linestyle='--', linewidth=1)
# Single y-range for the panel, with headroom above the perfect-score line
plt.ylim(0, perfect_castle_score + 50)


## 4: Combination plot

ax = plt.subplot(1, 2, 2)
bar = ax.bar(x_llm, y_llm, color=colors_llm)
ax.yaxis.set_label_position("right")
ax.yaxis.tick_right()

ax.set_xticks(range(len(x_llm)), labels=x_llm, rotation=45, ha="right", rotation_mode="anchor")

for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'top' if height < 0 else 'bottom'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)
ax.title.set_text('Combinations')
ax.axhline(y=200, color='gray', linestyle='--', linewidth=1)
ax.axhline(y=perfect_castle_score, color='green', linestyle='--', linewidth=1)
plt.ylim(0, perfect_castle_score + 50)

# Legend
legend_elements = [
    Patch(facecolor=tool_color_map['sast'], label='Static Application Security Tester'),
    Patch(facecolor=tool_color_map['gca'], label='Generic Code Analyzer'),
    Patch(facecolor=tool_color_map['fv'], label='Formal Verification'),
    Patch(facecolor=tool_color_map['combination'], label='Tool Combination'),
]
ax.legend(handles=legend_elements, bbox_to_anchor=(0.4, 1.4), ncol=2)

plt.subplots_adjust(hspace=0, wspace=0.05)
# NOTE(review): the "SCAs and LLMs" chart above saves to this same path, so
# whichever cell runs last overwrites the other's figure.
plt.savefig('assets/castle-scores-separate.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Mean recall over all tools of type 'sast'
sast_recalls = [ scores[tool]['recall'] for tool in scores if scores[tool]['type'] == 'sast' ]
sast_recall = sum(sast_recalls) / len(sast_recalls)
sast_recall

In [None]:
# Log-log regression of FP vs. TP over non-LLM tools with a positive CASTLE score
positive_tool_scores = [ scores[tool] for tool in scores if scores[tool]['type'] != 'llm' and scores[tool]['castle'] > 0 ]
TP = np.array([ entry['tp'] for entry in positive_tool_scores ])
FP = np.array([ entry['fp'] for entry in positive_tool_scores ])

# The logarithm needs strictly positive counts on both axes
mask = (TP > 0) & (FP > 0)
TP_filtered, FP_filtered = TP[mask], FP[mask]

# Move to log-log space, where a power law becomes a straight line
log_TP = np.log(TP_filtered)
log_FP = np.log(FP_filtered)

# Ordinary least squares on the transformed data
slope, intercept, r_value, p_value, std_err = stats.linregress(log_TP, log_FP)

# Back-transform the fit: FP = exp(intercept) * TP ** slope
FP_estimated = np.exp(intercept) * TP_filtered ** slope

# Report the fit
print("Log-Log Regression Results:")
print(f"Exponent (n): {slope:.3f}")
print(f"Intercept: {intercept:.3f}")
print(f"R-squared: {r_value**2:.3f}")

# Observations and fitted line, both in log-log space
fit_label = f"Fit: log(FP) = {slope:.2f} log(TP) + {intercept:.2f}"
plt.scatter(log_TP, log_FP, label="Observed Data", color="blue")
plt.plot(log_TP, slope * log_TP + intercept, color="red", label=fit_label)

plt.xlabel("log(True Positives)")
plt.ylabel("log(False Positives)")
plt.legend()
plt.title("Log-Log Regression of FP vs. TP")
plt.show()

In [None]:
# Log-log (power-law) regression of FP vs. TP
# NOTE(review): this cell was titled "Exponential regression", but it regresses
# log(FP) on log(TP) and repeats the computation of the preceding cell.
def _is_scored_non_llm(tool):
    # Non-LLM tools with a positive CASTLE score
    return scores[tool]['type'] != 'llm' and scores[tool]['castle'] > 0

TP = np.array([ scores[tool]['tp'] for tool in scores if _is_scored_non_llm(tool) ])
FP = np.array([ scores[tool]['fp'] for tool in scores if _is_scored_non_llm(tool) ])

# Drop tools with a zero count on either axis; log() is undefined there
mask = (TP > 0) & (FP > 0)
TP_filtered = TP[mask]
FP_filtered = FP[mask]

# Natural-log transform of both axes
log_TP, log_FP = np.log(TP_filtered), np.log(FP_filtered)

# Straight-line fit in log-log space
slope, intercept, r_value, p_value, std_err = stats.linregress(log_TP, log_FP)

# Fitted FP values on the original scale
FP_estimated = np.exp(intercept) * TP_filtered ** slope

# Print results
for line in (
    "Log-Log Regression Results:",
    f"Exponent (n): {slope:.3f}",
    f"Intercept: {intercept:.3f}",
    f"R-squared: {r_value**2:.3f}",
):
    print(line)

# Plot original data vs. fitted line (log-log space)
plt.scatter(log_TP, log_FP, label="Observed Data", color="blue")
fitted_log_FP = slope * log_TP + intercept
plt.plot(log_TP, fitted_log_FP, color="red", label=f"Fit: log(FP) = {slope:.2f} log(TP) + {intercept:.2f}")

plt.xlabel("log(True Positives)")
plt.ylabel("log(False Positives)")
plt.legend()
plt.title("Log-Log Regression of FP vs. TP")
plt.show()

In [None]:
# Linear regression of FP vs. TP (uses the unfiltered TP / FP arrays built earlier in the notebook)
slope, intercept, r_value, p_value, std_err = stats.linregress(TP, FP)

# Fitted FP value for every observed TP
FP_estimated = slope * TP + intercept

# Report the fit
print("Linear Regression Results:")
print(f"Slope (a): {slope:.3f}")
print(f"Intercept (b): {intercept:.3f}")
print(f"R-squared: {r_value**2:.3f}")

# Observations plus the fitted straight line
fit_label = f"Fit: FP = {slope:.2f} TP + {intercept:.2f}"
plt.scatter(TP, FP, label="Observed Data", color="blue")
plt.plot(TP, FP_estimated, color="red", label=fit_label)

plt.xlabel("True Positives")
plt.ylabel("False Positives")
plt.legend()
plt.title("Linear Regression of FP vs. TP")
plt.show()


In [None]:
# Quadratic, cubic, exponential and power law models

# Quadratic model: FP = a*TP^2 + b*TP + c (least-squares polynomial fit)
quad_coeffs = np.polyfit(TP, FP, deg=2)
FP_quad = np.polyval(quad_coeffs, TP)
r2_quad = r2_score(FP, FP_quad)

# Cubic model: FP = a*TP^3 + b*TP^2 + c*TP + d
cubic_coeffs = np.polyfit(TP, FP, deg=3)
FP_cubic = np.polyval(cubic_coeffs, TP)
r2_cubic = r2_score(FP, FP_cubic)

# Exponential Model
def exp_model(x, a, b):
    """Evaluate the exponential curve a * exp(b * x)."""
    growth = np.exp(b * x)
    return a * growth

# Fit the exponential model; maxfev raises the optimizer's evaluation budget
exp_params, _ = curve_fit(f=exp_model, xdata=TP, ydata=FP, maxfev=5000)
exp_a, exp_b = exp_params
FP_exp = exp_model(TP, exp_a, exp_b)
r2_exp = r2_score(FP, FP_exp)

# Power Law Model
def power_model(x, a, b):
    """Evaluate the power-law curve a * x ** b."""
    raised = x ** b
    return a * raised

# Fit the power-law model; maxfev raises the optimizer's evaluation budget
power_params, _ = curve_fit(power_model, TP, FP, maxfev=5000)
FP_power = power_model(TP, *power_params)
r2_power = r2_score(FP, FP_power)

# Print R^2 values
print(f"Quadratic R²: {r2_quad:.3f}")
print(f"Cubic R²: {r2_cubic:.3f}")
print(f"Exponential R²: {r2_exp:.3f}")
print(f"Power Law R²: {r2_power:.3f}")

# Plot results
plt.scatter(TP, FP, label="Observed Data", color="blue")

# plt.plot joins points in the order given. TP follows the order of `scores`,
# not ascending TP, so the non-linear fits would be drawn as zig-zag
# polylines. Draw every fitted curve left-to-right instead.
plot_order = np.argsort(TP, kind='stable')
TP_sorted = TP[plot_order]
fitted_curves = [
    (FP_quad, f"Quadratic (R²={r2_quad:.2f})", "green"),
    (FP_cubic, f"Cubic (R²={r2_cubic:.2f})", "purple"),
    (FP_exp, f"Exponential (R²={r2_exp:.2f})", "orange"),
    (FP_power, f"Power Law (R²={r2_power:.2f})", "red"),
]
for fitted, curve_label, curve_color in fitted_curves:
    plt.plot(TP_sorted, fitted[plot_order], label=curve_label, linestyle="dashed", color=curve_color)

plt.xlabel("True Positives")
plt.ylabel("False Positives")
plt.legend()
plt.title("Testing Different Regression Models")
plt.show()

In [None]:
# Misc dataset statistics

def format_number(number):
    """Format a number for the statistics table.

    Whole-valued numbers are returned unchanged; anything with a fractional
    part becomes a string with thousands separators and one decimal place.
    """
    is_whole = int(number) == number
    return number if is_whole else f'{number:,.1f}'

# Emit one LaTeX table row per dataset statistic: name & min & average & max & total
ds_stats = dataset['statistics']
statistic_keys = ['line_count', 'functions', 'cyclomatic_complexity', 'halstead_volume', 'maintainability_index', 'cl100k_base_tokens' ]
for s in statistic_keys:
    entry = ds_stats[s]
    # Human-readable row label derived from the key
    name = s.replace('_', ' ').title().capitalize()
    min_v, max_v, avg, total = ( format_number(entry[field]) for field in ('min', 'max', 'average', 'total') )
    print(f'{name} & {min_v} & {avg} & {max_v} & {total} \\\\')

In [None]:
# Chart of castle scores for LLMs
# (name, score-dict) pairs of every LLM, ascending by CASTLE score
llm_scores = sorted(
    ( (k, v) for k, v in scores.items() if v['type'] == 'llm' ),
    key=lambda item: item[1]['castle'],
    reverse=False,
)

x = [ display_names[name] for name, _entry in llm_scores ]
y = [ entry['castle'] for _name, entry in llm_scores ]
colors = [ tool_color_map[tool_type_map[name]] for name, _entry in llm_scores ]

fig, ax = plt.subplots()
bar = ax.bar(x, y, color=colors)
plt.ylabel('CASTLE Score')

ax.set_xticks(range(len(x)), labels=x, rotation=45, ha="right", rotation_mode="anchor")

# Print each score at the tip of its bar (below the tip for negative bars)
for rect in bar:
    height = rect.get_height()
    vertical_anchor = 'bottom' if not height < 0 else 'top'
    ax.text(rect.get_x() + rect.get_width()/2., height, f'{height}', ha='center', va=vertical_anchor, size=8)

# Legend: one patch per entry of color_map
legend_elements = [ Patch(facecolor=style['color'], label=style['label']) for style in color_map.values() ]
ax.legend(handles=legend_elements, bbox_to_anchor=(0.95, 1.1), ncol=3)

# Dotted line across the chart at the 200 mark
ax.axhline(y=200, color='gray', linestyle='--', linewidth=1)

plt.savefig('assets/castle-scores-llms.eps', format='eps', bbox_inches='tight')
plt.show()

In [None]:
# Create a synthetic report with random line numbers
import random
def gen_synthetic_report():
    """Score a fabricated report that flags one random line in every test.

    For each test in ``dataset['tests']`` a single high-severity finding with
    CWE 0 is generated on a random line between 1 and the test's line count.
    The findings are passed through ``filter_findings`` and
    ``validate_findings``, and the validated results are scored with
    ``castle`` using a toplist bonus of 1.

    Returns whatever ``castle`` returns for those results.
    """
    tests = dataset['tests']

    # Phase 1: fabricate one report per test
    reports = [
        {
            'id': t['id'],
            'findings': [
                {
                    'severity': 'high',
                    'line': random.randint(1, t['line_count']),
                    'cwe': 0,
                    'message': 'Synthetic finding',
                }
            ],
            'report': 'SYNTHETIC'
        }
        for t in tests
    ]

    # Phase 2: validate each report against the test it was generated for
    results = [
        validate_findings(test, filter_findings(rep['findings']))
        for test, rep in zip(tests, reports)
    ]

    return castle(top_25_cwes, 1, results)

# Average element 5 of the castle() result over n independent synthetic reports
n = 1000
synthetic_reports = [ gen_synthetic_report() for _run in range(n) ]
sum(report[5] for report in synthetic_reports) / n