In [None]:
import os
import pandas as pd
import re
from sklearn.metrics import accuracy_score

In [None]:
# Spelled-out CVSS metric values and the one-letter code each one abbreviates.
# The pairs are listed in a fixed order because the extraction helpers walk
# this mapping in insertion order.
cvss_value_map = dict(
    [
        ("None", "N"),
        ("Low", "L"),
        ("Medium", "M"),
        ("High", "H"),
        ("Network", "N"),
        ("Adjacent", "A"),
        ("Local", "L"),
        ("Physical", "P"),
        ("Required", "R"),
        ("Changed", "C"),
        ("Unchanged", "U"),
    ]
)

# Accepted one-letter codes for every metric abbreviation; each string below
# is expanded into a list with one entry per character.
cvss_options = {
    metric: list(codes)
    for metric, codes in (
        ("AV", "NALP"),
        ("AC", "LH"),
        ("PR", "NLH"),
        ("UI", "NR"),
        ("S", "UC"),
        ("C", "NLH"),
        ("I", "NLH"),
        ("A", "NLH"),
    )
}

def extract_cvss_value(component, text):
    """Extract the one-letter value of a single CVSS metric from a model response.

    The response is searched in three passes, returning on the first hit:

    1. Shorthand such as ``AV:L`` (any letter case, optional spaces around
       the colon). Every occurrence is examined, not only the first.
    2. A spelled-out value such as ``Network``; when several applicable words
       occur, the one mentioned first in the text wins.
    3. Any valid code that appears as a standalone uppercase letter.

    Args:
        component: Metric abbreviation; must be a key of ``cvss_options``.
        text: Raw model response. Anything that is not a ``str`` yields "?".

    Returns:
        A code from ``cvss_options[component]``, or "?" when nothing matches.
    """
    if not isinstance(text, str):
        return "?"

    text = text.strip()

    if '<think>' in text:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    valid_codes = cvss_options[component]

    # 1. Shorthand (e.g., AV:L).
    #    - The look-behind keeps "C" from matching inside "AC:L" and "I"
    #      inside "UI:R".
    #    - The look-ahead keeps the first letter of an ordinary word
    #      ("AV: a network ...") from being read as a code.
    #    - The captured letter is upper-cased because matching is
    #      case-insensitive while the valid codes are uppercase.
    shorthand = re.compile(
        rf"(?<![A-Za-z]){re.escape(component)}\s*:\s*([A-Za-z])(?![A-Za-z])",
        flags=re.IGNORECASE,
    )
    for match in shorthand.finditer(text):
        code = match.group(1).upper()
        if code in valid_codes:
            return code

    # 2. Spelled-out value. Several words share a letter ("Low"/"Local" -> L,
    #    "None"/"Network" -> N), so the Attack Vector words are only accepted
    #    for AV and the remaining words only for the other metrics.
    av_words = {"Network", "Adjacent", "Local", "Physical"}
    best_pos = None
    best_code = None
    for full, code in cvss_value_map.items():
        if code not in valid_codes:
            continue
        if (component == "AV") != (full in av_words):
            continue
        found = re.search(rf"\b{re.escape(full)}\b", text, flags=re.IGNORECASE)
        # Keep the earliest mention so "Adjacent Network" resolves to A.
        if found and (best_pos is None or found.start() < best_pos):
            best_pos = found.start()
            best_code = code
    if best_code is not None:
        return best_code

    # 3. Any valid short code mentioned as a standalone uppercase letter.
    for code in valid_codes:
        if re.search(rf"\b{code}\b", text):
            return code

    return "?"

# Apply to your dataframe
def clean_cvss_responses(df):
    """Add one ``<metric>_clean`` column per CVSS metric to a copy of ``df``.

    Each ``<metric>_response`` column is passed value by value through
    ``extract_cvss_value`` and the result is stored in ``<metric>_clean``.

    Args:
        df: DataFrame containing the columns ``AV_response``, ``AC_response``,
            ``PR_response``, ``UI_response``, ``S_response``, ``C_response``,
            ``I_response`` and ``A_response``.

    Returns:
        A new DataFrame with the eight ``*_clean`` columns appended. The input
        frame is left untouched.

    Raises:
        KeyError: if one of the response columns is missing.
    """
    component_names = ["AV", "AC", "PR", "UI", "S", "C", "I", "A"]

    # Work on a copy: callers pass frames that are cached elsewhere, and
    # adding columns in place would silently change that cache.
    cleaned = df.copy()

    for component in component_names:
        cleaned[f"{component}_clean"] = cleaned[f"{component}_response"].apply(
            lambda response, component=component: extract_cvss_value(component, response)
        )

    return cleaned

In [None]:
# Load every CSV found below ./results/llms.
#   csv_files: file names in the order they were read
#   csv_data:  file name -> DataFrame
csv_files = []
csv_data = {}

for root, dirs, files in os.walk('./results/llms'):
    # os.walk lists directory entries in arbitrary order. Sorting `dirs` in
    # place fixes the traversal order, so everything built by iterating
    # csv_data comes out in the same order on every run and machine.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue
        file_path = os.path.join(root, file)
        # NOTE: entries are keyed by bare file name, so a CSV with the same
        # name in a later directory replaces the earlier one in csv_data.
        csv_files.append(file)
        csv_data[file] = pd.read_csv(file_path)

In [None]:
# Ground-truth table: the eight metric columns get a "_true" suffix, every
# other column keeps its name.
df_gt = pd.read_parquet('./dataset.parquet').rename(
    columns=lambda col: f"{col}_true" if col in ("AV", "AC", "PR", "UI", "S", "C", "I", "A") else col
)
df_gt.head()

### Component Evaluation

In [None]:
# Result files whose name mentions "components".
components_data = {
    file_name: frame
    for file_name, frame in csv_data.items()
    if "components" in file_name
}

In [None]:
components_results = []

# Metrics to evaluate, with the raw-response and ground-truth column names
# derived from them (hoisted out of the loop because they never change).
components = ["AV", "AC", "PR", "UI", "S", "C", "I", "A"]
response_columns = [f"{comp}_response" for comp in components]
truth_columns = [f"{comp}_true" for comp in components]

for key, df in components_data.items():
    # Clean CVSS responses, then drop the original response columns.
    # drop() returns a new frame here instead of running in place, so the
    # frames cached in csv_data keep their raw response columns.
    cleaned_df = clean_cvss_responses(df).drop(columns=response_columns)

    # Update the dataframe in components_data
    components_data[key] = cleaned_df

    # Merge with ground truth (inner join: rows without a matching cve_id
    # are left out of the accuracy computation)
    merged_df = cleaned_df.merge(
        df_gt[['cve_id', *truth_columns]],
        on='cve_id',
        suffixes=('_pred', '_true'),
    )

    # One row per model: model name (file name without suffix) first, then
    # the accuracy of every metric
    accuracy_row = {"model": key.replace('_components.csv', '')}
    for comp in components:
        accuracy_row[comp] = accuracy_score(
            merged_df[f"{comp}_true"], merged_df[f"{comp}_clean"]
        )

    # Append result
    components_results.append(accuracy_row)

# Create results DataFrame. Passing the column list explicitly puts 'model'
# first and still yields a well-formed (empty) table when no file matched,
# where selecting the 'model' column afterwards would raise KeyError.
results_df = pd.DataFrame(components_results, columns=["model", *components])

# Save to CSV
results_df.to_csv('./results/components.csv', index=False)
results_df

### Vector Evaluation

In [None]:
# Plain vector result files: the name contains "_vector.csv" and no "cwe".
vector_data = {
    file_name: frame
    for file_name, frame in csv_data.items()
    if 'cwe' not in file_name and "_vector.csv" in file_name
}

In [None]:
# Try to parse CVSS vector
def parse_vector_string(vector_text):
    """Collect ``METRIC:VALUE`` pairs (e.g. ``AV:N``) from a CVSS vector string.

    Only pairs whose metric is a key of ``cvss_options`` and whose value is
    listed for that metric are kept. When a metric occurs more than once, the
    last valid occurrence wins.

    Args:
        vector_text: Text that may contain a vector such as
            ``CVSS:3.1/AV:N/AC:L/...``.

    Returns:
        dict mapping metric abbreviation to its one-letter value; empty when
        nothing valid is found or ``vector_text`` is not a string.
    """
    if not isinstance(vector_text, str):
        return {}

    components = {}
    # The look-behind requires the metric name to start at a non-letter, so
    # prefixed names such as "MAV:L" or "MAC:H" are not read as AV / AC and
    # cannot overwrite the values parsed earlier in the string.
    matches = re.findall(r"(?<![A-Za-z])([A-Z]{1,2}):([A-Z])", vector_text)
    for comp, val in matches:
        if comp in cvss_options and val in cvss_options[comp]:
            components[comp] = val
    return components

# Extract from noisy text
def extract_component_from_text(component, text):
    """Extract the value of one CVSS metric from a free-form model response.

    Passes, in order (the first hit is returned):

    1. A ``METRIC:VALUE`` pair found by ``parse_vector_string``.
    2. A labelled value such as ``AV: N``, ``AV - N`` or ``AV N``.
    3. A spelled-out value such as ``Network``; when several applicable words
       occur, the one mentioned first in the text wins.
    4. Any valid code that appears as a standalone uppercase letter.

    Args:
        component: Metric abbreviation; must be a key of ``cvss_options``.
        text: Raw model response. Anything that is not a ``str`` yields "?".

    Returns:
        A code from ``cvss_options[component]``, or "?" when nothing matches.
    """
    if not isinstance(text, str):
        return "?"

    valid_codes = cvss_options[component]

    # 1. Try from full CVSS vector
    vector_parts = parse_vector_string(text)
    if component in vector_parts:
        return vector_parts[component]

    # 2. Try matching "AV: N", "AV - N", etc.
    #    - The look-behind keeps "I" from matching inside "UI".
    #    - A separator or whitespace is required so that ordinary uppercase
    #      words ("IN", "AN") are not read as metric plus value.
    #    - Every occurrence is examined, not only the first.
    labelled = re.finditer(
        rf"(?<![A-Za-z]){re.escape(component)}(?:\s*[:\-]\s*|\s+)([A-Z])\b", text
    )
    for match in labelled:
        if match.group(1) in valid_codes:
            return match.group(1)

    # 3. Try full word (e.g., "Network" -> "N").
    #    - Word boundaries stop "Unchanged" from matching "Changed" and
    #      "allows" from matching "Low".
    #    - Several words share a letter ("Low"/"Local" -> L, "None"/"Network"
    #      -> N), so the Attack Vector words are only accepted for AV and the
    #      remaining words only for the other metrics.
    av_words = {"Network", "Adjacent", "Local", "Physical"}
    best_pos = None
    best_code = None
    for full, code in cvss_value_map.items():
        if code not in valid_codes:
            continue
        if (component == "AV") != (full in av_words):
            continue
        found = re.search(rf"\b{re.escape(full)}\b", text, re.IGNORECASE)
        # Keep the earliest mention so "Adjacent Network" resolves to A.
        if found and (best_pos is None or found.start() < best_pos):
            best_pos = found.start()
            best_code = code
    if best_code is not None:
        return best_code

    # 4. Last resort: search for any valid code in text
    for code in valid_codes:
        if re.search(rf"\b{code}\b", text):
            return code

    return "?"

# Clean function to extract components from messy LLM vector
def extract_components_from_llm_vector(df, vector_column="cvss_response"):
    """Add one ``<metric>_clean`` column per CVSS metric to a copy of ``df``.

    Every value of ``vector_column`` is passed through
    ``extract_component_from_text`` once per metric (AV, AC, PR, UI, S, C, I,
    A) and the result is stored in ``<metric>_clean``.

    Args:
        df: DataFrame holding the raw model output.
        vector_column: Name of the column with the raw output.

    Returns:
        A new DataFrame with the eight ``*_clean`` columns added. The input
        frame is left untouched.

    Raises:
        KeyError: if ``vector_column`` is not a column of ``df``.
    """
    # Work on a copy: callers pass frames that are cached elsewhere, and
    # adding columns in place would silently change that cache.
    cleaned = df.copy()
    for comp in ("AV", "AC", "PR", "UI", "S", "C", "I", "A"):
        cleaned[f"{comp}_clean"] = cleaned[vector_column].apply(
            lambda text, comp=comp: extract_component_from_text(comp, text)
        )
    return cleaned

In [None]:
vector_results = []

# Metrics to evaluate and their ground-truth column names (hoisted out of
# the loop because they never change).
components = ["AV", "AC", "PR", "UI", "S", "C", "I", "A"]
truth_columns = [f"{comp}_true" for comp in components]

for key, df in vector_data.items():
    # Clean CVSS responses
    cleaned_df = extract_components_from_llm_vector(df, vector_column="response")

    # Merge with ground truth (inner join: rows without a matching cve_id
    # are left out of the accuracy computation)
    merged_df = cleaned_df.merge(
        df_gt[['cve_id', *truth_columns]],
        on='cve_id',
        suffixes=('_pred', '_true'),
    )

    # One row per model: model name (file name without suffix) first, then
    # the accuracy of every metric
    accuracy_row = {"model": key.replace('_vector.csv', '')}
    for comp in components:
        accuracy_row[comp] = accuracy_score(
            merged_df[f"{comp}_true"], merged_df[f"{comp}_clean"]
        )

    # Append result
    vector_results.append(accuracy_row)

# Create results DataFrame. Passing the column list explicitly puts 'model'
# first and still yields a well-formed (empty) table when no file matched,
# where selecting the 'model' column afterwards would raise KeyError.
results_df = pd.DataFrame(vector_results, columns=["model", *components])

# Save to CSV
results_df.to_csv('./results/vector.csv', index=False)
results_df

### Few Shot Results

In [None]:
# Few-shot vector result files, selected by file-name fragment.
vector_few_shot_data = dict(
    (file_name, frame)
    for file_name, frame in csv_data.items()
    if "_vector_few_shot.csv" in file_name
)

In [None]:
vector_few_shot_results = []

# Metrics to evaluate and their ground-truth column names (hoisted out of
# the loop because they never change).
components = ["AV", "AC", "PR", "UI", "S", "C", "I", "A"]
truth_columns = [f"{comp}_true" for comp in components]

for key, df in vector_few_shot_data.items():
    # Clean CVSS responses
    cleaned_df = extract_components_from_llm_vector(df, vector_column="response")

    # Merge with ground truth (inner join: rows without a matching cve_id
    # are left out of the accuracy computation)
    merged_df = cleaned_df.merge(
        df_gt[['cve_id', *truth_columns]],
        on='cve_id',
        suffixes=('_pred', '_true'),
    )

    # One row per model: model name (file name without suffix) first, then
    # the accuracy of every metric
    accuracy_row = {"model": key.replace('_vector_few_shot.csv', '')}
    for comp in components:
        accuracy_row[comp] = accuracy_score(
            merged_df[f"{comp}_true"], merged_df[f"{comp}_clean"]
        )

    # Append result
    vector_few_shot_results.append(accuracy_row)

# Create results DataFrame. Passing the column list explicitly puts 'model'
# first and still yields a well-formed (empty) table when no file matched,
# where selecting the 'model' column afterwards would raise KeyError.
results_df = pd.DataFrame(vector_few_shot_results, columns=["model", *components])

# Save to CSV
results_df.to_csv('./results/vector_few_shot.csv', index=False)
results_df

### Vector CWE

In [None]:
# CWE-augmented vector result files, selected by file-name fragment.
cwe_vector_data = dict(
    (file_name, frame)
    for file_name, frame in csv_data.items()
    if "_cwe_vector.csv" in file_name
)

In [None]:
cwe_vector_results = []

# Metrics to evaluate and their ground-truth column names (hoisted out of
# the loop because they never change).
components = ["AV", "AC", "PR", "UI", "S", "C", "I", "A"]
truth_columns = [f"{comp}_true" for comp in components]

for key, df in cwe_vector_data.items():
    # Clean CVSS responses
    cleaned_df = extract_components_from_llm_vector(df, vector_column="response")

    # Merge with ground truth (inner join: rows without a matching cve_id
    # are left out of the accuracy computation)
    merged_df = cleaned_df.merge(
        df_gt[['cve_id', *truth_columns]],
        on='cve_id',
        suffixes=('_pred', '_true'),
    )

    # One row per model: model name (file name without suffix) first, then
    # the accuracy of every metric
    accuracy_row = {"model": key.replace('_cwe_vector.csv', '')}
    for comp in components:
        accuracy_row[comp] = accuracy_score(
            merged_df[f"{comp}_true"], merged_df[f"{comp}_clean"]
        )

    # Append result
    cwe_vector_results.append(accuracy_row)

# Create results DataFrame. Passing the column list explicitly puts 'model'
# first and still yields a well-formed (empty) table when no file matched,
# where selecting the 'model' column afterwards would raise KeyError.
results_df = pd.DataFrame(cwe_vector_results, columns=["model", *components])

# Save to CSV
results_df.to_csv('./results/cwe_vector.csv', index=False)
results_df

### CVSS Score

In [None]:
# Score result files, selected by file-name fragment.
score_data = dict(
    (file_name, frame)
    for file_name, frame in csv_data.items()
    if "_score.csv" in file_name
)

In [None]:
def extract_score_from_response(response):
    """Extract a numeric score from a raw model response.

    Args:
        response: The raw response. A number is returned as a float; a string
            is searched after removing any ``<think>...</think>`` sections.

    Returns:
        float or None:
        * the number itself for int/float input (``bool`` is rejected),
        * otherwise the last ``x.x`` decimal anywhere in the text,
        * otherwise the whole text if it consists of digits only (e.g. "10"),
        * ``None`` when no score can be found.

        The result is never a string, so the values can be used directly in
        arithmetic.
    """
    if not isinstance(response, str):
        # bool is a subclass of int but is never a score.
        if isinstance(response, (int, float)) and not isinstance(response, bool):
            return float(response)
        return None

    # Remove content between <think> brackets
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()

    # Find the last number in the format x.x. findall scans the whole text,
    # so a final answer on a later line wins over a decimal on an earlier one.
    decimals = re.findall(r"\d+\.\d+", response)
    if decimals:
        return float(decimals[-1])

    # A bare integer answer such as "7" or "10". Only accepted when it is the
    # entire response, to avoid picking up unrelated numbers from prose.
    if re.fullmatch(r"\d+", response):
        return float(response)

    return None


score_results = []

for key, df in score_data.items():
    # Clean CVSS responses: replace the raw "response" column by a numeric
    # "score_clean" column. assign() builds a new frame, so the frame cached
    # in csv_data is not modified. to_numeric(errors="coerce") turns anything
    # that is not a number into NaN so the arithmetic below cannot fail on
    # stray strings.
    scored = df.drop(columns=["response"]).assign(
        score_clean=pd.to_numeric(
            df["response"].apply(extract_score_from_response), errors="coerce"
        )
    )

    # Add CVSS score from df_gt
    scored = scored.merge(
        df_gt[['cve_id', 'cvss']], on='cve_id', how='left', suffixes=('', '_true')
    )

    # Calculate distance. Rows without a parsed score or without a ground
    # truth value are NaN and are skipped by mean().
    errors = scored["score_clean"] - scored["cvss"]
    mse = (errors ** 2).mean()
    mae = errors.abs().mean()
    rmse = mse ** 0.5

    # Add model name (strip suffix)
    model_name = key.replace('_score.csv', '')

    # Append result
    score_results.append({"model": model_name,
                          "mse": mse,
                          "mae": mae,
                          "rmse": rmse})

# Create results DataFrame. Passing the column list explicitly puts 'model'
# first and still yields a well-formed (empty) table when no file matched,
# where selecting the 'model' column afterwards would raise KeyError.
results_df = pd.DataFrame(score_results, columns=["model", "mse", "mae", "rmse"])

# Save to CSV
results_df.to_csv('./results/score.csv', index=False)
results_df