In [1]:
import csv
from collections import defaultdict
import os
import math
from utils.extractor import walk_and_extract_cwe

lang_to_ext = {
    'c': 'c',
    'cpp': 'cpp',
    'python': 'py',
    'java': 'java',
    'javascript': 'js',
    'php': 'php',
    "csharp": "cs"
}

In [2]:
def count_conversations(file_path):
    # Structure: {language: {conversation_hash: count, 'total_unique': count, 'total_occurrences': count}}
    language_data = defaultdict(lambda: {'unique_conversations': set(), 'total_occurrences': 0})

    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        for row in csv_reader:
            conversation_hash = row['conversation_hash']
            language = row['language']

            # Track data per language
            lang_data = language_data[language]

            # Add to unique conversations if not already present
            if conversation_hash not in lang_data['unique_conversations']:
                lang_data['unique_conversations'].add(conversation_hash)

            # Increment total occurrences
            lang_data['total_occurrences'] += 1

    # Convert sets to counts and prepare final output
    result = {}
    for language, data in language_data.items():
        result[language] = {
            'unique_conversations': len(data['unique_conversations']),
            'total_occurrences': data['total_occurrences']
        }

    return result

In [3]:
c_files = os.listdir("files/c/codes")

In [4]:
def get_code_lines_stats(c_files):
    line_counts = []

    for c_file in c_files:
        
        with open(f"files/c/codes/{c_file}", 'r', encoding='utf-8', errors='ignore') as code_file:
            lines = code_file.readlines()
            num_lines = len(lines)
            line_counts.append(num_lines)

    if not line_counts:
        return 0.0, 0.0  # No files found or no lines counted

    avg = sum(line_counts) / len(line_counts)
    variance = sum((x - avg) ** 2 for x in line_counts) / len(line_counts)
    std_dev = math.sqrt(variance)

    return avg, std_dev

get_code_lines_stats(c_files)

(20.574345742720237, 28.810912040148537)

In [5]:
allowed_rules = {
    "c": [
        "insecure-use-gets-fn",
        "insecure-use-memset",
        "insecure-use-printf-fn",
        "insecure-use-strcat-fn",
        "insecure-use-scanf-fn",
        "insecure-use-string-copy-fn",
    ]
}

In [6]:
for key, value in allowed_rules.items():
    print(key, len(value))

c 6


In [7]:
# Flatten allowed rules into a list
allowed_rules_list = [rule for sublist in allowed_rules.values() for rule in sublist]

# Structure: language -> rule -> {count, hashes}
language_rule_results = defaultdict(lambda: defaultdict(lambda: {"count": 0, "hashes": set()}))

with open('codegrep_results.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    
    for row in csv_reader:
        error_id = row['error_id'].split('.')[-1]  # Extract last part of error_id
        if error_id in allowed_rules_list:
            language = row['language']  # Make sure this column exists in your CSV
            conversation_hash = row['conversation_hash']
            
            # Update counts for language and rule
            language_rule_results[language][error_id]["count"] += 1
            language_rule_results[language][error_id]["hashes"].add(conversation_hash)

print("Aggregated results by language and rule:")
total_all = 0
for language, rules in language_rule_results.items():
    total_unique_hashes = set()
    total_count = 0
    print(f"Language: {language}")
    rule_id_to_cwes = walk_and_extract_cwe(rules)
    for rule, data in rules.items():
        unique_hash_count = len(data["hashes"])
        count = data["count"]
        total_unique_hashes.update(data["hashes"])
        total_count += count
        cwes = rule_id_to_cwes.get(rule, [])
        cwe_str = ", ".join(cwes) if cwes else "N/A"
        print(f"  Rule: {rule}")
        print(f"    CWE(s): {cwe_str}")
        print(f"    Unique conversation hashes: {unique_hash_count}")
        print(f"    Total occurrences: {count}")
    total_all += len(total_unique_hashes)
    print(f"  Overall unique conversation hashes (language): {len(total_unique_hashes)}")
    print(f"  Overall total occurrences (language): {total_count}")
    print()

Aggregated results by language and rule:
Language: c
  Rule: insecure-use-scanf-fn
    CWE(s): CWE-676: Use of Potentially Dangerous Function
    Unique conversation hashes: 378
    Total occurrences: 1182
  Rule: insecure-use-memset
    CWE(s): CWE-14: Compiler Removal of Code to Clear Buffers
    Unique conversation hashes: 117
    Total occurrences: 263
  Rule: insecure-use-string-copy-fn
    CWE(s): CWE-676: Use of Potentially Dangerous Function
    Unique conversation hashes: 120
    Total occurrences: 273
  Rule: insecure-use-strcat-fn
    CWE(s): CWE-676: Use of Potentially Dangerous Function
    Unique conversation hashes: 18
    Total occurrences: 56
  Rule: insecure-use-gets-fn
    CWE(s): CWE-676: Use of Potentially Dangerous Function
    Unique conversation hashes: 11
    Total occurrences: 19
  Rule: insecure-use-printf-fn
    CWE(s): CWE-134: Use of Externally-Controlled Format String
    Unique conversation hashes: 5
    Total occurrences: 14
  Overall unique conversatio

## Total statistics

In [8]:
total_files = set([c_file.split("_")[0] for c_file in c_files])

print(len(total_files), total_all)
print(total_all / len(total_files))

11261 581
0.05159399698072995


In [9]:
def get_code_lines_stats(c_files, unique_hashes=None, include_only_unique=False):
    if unique_hashes is None:
        unique_hashes = set()

    line_counts = []

    for c_file in c_files:
        convo_hash = c_file.split("_")[0]
    
        if include_only_unique:
            # Only process rows whose convo_hash is in unique_hashes
            if convo_hash not in unique_hashes:
                continue
        else:
            # Exclude rows whose convo_hash is in unique_hashes
            if convo_hash in unique_hashes:
                continue

        filename = f"files/c/codes/{c_file}"

        if not os.path.isfile(filename):
            print(f"File {filename} not found.")
            continue

        with open(filename, 'r', encoding='utf-8', errors='ignore') as code_file:
            lines = code_file.readlines()
            num_lines = len(lines)
            line_counts.append(num_lines)

    if not line_counts:
        return 0.0, 0.0

    avg = sum(line_counts) / len(line_counts)
    variance = sum((x - avg) ** 2 for x in line_counts) / len(line_counts)
    std_dev = math.sqrt(variance)

    return avg, std_dev

good_avg, good_std = get_code_lines_stats(c_files, unique_hashes)
print(f"Good results ||| Avg lines: {good_avg} | Std: {good_std} ")


bad_avg, bad_std = get_code_lines_stats(c_files, unique_hashes, include_only_unique=True)

print(f"Bad results ||| Avg lines: {bad_avg} | Std: {bad_std} ")


NameError: name 'unique_hashes' is not defined

## Save the results in a CSV file

In [None]:
with open("results/unsafe_memory_occurrence.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Rule", "Unique Hash Count", "Hashes"])
    for rule, data in results.items():
        writer.writerow([
            rule,
            len(data["hashes"]),
            ";".join(data["hashes"])
        ])