In [9]:
import csv
from collections import defaultdict
import os
import math
from utils.extractor import walk_and_extract_cwe

lang_to_ext = {
    'c': 'c',
    'cpp': 'cpp',
    'python': 'py',
    'java': 'java',
    'javascript': 'js',
    'php': 'php',
    "csharp": "cs"
}

In [10]:
def count_conversations(file_path):
    # Structure: {language: {conversation_hash: count, 'total_unique': count, 'total_occurrences': count}}
    language_data = defaultdict(lambda: {'unique_conversations': set(), 'total_occurrences': 0})

    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        for row in csv_reader:
            conversation_hash = row['conversation_hash']
            language = row['language']

            # Track data per language
            lang_data = language_data[language]

            # Add to unique conversations if not already present
            if conversation_hash not in lang_data['unique_conversations']:
                lang_data['unique_conversations'].add(conversation_hash)

            # Increment total occurrences
            lang_data['total_occurrences'] += 1

    # Convert sets to counts and prepare final output
    result = {}
    for language, data in language_data.items():
        result[language] = {
            'unique_conversations': len(data['unique_conversations']),
            'total_occurrences': data['total_occurrences']
        }

    return result

In [11]:
file_path = 'codegrep_results_random.csv'
language_conversations = count_conversations(file_path)
print(language_conversations)
unique_convo_hash_good = 0
# Print results
print("Unique conversations and total occurrences per language:")
for language, data in language_conversations.items():
    print(f"Language: {language}")
    unique_convo_hash_good += data['unique_conversations']
    print(f"  Unique conversations: {data['unique_conversations']}")
    print(f"  Total occurrences: {data['total_occurrences']}")

{'c': {'unique_conversations': 203, 'total_occurrences': 1148}, 'csharp': {'unique_conversations': 91, 'total_occurrences': 133}, 'java': {'unique_conversations': 150, 'total_occurrences': 232}, 'javascript': {'unique_conversations': 352, 'total_occurrences': 987}, 'python': {'unique_conversations': 2810, 'total_occurrences': 12791}}
Unique conversations and total occurrences per language:
Language: c
  Unique conversations: 203
  Total occurrences: 1148
Language: csharp
  Unique conversations: 91
  Total occurrences: 133
Language: java
  Unique conversations: 150
  Total occurrences: 232
Language: javascript
  Unique conversations: 352
  Total occurrences: 987
Language: python
  Unique conversations: 2810
  Total occurrences: 12791


In [12]:
allowed_rules = {
    "java": [
        "weak-random"
    ],
    "csharp": [
        "use_weak_rng_for_keygeneration",
    ],
    "javascript": [
        "JS_WEAK_RNG",
    ],
    "python": [
        "PYTHON_WEAK_RNG",
        "PYTHON_WEAK_RNG_UNQUALIFIED",
        "PYTHON_WEAK_RNG_WRAPPER"
    ]
}

In [13]:
for key, value in allowed_rules.items():
    print(key, len(value))

java 1
csharp 1
javascript 1
python 3


In [14]:
# Flatten allowed rules into a list
allowed_rules_list = [rule for sublist in allowed_rules.values() for rule in sublist]

# Structure: language -> rule -> {count, hashes}
language_rule_results = defaultdict(lambda: defaultdict(lambda: {"count": 0, "hashes": set()}))

with open('codegrep_results.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    
    for row in csv_reader:
        error_id = row['error_id'].split('.')[-1]  # Extract last part of error_id
        if error_id in allowed_rules_list:
            language = row['language']  # Make sure this column exists in your CSV
            conversation_hash = row['conversation_hash']
            
            # Update counts for language and rule
            language_rule_results[language][error_id]["count"] += 1
            language_rule_results[language][error_id]["hashes"].add(conversation_hash)
total_all = 0
print("Aggregated results by language and rule:")
for language, rules in language_rule_results.items():
    total_unique_hashes = set()
    total_count = 0
    print(f"Language: {language}")
    rule_id_to_cwes = walk_and_extract_cwe(rules)
    for rule, data in rules.items():
        unique_hash_count = len(data["hashes"])
        count = data["count"]
        total_unique_hashes.update(data["hashes"])
        total_count += count
        cwes = rule_id_to_cwes.get(rule, [])
        cwe_str = ", ".join(cwes) if cwes else "N/A"
        print(f"  Rule: {rule}")
        print(f"    CWE(s): {cwe_str}")
        print(f"    Unique conversation hashes: {unique_hash_count}")
        print(f"    Total occurrences: {count}")
        
    num_all_hashes = language_conversations.get(language, {}).get('unique_conversations', 0)
    num_bad_hashes = len(total_unique_hashes)    
    percentage_bad = (num_bad_hashes / num_all_hashes * 100) if num_all_hashes > 0 else 0
    total_all += len(total_unique_hashes)
    print(f"  Overall unique conversation hashes (language): {len(total_unique_hashes)}")
    print(f"  Overall total occurrences (language): {total_count}")
    print(f"  Percentage of unique conversations that are wrong: {percentage_bad:.2f}%\n")
    print()

Aggregated results by language and rule:
Language: java
  Rule: weak-random
    CWE(s): CWE-330: Use of Insufficiently Random Values
    Unique conversation hashes: 15
    Total occurrences: 29
  Overall unique conversation hashes (language): 15
  Overall total occurrences (language): 29
  Percentage of unique conversations that are wrong: 10.00%


Language: python
  Rule: PYTHON_WEAK_RNG_UNQUALIFIED
    CWE(s): N/A
    Unique conversation hashes: 2
    Total occurrences: 2
  Overall unique conversation hashes (language): 2
  Overall total occurrences (language): 2
  Percentage of unique conversations that are wrong: 0.07%




## Total statistics

In [15]:
print(unique_convo_hash_good, total_all)
print(total_all / unique_convo_hash_good)

3606 17
0.004714364947310039


In [16]:
def get_code_lines_stats(csv_path, unique_hashes=None, include_only_unique=False):
    if unique_hashes is None:
        unique_hashes = set()

    line_counts = []

    with open(csv_path, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            convo_hash = row['conversation_hash']

            if include_only_unique:
                # Only process rows whose convo_hash is in unique_hashes
                if convo_hash not in unique_hashes:
                    continue
            else:
                # Exclude rows whose convo_hash is in unique_hashes
                if convo_hash in unique_hashes:
                    continue

            code_index = row['code_index']
            language = row['language'].lower()

            ext = lang_to_ext.get(language)
            if not ext:
                print(f"Unknown language '{language}' for conversation_hash={convo_hash}")
                continue

            filename = f"files/{language}/codes/{convo_hash}_{code_index}.{ext}"

            if not os.path.isfile(filename):
                print(f"File {filename} not found.")
                continue

            with open(filename, 'r', encoding='utf-8', errors='ignore') as code_file:
                lines = code_file.readlines()
                num_lines = len(lines)
                line_counts.append(num_lines)

    if not line_counts:
        return 0.0, 0.0

    avg = sum(line_counts) / len(line_counts)
    variance = sum((x - avg) ** 2 for x in line_counts) / len(line_counts)
    std_dev = math.sqrt(variance)

    return avg, std_dev

good_avg, good_std = get_code_lines_stats(file_path, unique_hashes)
print(f"Good results ||| Avg lines: {good_avg} | Std: {good_std} ")


bad_avg, bad_std = get_code_lines_stats(file_path, unique_hashes, include_only_unique=True)
print(f"Bad results ||| Avg lines: {bad_avg} | Std: {bad_std} ")


NameError: name 'unique_hashes' is not defined

## Save the results in a CSV file

In [None]:
with open("results/weak_random_occurrences.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Rule", "Unique Hash Count", "Hashes"])
    for rule, data in results.items():
        writer.writerow([
            rule,
            len(data["hashes"]),
            ";".join(data["hashes"])
        ])