In [17]:
import csv
from collections import defaultdict
import os
import math
from utils.extractor import walk_and_extract_cwe

lang_to_ext = {
    'c': 'c',
    'cpp': 'cpp',
    'python': 'py',
    'java': 'java',
    'javascript': 'js',
    'php': 'php',
    "csharp": "cs"
}

In [18]:
def count_conversations(file_path):
    # Structure: {language: {conversation_hash: count, 'total_unique': count, 'total_occurrences': count}}
    language_data = defaultdict(lambda: {'unique_conversations': set(), 'total_occurrences': 0})

    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        for row in csv_reader:
            conversation_hash = row['conversation_hash']
            language = row['language']

            # Track data per language
            lang_data = language_data[language]

            # Add to unique conversations if not already present
            if conversation_hash not in lang_data['unique_conversations']:
                lang_data['unique_conversations'].add(conversation_hash)

            # Increment total occurrences
            lang_data['total_occurrences'] += 1

    # Convert sets to counts and prepare final output
    result = {}
    for language, data in language_data.items():
        result[language] = {
            'unique_conversations': len(data['unique_conversations']),
            'total_occurrences': data['total_occurrences']
        }

    return result

In [19]:
file_path = 'codegrep_results_hash.csv'
language_conversations = count_conversations(file_path)
print(language_conversations)
unique_convo_hash_good = 0
# Print results
print("Unique conversations and total occurrences per language:")
for language, data in language_conversations.items():
    print(f"Language: {language}")
    unique_convo_hash_good += data['unique_conversations']
    print(f"  Unique conversations: {data['unique_conversations']}")
    print(f"  Total occurrences: {data['total_occurrences']}")

{'c': {'unique_conversations': 16, 'total_occurrences': 56}, 'csharp': {'unique_conversations': 24, 'total_occurrences': 56}, 'java': {'unique_conversations': 32, 'total_occurrences': 93}, 'javascript': {'unique_conversations': 22, 'total_occurrences': 72}, 'php': {'unique_conversations': 3, 'total_occurrences': 7}, 'python': {'unique_conversations': 180, 'total_occurrences': 577}}
Unique conversations and total occurrences per language:
Language: c
  Unique conversations: 16
  Total occurrences: 56
Language: csharp
  Unique conversations: 24
  Total occurrences: 56
Language: java
  Unique conversations: 32
  Total occurrences: 93
Language: javascript
  Unique conversations: 22
  Total occurrences: 72
Language: php
  Unique conversations: 3
  Total occurrences: 7
Language: python
  Unique conversations: 180
  Total occurrences: 577


In [20]:
allowed_rules = {
    "java": [
        "use-of-md5",
        "use-of-weak-rsa-key",
        "use-of-sha1",
        "use-of-rc4",
        "use-of-rc2",
        "use-of-md5-digest-utils",
        "use-of-default-aes",
        "use-of-aes-ecb",
        "use-of-blowfish",
        "rsa-no-padding",
        "no-null-cipher",
        "gcm-nonce-reuse",
        "gcm-detection",
        "ecb-cipher",
        "desede-is-deprecated",
        "des-is-deprecated",
    ],
    "csharp": [
        "use_weak_rsa_encryption_padding",
        "use_deprecated_cipher_algorithm",
        "X509Certificate2-privkey"
    ],
    "javascript": [
        "aead-no-final",
        "create-de-cipher-no-iv",
        "gcm-no-tag-length",
        "md5-used-as-password",
    ],
    "php": [
        "weak-crypto",
        "md5-used-as-password",
        "md5-loose-equality",
        "mcrypt-use",
        "openssl-decrypt-validate",
    ],
    "python": [
        "crypto-mode-without-authentication",
        "insufficient-rsa-key-size",
        "insufficient-dsa-key-size",
        "insecure-hash-algorithm-sha1",
        "insecure-hash-algorithm-md5",
        "insecure-hash-algorithm-md4",
        "insecure-hash-algorithm-md2",
        "insecure-cipher-algorithm-xor",
        "insecure-cipher-algorithm-rc4",
        "insecure-cipher-algorithm-rc2",
        "insecure-cipher-algorithm-des",
        "insecure-cipher-algorithm-blowfish",
        "insecure-hash-function",
        "insecure-hash-algorithm-sha1",
        "md5-used-as-password",
        "hashids-with-django-secret",
        "crypto-mode-without-authentication",
        "insufficient-ec-key-size",
        "insecure-cipher-mode-ecb",
        "insecure-cipher-algorithm-idea",
        "insecure-cipher-algorithm-arc4",
        "empty-aes-key",
    ]
}

In [21]:
for key, value in allowed_rules.items():
    print(key, len(value))

java 16
csharp 3
javascript 4
php 5
python 22


In [22]:
# Flatten allowed rules into a list
allowed_rules_list = [rule for sublist in allowed_rules.values() for rule in sublist]

# Structure: language -> rule -> {count, hashes}
language_rule_results = defaultdict(lambda: defaultdict(lambda: {"count": 0, "hashes": set()}))

with open('codegrep_results.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    
    for row in csv_reader:
        error_id = row['error_id'].split('.')[-1]  # Extract last part of error_id
        if error_id in allowed_rules_list:
            language = row['language']  # Make sure this column exists in your CSV
            conversation_hash = row['conversation_hash']
            
            # Update counts for language and rule
            language_rule_results[language][error_id]["count"] += 1
            language_rule_results[language][error_id]["hashes"].add(conversation_hash)

total_all = 0
print("Aggregated results by language and rule:")
for language, rules in language_rule_results.items():
    total_unique_hashes = set()
    total_count = 0
    print(f"Language: {language}")
    rule_id_to_cwes = walk_and_extract_cwe(rules)
    
    for rule, data in rules.items():
        unique_hash_count = len(data["hashes"])
        count = data["count"]
        total_unique_hashes.update(data["hashes"])
        total_count += count
        cwes = rule_id_to_cwes.get(rule, [])
        cwe_str = ", ".join(cwes) if cwes else "N/A"
        print(f"  Rule: {rule}")
        print(f"    CWE(s): {cwe_str}")
        print(f"    Unique conversation hashes: {unique_hash_count}")
        print(f"    Total occurrences: {count}")
        
    num_all_hashes = language_conversations.get(language, {}).get('unique_conversations', 0)
    num_bad_hashes = len(total_unique_hashes)    
    percentage_bad = (num_bad_hashes / num_all_hashes * 100) if num_all_hashes > 0 else 0
    total_all += len(total_unique_hashes)
    print(f"  Overall unique conversation hashes (language): {len(total_unique_hashes)}")
    print(f"  Overall total occurrences (language): {total_count}")
    print(f"  Percentage of unique conversations that are wrong: {percentage_bad:.2f}%\n")
    print()

Aggregated results by language and rule:
Language: java


  Rule: use-of-md5
    CWE(s): CWE-328: Use of Weak Hash
    Unique conversation hashes: 8
    Total occurrences: 9
  Rule: desede-is-deprecated
    CWE(s): CWE-326: Inadequate Encryption Strength
    Unique conversation hashes: 1
    Total occurrences: 1
  Rule: des-is-deprecated
    CWE(s): CWE-326: Inadequate Encryption Strength
    Unique conversation hashes: 1
    Total occurrences: 2
  Rule: ecb-cipher
    CWE(s): CWE-327: Use of a Broken or Risky Cryptographic Algorithm
    Unique conversation hashes: 2
    Total occurrences: 4
  Rule: use-of-aes-ecb
    CWE(s): CWE-327: Use of a Broken or Risky Cryptographic Algorithm
    Unique conversation hashes: 2
    Total occurrences: 4
  Rule: use-of-default-aes
    CWE(s): CWE-327: Use of a Broken or Risky Cryptographic Algorithm
    Unique conversation hashes: 1
    Total occurrences: 2
  Overall unique conversation hashes (language): 12
  Overall total occurrences (language): 22
  Percentage of unique conversations that are wrong: 37.

## Total statistics

In [23]:
print(unique_convo_hash_good, total_all)
print(total_all / unique_convo_hash_good)

277 60
0.21660649819494585


In [24]:
def get_code_lines_stats(csv_path, unique_hashes=None, include_only_unique=False):
    if unique_hashes is None:
        unique_hashes = set()

    line_counts = []

    with open(csv_path, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            convo_hash = row['conversation_hash']

            if include_only_unique:
                # Only process rows whose convo_hash is in unique_hashes
                if convo_hash not in unique_hashes:
                    continue
            else:
                # Exclude rows whose convo_hash is in unique_hashes
                if convo_hash in unique_hashes:
                    continue

            code_index = row['code_index']
            language = row['language'].lower()

            ext = lang_to_ext.get(language)
            if not ext:
                print(f"Unknown language '{language}' for conversation_hash={convo_hash}")
                continue

            filename = f"files/{language}/codes/{convo_hash}_{code_index}.{ext}"

            if not os.path.isfile(filename):
                print(f"File {filename} not found.")
                continue

            with open(filename, 'r', encoding='utf-8', errors='ignore') as code_file:
                lines = code_file.readlines()
                num_lines = len(lines)
                line_counts.append(num_lines)

    if not line_counts:
        return 0.0, 0.0

    avg = sum(line_counts) / len(line_counts)
    variance = sum((x - avg) ** 2 for x in line_counts) / len(line_counts)
    std_dev = math.sqrt(variance)

    return avg, std_dev

good_avg, good_std = get_code_lines_stats(file_path, unique_hashes)
print(f"Good results ||| Avg lines: {good_avg} | Std: {good_std} ")


bad_avg, bad_std = get_code_lines_stats(file_path, unique_hashes, include_only_unique=True)

print(f"Bad results ||| Avg lines: {bad_avg} | Std: {bad_std} ")


NameError: name 'unique_hashes' is not defined

## Save the results in a CSV file

In [None]:
with open("results/hash_occurrences.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Rule", "Unique Hash Count", "Hashes"])
    for rule, data in results.items():
        writer.writerow([
            rule,
            len(data["hashes"]),
            ";".join(data["hashes"])
        ])