In [1]:
import csv
from collections import defaultdict

In [21]:
def count_conversations(file_path):
    # Structure: {language: {conversation_hash: count, 'total_unique': count, 'total_occurrences': count}}
    language_data = defaultdict(lambda: {'unique_conversations': set(), 'total_occurrences': 0})

    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        for row in csv_reader:
            conversation_hash = row['conversation_hash']
            language = row['language']

            # Track data per language
            lang_data = language_data[language]

            # Add to unique conversations if not already present
            if conversation_hash not in lang_data['unique_conversations']:
                lang_data['unique_conversations'].add(conversation_hash)

            # Increment total occurrences
            lang_data['total_occurrences'] += 1

    # Convert sets to counts and prepare final output
    result = {}
    for language, data in language_data.items():
        result[language] = {
            'unique_conversations': len(data['unique_conversations']),
            'total_occurrences': data['total_occurrences']
        }

    return result

In [22]:
file_path = 'codegrep_results_hash.csv'
language_conversations = count_conversations(file_path)
print(language_conversations)
unique_convo_hash_good = 0
# Print results
print("Unique conversations and total occurrences per language:")
for language, data in language_conversations.items():
    print(f"Language: {language}")
    unique_convo_hash_good += data['unique_conversations']
    print(f"  Unique conversations: {data['unique_conversations']}")
    print(f"  Total occurrences: {data['total_occurrences']}")

Unique conversations and total occurrences per language:
Language: c
  Unique conversations: 15
  Total occurrences: 104
Language: csharp
  Unique conversations: 17
  Total occurrences: 88
Language: java
  Unique conversations: 25
  Total occurrences: 154
Language: javascript
  Unique conversations: 20
  Total occurrences: 140
Language: php
  Unique conversations: 20
  Total occurrences: 154
Language: python
  Unique conversations: 167
  Total occurrences: 1070


In [11]:
allowed_rules = {
    "java": [
        "use-of-md5",
        "use-of-weak-rsa-key",
        "use-of-sha1",
        "use-of-rc4",
        "use-of-rc2",
        "use-of-md5-digest-utils",
        "use-of-default-aes",
        "use-of-aes-ecb",
        "use-of-blowfish",
        "rsa-no-padding",
        "no-null-cipher",
        "gcm-nonce-reuse",
        "gcm-detection",
        "ecb-cipher",
        "desede-is-deprecated",
        "des-is-deprecated",
    ],
    "csharp": [
        "use_weak_rsa_encryption_padding",
        "use_deprecated_cipher_algorithm",
        "X509Certificate2-privkey"
    ],
    "javascript": [
        "aead-no-final",
        "create-de-cipher-no-iv",
        "gcm-no-tag-length",
        "md5-used-as-password",
    ],
    "php": [
        "weak-crypto",
        "md5-used-as-password",
        "md5-loose-equality",
        "mcrypt-use",
        "openssl-decrypt-validate",
    ],
    "python": [
        "crypto-mode-without-authentication",
        "insufficient-rsa-key-size",
        "insufficient-dsa-key-size",
        "insecure-hash-algorithm-sha1",
        "insecure-hash-algorithm-md5",
        "insecure-hash-algorithm-md4",
        "insecure-hash-algorithm-md2",
        "insecure-cipher-algorithm-xor",
        "insecure-cipher-algorithm-rc4",
        "insecure-cipher-algorithm-rc2",
        "insecure-cipher-algorithm-des",
        "insecure-cipher-algorithm-blowfish",
        "insecure-hash-function",
        "insecure-hash-algorithm-sha1",
        "md5-used-as-password",
        "hashids-with-django-secret",
        "crypto-mode-without-authentication",
        "insufficient-ec-key-size",
        "insecure-cipher-mode-ecb",
        "insecure-cipher-algorithm-idea",
        "insecure-cipher-algorithm-arc4",
        "empty-aes-key",
    ]
}

In [20]:
for key, value in allowed_rules.items():
    print(key, len(value))

java 16
csharp 3
javascript 4
php 5
python 22


In [15]:
# Flatten the allowed rules into a single list
allowed_rules_list = [rule for sublist in allowed_rules.values() for rule in sublist]
results = defaultdict(lambda: {"count": 0, "hashes": set()})
# Read the CSV file
with open('codegrep_results.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    for row in csv_reader:
        error_id = row['error_id'].split('.')[-1]  # Extract the last part of the error_id
        if error_id in allowed_rules_list:
            conversation_hash = row['conversation_hash']
            if error_id in results:
                # Update the count and add the hash to the set
                results[error_id]["count"] += 1
                results[error_id]["hashes"].add(conversation_hash)
            else:
                # Initialize a new entry
                results[error_id] = {"count": 1, "hashes": {conversation_hash}}         
            
print("Rules and their Occurrences (unique conversation hashes and total occurrences):")
for rule in results:
    count = results[rule]["count"]
    unique_hashes = results[rule]["hashes"]
    print(f"Rule: {rule}, Unique Conversation Hashes Count: {len(unique_hashes)}, Picture of hashes: {unique_hashes}")



Rules and their Occurrences (unique conversation hashes and total occurrences):
Rule: use-of-md5, Unique Conversation Hashes Count: 7, Picture of hashes: {'b7392d11fd4fb15e24e085a10c7757f4', 'aa29876ac06b2ddbd22210da34e3936c', '0b7e60ee78f56ea4ea40a586da334163', '5d88b53049b7f7b39f5ed4f2bb932987', '66697ed0c18248780d802e923e275b4e', 'b3a866b6f359d675053b9d75f5aeadea', '3df0dab6da4d5d90b2df561804993bee'}
Rule: desede-is-deprecated, Unique Conversation Hashes Count: 1, Picture of hashes: {'1fe2948bd6633ee13cd8966bd875bb1d'}
Rule: des-is-deprecated, Unique Conversation Hashes Count: 1, Picture of hashes: {'1fe2948bd6633ee13cd8966bd875bb1d'}
Rule: ecb-cipher, Unique Conversation Hashes Count: 1, Picture of hashes: {'5aca47b467de285189f8dcd13033974f'}
Rule: use-of-aes-ecb, Unique Conversation Hashes Count: 1, Picture of hashes: {'5aca47b467de285189f8dcd13033974f'}
Rule: use-of-default-aes, Unique Conversation Hashes Count: 1, Picture of hashes: {'908efbc83f60e0b3798e4ab8fb75cc3a'}
Rule: wea

## Total statistics

In [17]:
unique_convo_hash_bad = 0
for rule in results:
    count = results[rule]["count"]
    unique_hashes = results[rule]["hashes"]
    unique_convo_hash_bad += len(unique_hashes)


In [19]:
print(unique_convo_hash_good, unique_convo_hash_bad)
print(unique_convo_hash_bad / unique_convo_hash_good)

264 59
0.22348484848484848
