In [1]:
from datasets import load_dataset as hf_load_dataset
from numpy import std
import re
import os

In [2]:
# Generate stats about the rules
languages = [
    "c",
    "csharp",
    "java",
    "javascript",
    "php",
    "python",
]
def find_security_yaml_rules(src_root: str):
    """
    Walk src_root, find all .yaml files under any 'security' folder
    """
    output = []
    for root, dirs, files in os.walk(src_root):
        # only consider paths that have 'security' in their hierarchy
        if 'security' in root.split(os.sep):
            for file in files:
                if file.endswith('.yaml'):
                    # compute relative path under src_root
                    src_file = os.path.join(root, file)
                    output.append(src_file)
                    
    return output

security_rules = {}
for language in languages:
    if not os.path.exists(f"opengrep-rules/{language}"):
        continue
    files = find_security_yaml_rules(f"opengrep-rules/{language}")
    security_rules[language] = len(files)
    
print(security_rules)

{'c': 12, 'csharp': 48, 'java': 117, 'javascript': 163, 'php': 62, 'python': 246}


In [3]:
df = hf_load_dataset("regularpooria/llm_generated_code_snippets")["train"]

unique_languages = {row["language"] for row in df}
unique_languages

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/25 [00:00<?, ?files/s]

php.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/153345 [00:00<?, ? examples/s]

{'ARM Assembly',
 'AppleScript',
 'C',
 'C#',
 'COBOL',
 'Erlang',
 'Fortran',
 'Go',
 'Java',
 'JavaScript',
 'Kotlin',
 'Lua',
 'Mathematica/Wolfram Language',
 'PHP',
 'Pascal',
 'Perl',
 'PowerShell',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'Scala',
 'Swift',
 'Visual Basic .NET',
 'jq'}

In [4]:
"""This is what a "data_point" looks like
{
    "conversation_hash": "string",
    "code_index": "number",
    "language": "string",
    "libraries": ["string"],
    "code": "string",
    "filename": "string"
}
"""

regex_dataset = {
    "c": r"^\s*(//|/\*|\*|\*/).*",
    "c#": r"^\s*(//|/\*|\*|\*/).*",
    "java": r"^\s*(//|/\*|\*|\*/).*",
    "javascript": r"^\s*(//|/\*|\*|\*/).*",
    "php": r"^\s*(#|//|/\*|\*|\*/).*",
    "python": r'^\s*#',
    "visual basic .net": [
        r"//.*",           # C# single-line
        r"/\*[\s\S]*?\*/", # C# block
        r"'.*",            # VB.NET single-quote
        r"REM .*"          # VB style REM
    ],

    "fortran": [
        r"^!.*",           # Modern free form
        r"^c.*",           # Fixed form comment (column 1 = C or c or *)
        r"^\*.*"
    ],

    "perl": [
        r"#.*",            # Perl single-line
        r"^=cut.*"         # POD comment close
    ],

    "ruby": [
        r"#.*",                  # Ruby single-line
        r"=begin[\s\S]*?=end"    # Ruby block comment
    ],

    "swift": [
        r"//.*",           # Swift single-line
        r"/\*[\s\S]*?\*/"  # Swift block
    ],

    "cobol": [
        r"^\s*\*.*"        # COBOL comment lines
    ],

    "erlang": [
        r"%.*"             # Erlang single-line
    ],

    "mathematica/wolfram language": [
        r"\(\*[\s\S]*?\*\)" # Mathematica/Wolfram block comment
    ],

    "rust": [
        r"//.*",           # Rust single-line
        r"/\*[\s\S]*?\*/"  # Rust block
    ],

    "arm assembly": [
        r";.*"             # ARM Assembly comment
    ],

    "go": [
        r"//.*",           # Go single-line
        r"/\*[\s\S]*?\*/"  # Go block
    ],

    "r": [
        r"#.*"             # R single-line
    ],

    "kotlin": [
        r"//.*",           # Kotlin single-line
        r"/\*[\s\S]*?\*/"  # Kotlin block
    ],

    "applescript": [
        r"--.*",           # AppleScript single-line
        r"\(\*[\s\S]*?\*\)" # AppleScript block
    ],

    "jq": [
        r"#.*"             # JQ single-line
    ],

    "scala": [
        r"//.*",           # Scala single-line
        r"/\*[\s\S]*?\*/"  # Scala block
    ],

    "powershell": [
        r"#.*"             # PowerShell single-line
    ],

    "pascal": [
        r"\{[\s\S]*?\}",   # Pascal block {}
        r"\(\*[\s\S]*?\*\)" # Pascal block (* *)
    ],

    "lua": [
        r"--.*",           # Lua single-line
        r"--\[\[[\s\S]*?\]\]" # Lua block
    ]
    
}

headers = [
    "Language",
    "Total Code",
    "Total Conv.",
    "Avg Blocks per Conv.",
    "Avg Lines per Block",
    "Stddev Lines per Block",
    "Avg Comments per Code",
]


    
row_format = "{:<10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}"

# Print header
print(row_format.format(*headers))
print("-" * 130)

for language in unique_languages:
    
    total_data_points = 0
    conversations = {}
    blocks = []
    comments_length = []

    for row in df:
        
        if row["language"].lower() != language.lower():
            continue
        
            
        total_data_points += 1
        conversation_hash = row["conversation_hash"]

        if conversation_hash in conversations:
            conversations[conversation_hash] += 1
        else:
            conversations[conversation_hash] = 1

        blocks.append(row["code"])
        if language.lower() not in regex_dataset:
            continue
        if isinstance(regex_dataset[language.lower()], list):
            length = 0
            for regex in regex_dataset[language.lower()]:
                length += len(re.findall(regex, row["code"], flags=re.MULTILINE))
            
            comments_length.append(length)
        else:
            comments_length.append(len(re.findall(regex_dataset[language.lower()], row["code"], re.MULTILINE)))

    avg_blocks_per_conv = sum(list(conversations.values())) / len(conversations)
    avg_lines_per_block = sum([len(block.split("\n")) for block in blocks]) / len(blocks)
    std_lines_per_block = std([len(block.split("\n")) for block in blocks])
    
    if language.lower() in regex_dataset:
        avg_lines_per_comment = sum(comments_length) / len(comments_length)
    else:
        avg_lines_per_comment = -1
    # Apply formatting: int with commas, floats with 2 decimals
    formatted_row = [
        f"{language}",  # Language
        f"&{len(blocks):,}&",  # Total Code
        f"{len(conversations):,}&",  # Total Conv.
        f"{avg_blocks_per_conv:.2f}&",  # Avg Blocks per Conv.
        f"{avg_lines_per_block:.2f}&",  # Avg Lines per Block
        f"{std_lines_per_block:.2f}&",  # Stddev Lines per Block
        f"{avg_lines_per_comment:.2f}",  # Avg Comments per Code
    ]
    print(row_format.format(*formatted_row))

Language   Total Code Total Conv. Avg Blocks per Conv. Avg Lines per Block Stddev Lines per Block Avg Comments per Code
----------------------------------------------------------------------------------------------------------------------------------
Rust          &1,919&     1,001&      1.92&     19.15&     17.46&       1.78
Scala         &1,692&     1,111&      1.52&     13.33&     11.87&       0.94
PHP             &449&       340&      1.32&     20.60&     13.42&       3.05
ARM Assembly    &1,174&       804&      1.46&     22.91&     28.68&       2.74
JavaScript   &15,943&     7,217&      2.21&     23.39&     25.15&       2.23
PowerShell    &6,375&     2,856&      2.23&     16.68&     10.47&       0.49
jq            &2,805&     1,146&      2.45&     14.23&      6.46&       0.06
Lua             &401&       288&      1.39&     12.34&      9.95&       2.05
Swift           &819&       552&      1.48&     14.77&     12.82&       1.11
C#           &14,138&     5,895&      2.40&     28.39&