# 1. Generate Table 1, Code stats

### 1.1 We start by loading in the data from Kaggle

In [1]:
!pip install kagglehub[pandas-datasets]



In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pandas import DataFrame
from datasets import load_dataset as hf_load_dataset
from numpy import std
import re

In [3]:
df = hf_load_dataset("regularpooria/llm_generated_code_snippets")["train"]

unique_languages = {row["language"] for row in df}
unique_languages

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

{'ARM Assembly',
 'AppleScript',
 'C',
 'C#',
 'COBOL',
 'Erlang',
 'Fortran',
 'Go',
 'Java',
 'JavaScript',
 'Kotlin',
 'Lua',
 'Mathematica/Wolfram Language',
 'PHP',
 'Pascal',
 'Perl',
 'PowerShell',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'Scala',
 'Swift',
 'Visual Basic .NET',
 'jq'}

In [4]:
"""This is what a "data_point" looks like
{
    "conversation_hash": "string",
    "code_index": "number",
    "language": "string",
    "libraries": ["string"],
    "code": "string",
    "filename": "string"
}
"""

regex_dataset = {
    "c": r"^\s*(//|/\*|\*|\*/).*",
    "c#": r"^\s*(//|/\*|\*|\*/).*",
    "java": r"^\s*(//|/\*|\*|\*/).*",
    "javascript": r"^\s*(//|/\*|\*|\*/).*",
    "php": r"^\s*(#|//|/\*|\*|\*/).*",
    "python": r'^\s*#',
    "visual basic .net": [
        r"//.*",           # C# single-line
        r"/\*[\s\S]*?\*/", # C# block
        r"'.*",            # VB.NET single-quote
        r"REM .*"          # VB style REM
    ],

    "fortran": [
        r"^!.*",           # Modern free form
        r"^c.*",           # Fixed form comment (column 1 = C or c or *)
        r"^\*.*"
    ],

    "perl": [
        r"#.*",            # Perl single-line
        r"^=cut.*"         # POD comment close
    ],

    "ruby": [
        r"#.*",                  # Ruby single-line
        r"=begin[\s\S]*?=end"    # Ruby block comment
    ],

    "swift": [
        r"//.*",           # Swift single-line
        r"/\*[\s\S]*?\*/"  # Swift block
    ],

    "cobol": [
        r"^\s*\*.*"        # COBOL comment lines
    ],

    "erlang": [
        r"%.*"             # Erlang single-line
    ],

    "mathematica/wolfram language": [
        r"\(\*[\s\S]*?\*\)" # Mathematica/Wolfram block comment
    ],

    "rust": [
        r"//.*",           # Rust single-line
        r"/\*[\s\S]*?\*/"  # Rust block
    ],

    "arm assembly": [
        r";.*"             # ARM Assembly comment
    ],

    "go": [
        r"//.*",           # Go single-line
        r"/\*[\s\S]*?\*/"  # Go block
    ],

    "r": [
        r"#.*"             # R single-line
    ],

    "kotlin": [
        r"//.*",           # Kotlin single-line
        r"/\*[\s\S]*?\*/"  # Kotlin block
    ],

    "applescript": [
        r"--.*",           # AppleScript single-line
        r"\(\*[\s\S]*?\*\)" # AppleScript block
    ],

    "jq": [
        r"#.*"             # JQ single-line
    ],

    "scala": [
        r"//.*",           # Scala single-line
        r"/\*[\s\S]*?\*/"  # Scala block
    ],

    "powershell": [
        r"#.*"             # PowerShell single-line
    ],

    "pascal": [
        r"\{[\s\S]*?\}",   # Pascal block {}
        r"\(\*[\s\S]*?\*\)" # Pascal block (* *)
    ],

    "lua": [
        r"--.*",           # Lua single-line
        r"--\[\[[\s\S]*?\]\]" # Lua block
    ]
    
}

headers = [
    "Language",
    "Total Code",
    "Total Conv.",
    "Avg Blocks per Conv.",
    "Avg Lines per Block",
    "Stddev Lines per Block",
    "Avg Comments per Code",
]


    
row_format = "{:<10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}"

# Print header
print(row_format.format(*headers))
print("-" * 130)

for language in unique_languages:
    
    total_data_points = 0
    conversations = {}
    blocks = []
    comments_length = []

    for row in df:
        
        if row["language"].lower() != language.lower():
            continue
        
            
        total_data_points += 1
        conversation_hash = row["conversation_hash"]

        if conversation_hash in conversations:
            conversations[conversation_hash] += 1
        else:
            conversations[conversation_hash] = 1

        blocks.append(row["code"])
        if language.lower() not in regex_dataset:
            continue
        if isinstance(regex_dataset[language.lower()], list):
            length = 0
            for regex in regex_dataset[language.lower()]:
                length += len(re.findall(regex, row["code"], flags=re.MULTILINE))
            
            comments_length.append(length)
        else:
            comments_length.append(len(re.findall(regex_dataset[language.lower()], row["code"], re.MULTILINE)))

    avg_blocks_per_conv = sum(list(conversations.values())) / len(conversations)
    avg_lines_per_block = sum([len(block.split("\n")) for block in blocks]) / len(blocks)
    std_lines_per_block = std([len(block.split("\n")) for block in blocks])
    
    if language.lower() in regex_dataset:
        avg_lines_per_comment = sum(comments_length) / len(comments_length)
    else:
        avg_lines_per_comment = -1
    # Apply formatting: int with commas, floats with 2 decimals
    formatted_row = [
        f"{language}",  # Language
        f"&{len(blocks):,}&",  # Total Code
        f"{len(conversations):,}&",  # Total Conv.
        f"{avg_blocks_per_conv:.2f}&",  # Avg Blocks per Conv.
        f"{avg_lines_per_block:.2f}&",  # Avg Lines per Block
        f"{std_lines_per_block:.2f}&",  # Stddev Lines per Block
        f"{avg_lines_per_comment:.2f}",  # Avg Comments per Code
    ]
    print(row_format.format(*formatted_row))

Language   Total Code Total Conv. Avg Blocks per Conv. Avg Lines per Block Stddev Lines per Block Avg Comments per Code
----------------------------------------------------------------------------------------------------------------------------------
Kotlin        &1,559&       580&      2.69&     22.95&     20.45&       1.79
Pascal        &3,623&     2,325&      1.56&     17.01&     17.37&       0.72
Mathematica/Wolfram Language    &1,225&       783&      1.56&     15.80&     15.86&       1.20
Lua             &401&       288&      1.39&     12.34&      9.95&       2.05
Fortran         &679&       477&      1.42&     15.40&     17.83&       0.42
Go            &1,631&       768&      2.12&     35.49&     34.43&       2.89
COBOL         &1,378&     1,022&      1.35&     15.84&     20.00&       0.06
Java         &18,680&     7,228&      2.58&     27.43&     27.89&       1.94
Ruby          &1,507&     1,097&      1.37&     14.30&     16.34&       1.31
PowerShell    &6,375&     2,856&      