In [34]:
from datasets import load_dataset
import re
import os
import json

In [2]:
# Load the dataset
ds = load_dataset("allenai/WildChat-1M", split="train")

In [35]:
def extract_code_snippets(text):
    """
    Extracts all code blocks from Markdown text.
    Returns a list of tuples: (language, code)
    Language is None if not specified.
    Handles consecutive code blocks correctly.
    """
    blocks = []
    
    # Updated regex pattern to handle various markdown formats:
    # ``` followed by optional language, then newline
    # Content until closing ``` at start of line or end of string
    pattern = r"```(\w+)?\n?(.*?)\n?```"
    
    matches = re.findall(pattern, text, re.DOTALL)
    
    for lang, code in matches:
        # Clean up the code content
        cleaned_code = code.strip()
        if cleaned_code:  # Only add non-empty code blocks
            blocks.append((lang if lang else None, cleaned_code))
    
    return blocks

In [33]:
extract_code_snippets("""Hey chatgpt, how are you doing?
```python
print(\"I'm doing just fine\")```

```
console.log(\"this is javascript\")
```
""")

[('python', 'print("I\'m doing just fine")'),
 (None, 'console.log("this is javascript")')]

In [41]:
results = []
for example in ds:
    conversation_id = example["conversation_hash"]
    for message in example["conversation"]:
        if message["role"] == "assistant":
            blocks = extract_code_snippets(message["content"])
            for (language, code) in blocks:
                results.append(
                    {"conversation_hash": conversation_id, "code": code.strip(), "language": language}
                )

In [43]:
os.makedirs("tmp", exist_ok=True)
with open("tmp/code_snippets.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

In [47]:
results[0]

{'conversation_hash': '80976bec07e850921fddb61e9469a62b',
 'code': '#include <iostream>\n#include <vector>\n#include <algorithm>\n#include <fstream>\n\nusing namespace std;\n\nstruct Data {\n    int id;\n    string name;\n    float value;\n\n    void display() const {\n        printf("%-5d %-20s %.2f\\n", id, name.c_str(), value);\n    }\n};\n\n// Сравнение двух структур по умолчанию\nbool compareDefault(const Data &a, const Data &b) {\n    return a.id < b.id;\n}\n\n// Сортировка QuickSort\nvoid quickSort(vector<Data> &data, int low, int high, bool (*compare)(const Data &, const Data &) = compareDefault) {\n    if (low < high) {\n        int i = low;\n        int j = high;\n        Data pivot = data[(i + j) / 2];\n\n        do {\n            while (compare(data[i], pivot)) i++;\n            while (compare(pivot, data[j])) j--;\n\n            if (i <= j) {\n                swap(data[i], data[j]);\n                i++;\n                j--;\n            }\n        } while (i <= j);\n\n  

In [50]:
conversation_hashes = set([r["conversation_hash"] for r in results])
len(conversation_hashes)

82843

In [45]:
languages = {
    "unknown": 0
}
for result in results:
    if result["language"] == None:
        languages["unknown"] += 1
    else:
        if result["language"] not in languages:
            languages[result["language"]] = 1
        languages[result["language"]] += 1

In [46]:
languages

{'unknown': 94669,
 'cpp': 17622,
 'plaintext': 1175,
 'python': 84746,
 'lua': 2445,
 'java': 17603,
 'csharp': 14341,
 'bash': 15927,
 'html': 9937,
 'c': 10191,
 'HTML': 47,
 'vba': 2228,
 'dart': 1893,
 'shell': 3561,
 'groovy': 480,
 'xml': 3432,
 'properties': 199,
 'sh': 5049,
 'js': 1206,
 'C': 649,
 'ruby': 365,
 'R': 2319,
 'css': 3075,
 'javascript': 22233,
 'kotlin': 2331,
 'assembly': 360,
 'sql': 5887,
 'rust': 1532,
 'php': 1839,
 'nasm': 7,
 'gdscript': 339,
 'py': 17,
 'json': 7957,
 'typescript': 2652,
 'verilog': 123,
 'MATLAB': 243,
 'vb': 592,
 'go': 1607,
 'python3': 5,
 'Python': 114,
 'powershell': 1191,
 'xaml': 213,
 'jsx': 2484,
 'vue': 562,
 'delphi': 196,
 'matlab': 3270,
 'swift': 453,
 'Dockerfile': 237,
 'yml': 33,
 'toml': 290,
 'yaml': 1478,
 'gradle': 175,
 'batch': 329,
 'asm': 77,
 'mathematica': 354,
 'ini': 227,
 'pascal': 142,
 'scala': 377,
 '符号': 2,
 '符号包裹的代码会被解析成代码块': 2,
 'dash': 3,
 'text': 63,
 'jsp': 154,
 'sudo': 124,
 'sqlite3': 3,
 'dock