In [34]:
from datasets import load_dataset
import re
import os
import json

In [2]:
# Load the training split of the WildChat-1M conversation dataset from the
# Hugging Face Hub; later cells iterate over `ds` one conversation at a time.
ds = load_dataset(path="allenai/WildChat-1M", split="train")

In [35]:
def extract_code_snippets(text):
    """
    Extracts all fenced (triple-backtick) code blocks from Markdown text.

    Args:
        text: Markdown source to scan. None or an empty string yields [].

    Returns:
        A list of tuples (language, code), in order of appearance.
        Language is the tag written on the opening fence line (e.g.
        "python", "c++", "objective-c"), or None if not specified.
        Code is the block body with surrounding whitespace stripped;
        blocks that are empty after stripping are skipped.
        Handles consecutive code blocks correctly.
    """
    if not text:
        return []

    # Pattern breakdown:
    #   ```                          opening fence
    #   (?:([^\s`]+)[ \t]*\r?\n)?    optional language tag; it only counts as
    #                                a tag when it is the sole token on the
    #                                fence line, so code that starts right
    #                                after the fence (```x = 1) is not
    #                                mistaken for a language, and tags with
    #                                punctuation (c++, c#) are kept whole
    #   (.*?)                        block body, lazily up to the next fence
    #   ```                          closing fence (anywhere, not only at the
    #                                start of a line)
    pattern = r"```(?:([^\s`]+)[ \t]*\r?\n)?(.*?)```"

    blocks = []
    for lang, code in re.findall(pattern, text, re.DOTALL):
        # Surrounding newlines/indentation around the body are not significant.
        cleaned_code = code.strip()
        if cleaned_code:  # Only add non-empty code blocks
            # findall reports an unmatched optional group as "", map it to None.
            blocks.append((lang or None, cleaned_code))

    return blocks

In [33]:
# Smoke-test the extractor on a small message containing two fences: one
# tagged "python" whose closing fence sits on the same line as the code, and
# one untagged fence on its own lines.
sample_message = """Hey chatgpt, how are you doing?
```python
print(\"I'm doing just fine\")```

```
console.log(\"this is javascript\")
```
"""
extract_code_snippets(sample_message)

[('python', 'print("I\'m doing just fine")'),
 (None, 'console.log("this is javascript")')]

In [41]:
# Build one record per code block found in assistant messages, tagged with
# the hash of the conversation the message belongs to.
results = []
for example in ds:
    conversation_id = example["conversation_hash"]
    for message in example["conversation"]:
        # Only assistant turns are mined; every other role is skipped.
        if message["role"] != "assistant":
            continue
        results.extend(
            {"conversation_hash": conversation_id, "code": snippet.strip(), "language": tag}
            for tag, snippet in extract_code_snippets(message["content"])
        )

In [43]:
# Persist every collected snippet record to disk as pretty-printed JSON.
output_path = "tmp/code_snippets.json"
# Make sure the target directory ("tmp") exists before opening the file.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=4)

In [45]:
# Count how many extracted snippets carry each language tag. Snippets whose
# language is None are grouped under the "unknown" key, which is always
# present (starting at 0) even when every snippet is tagged.
languages = {"unknown": 0}
for result in results:
    lang_key = "unknown" if result["language"] is None else result["language"]
    # dict.get supplies 0 for a tag seen for the first time, so the first
    # occurrence is counted exactly once.
    languages[lang_key] = languages.get(lang_key, 0) + 1

In [None]:
# Show the language -> snippet-count mapping as this cell's output.
languages