In [114]:
from datasets import load_dataset

# The dataset lives on the Hugging Face Hub; authenticate first
# (e.g. `huggingface-cli login`) if access is denied.
DATASET_ID = "regularpooria/llm_generated_code_snippets"
ds = load_dataset(DATASET_ID)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

In [115]:
import os
import json
import re
from typing import List

In [116]:
# Verbose-mode regex (must be used with re.VERBOSE) that recognises the module
# specifier in four JavaScript/TypeScript import forms. Every alternative has
# exactly one capture group, so re.findall yields 4-tuples in which at most one
# entry is non-empty per match.
#
# `$` is a legal identifier character in JavaScript, so it is accepted in the
# import bindings; without it `import $ from "jquery"` would be skipped.
pattern = r"""
    # require("...")
    require\s*\(\s*["'`](.*?)["'`]\s*\)
  | # import ... from "..."   (also bare side-effect imports: import "...")
    import\s+(?:type\s+)?(?:[\w$*{}\s,]*\s+from\s+)?["'`](.*?)["'`]
  | # dynamic import("...")
    import\s*\(\s*["'`](.*?)["'`]\s*\)
  | # import fs = require("fs")
    import\s+[\w$]+\s*=\s*require\(\s*["'`](.*?)["'`]\s*\)
"""

In [117]:
def normalize_package(name: str) -> str:
    """Reduce a module specifier to its package name by dropping sub-paths.

    Scoped specifiers (starting with "@") keep their first two "/"-separated
    segments ("@scope/pkg/sub" -> "@scope/pkg"); every other specifier keeps
    only the first segment ("pkg/sub" -> "pkg"). A specifier with fewer
    segments than that is returned unchanged.
    """
    segments = name.split("/")
    # A scoped package name spans two segments, an unscoped one a single segment.
    kept = 2 if name.startswith("@") else 1
    return "/".join(segments[:kept])
def remove_colon(name: str) -> str:
    """Return `name` without a single trailing ":", or unchanged if it has none.

    Only one colon is stripped ("a::" -> "a:").
    """
    # NOTE(review): presumably tidies scheme-like segments such as "https:" - confirm.
    return name[:-1] if name.endswith(":") else name
def parse_imports(code: str) -> List[str]:
    """Extract the non-local packages imported by a JavaScript/TypeScript snippet.

    Args:
        code: Source text to scan with the module-level `pattern` regex.

    Returns:
        Package names in order of appearance, duplicates retained. Each name is
        passed through `normalize_package` and then `remove_colon`. Local
        specifiers are dropped: the bare "." and "..", and anything starting
        with "./", "../", "/" or "@/".
    """
    matches = re.findall(pattern, code, re.VERBOSE)
    # Each match is a tuple with one slot per alternative; keep the non-empty slot(s).
    modules = [m for group in matches for m in group if m]

    # "@/" is treated as local (NOTE(review): presumably a project-root alias - confirm).
    local_prefixes = ("./", "../", "/", "@/")
    # "." and ".." on their own also refer to local directories (e.g. require("..")),
    # so they must not be reported as third-party packages.
    local_exact = (".", "..")

    return [
        remove_colon(normalize_package(m))
        for m in modules
        if m not in local_exact and not m.startswith(local_prefixes)
    ]

In [118]:
# Smoke test for parse_imports: one sample line per supported import form.
# The relative specifiers ("./utils.js", "./dynamicModule.js", "./types") should be
# dropped and sub-paths collapsed ("dotenv/config" -> "dotenv", "firebase/app" -> "firebase").
code = """
const express = require("express");
import fs from "fs";
import { join } from "path";
import * as utils from "./utils.js";
import "dotenv/config";
await import("./dynamicModule.js");
import type { User } from "./types";
import fs = require("fs");
import firebase from "firebase/app";
"""
print(parse_imports(code))


['express', 'fs', 'path', 'dotenv', 'fs', 'firebase']


In [119]:
# Keep only the JavaScript snippets from the train split, then attach to each
# row the list of libraries found in its code.
javascript_rows = [row for row in ds["train"] if row["language"] == "JavaScript"]
for row in javascript_rows:
    row["libraries"] = parse_imports(row["code"])

In [120]:
# Inspect the first JavaScript row to confirm the "libraries" field was attached.
javascript_rows[0]

{'conversation_hash': 'f70f171081d6ecf4b3863b0fbb7577c3',
 'code_index': 1,
 'language': 'JavaScript',
 'libraries': ['express', 'firebase-admin', 'body-parser', 'cors'],
 'code': "const express = require('express');\nconst admin = require('firebase-admin');\nconst bodyParser = require('body-parser');\nconst cors = require('cors');\n\nconst app = express();\napp.use(cors());\napp.use(bodyParser.json());\n\nvar serviceAccount = require('./path/to/your/firebase-adminsdk-key.json');\n\nadmin.initializeApp({\n  credential: admin.credential.cert(serviceAccount)\n});\n\nconst db = admin.firestore();\n\n// Add business\napp.post('/business', async (req, res) => {\n  // TODO: Implement adding a new business to Firestore\n});\n\n// Get businesses\napp.get('/businesses', async (req, res) => {\n  // TODO: Implement getting businesses from Firestore\n});\n\n// Add review\napp.post('/review', async (req, res) => {\n  // TODO: Implement adding a review to a business\n});\n\n// Set up server\nconst P

In [121]:
# Tally how many times each library is imported across all JavaScript rows
# (repeated imports within a single row each count).
libraries = {}
for row in javascript_rows:
    for library in row["libraries"]:
        libraries[library] = libraries.get(library, 0) + 1

# Re-order from most to least frequent; the sort is stable, so ties keep
# first-seen order.
by_frequency = sorted(libraries.items(), key=lambda entry: entry[1], reverse=True)
libraries = dict(by_frequency)
print(len(libraries))

606


In [122]:
# Load the reference collection of npm package names from the local JSON file.
with open("utils/all-the-package-names/names.json", "r", encoding="utf-8") as names_file:
    npm_dataset = json.loads(names_file.read())

In [123]:
# Load the reference collection of built-in module names from the local JSON file.
with open("utils/javascript/builtin-modules/builtin-modules.json", "r", encoding="utf-8") as builtins_file:
    builtin_modules = json.loads(builtins_file.read())

In [126]:
# Keep only the libraries that are neither a known npm package nor a built-in
# module; whatever remains is treated as a hallucinated library name.
#
# The reference collections are turned into one set first: testing `in` against
# a list scans it element by element for every key, whereas a set lookup is
# constant-time. (Assumes both JSON files hold hashable names, i.e. strings -
# TODO confirm.)
known_modules = set(npm_dataset)
known_modules.update(builtin_modules)
libraries = {k: v for k, v in libraries.items() if k not in known_modules}
print(len(libraries))

46


In [125]:
# Persist the remaining library -> count mapping as JSON.
output_path = "results/hallucinations_javascript.json"
# open(..., "w") does not create missing parent directories, so create the
# output directory first (no-op when it already exists).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(libraries, f)