In [1]:
import os
import json
import subprocess
from tqdm import tqdm
import sys
import re

### Extend PATH so that the locally installed tools (Node.js, dotnet-script) can be found

In [4]:
# Environment handed to external tools via `env=env` in later cells.
# It is a copy of the kernel's environment with two per-user tool directories
# prepended to PATH.
env = os.environ.copy()

# Resolve both directories from the current user's home instead of a
# hard-coded /home/<user> path, so the notebook is not tied to one account.
tool_dirs = [
    os.path.expanduser("~/.dotnet/tools"),
    os.path.expanduser("~/.nvm/versions/node/v22.17.1/bin"),
]

# Same precedence as before: .dotnet tools, then node, then the existing PATH.
# Empty entries are dropped so a missing PATH does not add an empty component.
env["PATH"] = os.pathsep.join(
    entry for entry in [*tool_dirs, env.get("PATH", "")] if entry
)

# Sanity check: prints the dotnet-script version when the tool resolves.
subprocess.run(["dotnet-script", "--version"], env=env)

1.6.0


CompletedProcess(args=['dotnet-script', '--version'], returncode=0)

### Python

In [2]:
import py_compile

directory = "files/python/codes/"
output_file = "results/python_lint.json"
all_results = []

py_files = [f for f in os.listdir(directory) if f.endswith(".py")]
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# This flag only stops *imports* from writing .pyc files; it is kept so the
# rest of the session behaves as before. It never prevented py_compile from
# writing bytecode, which is why the check below uses the built-in compile().
sys.dont_write_bytecode = True

for filename in tqdm(py_files, desc="Linting files", unit="file"):
    filepath = os.path.join(directory, filename)

    # Read raw bytes so compile() applies the usual source-encoding detection
    # (BOM / coding cookie) itself.
    with open(filepath, "rb") as source_file:
        source = source_file.read()

    try:
        # Pure in-memory syntax check: nothing is executed and, unlike
        # py_compile.compile(), no __pycache__/*.pyc is written next to the
        # sources. dont_inherit=True keeps this cell's own __future__ flags
        # from leaking into the checked code.
        compile(source, filepath, "exec", dont_inherit=True)
    except (SyntaxError, ValueError, RecursionError, MemoryError) as exc:
        # SyntaxError covers ordinary parse/indentation/encoding problems;
        # ValueError covers sources with null bytes on older interpreters;
        # the last two cover pathologically nested input.
        all_results.append({
            "type": "syntax-error",
            "module": filename,
            "obj": "",
            "line": getattr(exc, "lineno", None),
            "column": None,
            "path": filepath,
            "symbol": "syntax-error",
            "message": str(exc),
            "message-id": ""
        })

# Write all results to JSON once after all files are processed
with open(output_file, "w") as f:
    json.dump(all_results, f, indent=2)


Linting files:   0%|          | 0/57371 [00:00<?, ?file/s]

  100 is larger
  ...
  ...
  100 is larger
  [License](LICENSE)
  [['1' '2' '3']
  ...
Linting files: 100%|██████████| 57371/57371 [00:06<00:00, 9055.38file/s]


### JavaScript
Make sure you have ESLint installed.

In [5]:
import os
import json
import subprocess

# Input folder with the JavaScript sources and the JSON report to produce.
directory = "files/javascript/codes/"
output_file = "results/js_lint.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Collect the path of every *.js entry directly inside `directory`.
js_files = []
for entry in os.listdir(directory):
    if entry.endswith(".js"):
        js_files.append(os.path.join(directory, entry))
all_results = []


# Number of files passed to a single ESLint invocation (tweak if needed)
BATCH_SIZE = 20000

def run_eslint_batch(batch_files):
    """Run ESLint once over ``batch_files`` and return its parsed JSON report.

    Args:
        batch_files: paths of the JavaScript files to lint in this invocation.

    Returns:
        The value decoded from ESLint's ``--format json`` stdout, or an empty
        list when stdout is not valid JSON. In that case both stdout and
        stderr are printed so the reason for the failure is visible.
    """
    result = subprocess.run(
        [
            "eslint",
            "--config",
            "linting_rules/js.eslint.config.mjs",
            "--format",
            "json",
            *batch_files,
        ],
        capture_output=True,
        text=True,
        env=env,  # PATH-extended environment prepared earlier in the notebook
    )
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        print("⚠️ Failed to parse JSON from ESLint output:")
        print(result.stdout)
        # stdout may be empty when ESLint itself failed; the diagnostic is
        # then only on stderr, which was previously captured but never shown.
        if result.stderr:
            print(result.stderr)
        return []

# Lint batch by batch, keeping every message that is fatal or has severity 2
for start in range(0, len(js_files), BATCH_SIZE):
    for report in run_eslint_batch(js_files[start:start + BATCH_SIZE]):
        module_name = os.path.basename(report.get("filePath", ""))
        all_results.extend(
            {
                "type": "syntax-error",
                "module": module_name,
                "obj": "",
                "line": message.get("line"),
                "column": message.get("column"),
                "path": report.get("filePath"),
                "symbol": "syntax-error",
                "message": message.get("message", ""),
                "message-id": "",
            }
            for message in report.get("messages", [])
            if message.get("fatal", False) or message.get("severity", 0) == 2
        )

# Persist the collected records as one JSON document
with open(output_file, "w") as report_file:
    json.dump(all_results, report_file, indent=2)

print(f"✅ Linting complete, results saved to {output_file}")


✅ Linting complete, results saved to results/js_lint.json


### Java files

In [7]:
!sudo apt install default-jdk

[sudo] password for pooria: 


In [6]:
import shutil
import hashlib

# Source folder and the "good" sub-folder that receives copies renamed after
# their public type (javac requires <PublicType>.java).
directory = "files/java/codes/"
good_dir = os.path.join(directory, "good")
os.makedirs(good_dir, exist_ok=True)

java_files = [f for f in os.listdir(directory) if f.endswith(".java")]

# First public top-level type declaration; group(1) is the type name.
# Besides plain `public class X`, this also accepts the common modifiers
# (`public final class`, `public abstract class`, ...) and the other kinds of
# public types (interface, @interface, enum, record). Those files previously
# found no match and were silently left out of the lint run.
public_class_pattern = re.compile(
    r'public\s+(?:(?:abstract|final|strictfp|sealed|non-sealed)\s+)*'
    r'(?:class|@?interface|enum|record)\s+(\w+)'
)

# Map from hash (hex string) to original filename
mapping = {}

def hash_file_content(path):
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is opened in binary mode and consumed in 8192-byte reads.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

# Copy every file that declares a public class to good/<ClassName>.java and
# remember, by content hash, which original file each copy came from.
for filename in java_files:
    filepath = os.path.join(directory, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # Find public class name
    match = public_class_pattern.search(content)
    if not match:
        # Without a public type there is no required file name; skip the file.
        continue

    class_name = match.group(1)
    file_hash = hash_file_content(filepath)

    new_filename = f"{class_name}.java"
    dest_path = os.path.join(good_dir, new_filename)

    if os.path.exists(dest_path):
        # The slot is taken. If it holds different content (another file with
        # the same class name), this file is skipped as before. If it holds
        # this exact content (left over from an earlier run, or a duplicate),
        # fall through so the hash → filename entry is still recorded.
        # Previously a re-run left `mapping` empty and the original filenames
        # were lost downstream.
        if hash_file_content(dest_path) != file_hash:
            continue
    else:
        print(f"Copying {filename} → good/{new_filename}")
        shutil.copy2(filepath, dest_path)

    # Map hash → original filename; the first file seen for a hash wins,
    # which is what a fresh run produced before.
    mapping.setdefault(file_hash, filename)


Copying ab09eef5d14bac1a1ff761a7d77a07c1_0.java → good/QuizController.java
Copying 14e324436caad13d217bcb7b723c9175_9.java → good/User.java
Copying 1c3767860affeb923a9736d2fe92bedd_0.java → good/DisableRegenPlugin.java
Copying b51cc1f10724c38f41cb7d37f387e510_1.java → good/KnnSignalHistoryService.java
Copying 0b27669ed4aa5db14236d8c006f35bac_13.java → good/CustomDolphins.java
Copying 0d22d7a849fdc87fdccaca6bd3c8b62d_5.java → good/AccountBalanceException.java
Copying 2f61ab105239df7ef679fd295e7ed4f0_14.java → good/UserServiceImpl.java
Copying 3ca0eb685fc71a09183130dd110791d6_1.java → good/SentryInitializer.java
Copying 4510cc13d09d846c6561f8adf93ea0f2_3.java → good/DatabaseManager.java
Copying 9b702073976e1f60611ff4cb8b5b5ee8_2.java → good/UIAnotherScreen.java
Copying d5ffdf695700932a7fc001e2ca23e312_50.java → good/InfluxDBService.java
Copying 1946f8753e9eaed67feba2259cf19a42_3.java → good/MyDateUtil.java
Copying 315a2ecee52cb495c531573c1435a9f0_5.java → good/LocatingCustomElementHandle

In [7]:
# Lint the renamed copies; the report goes to results/java_lint.json.
directory = "files/java/codes/good"
output_file = "results/java_lint.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Bare file names of the .java copies, and the same files prefixed with `directory`.
class_files = [name for name in os.listdir(directory) if name.endswith(".java")]
java_files = [os.path.join(directory, name) for name in class_files]
all_results = []

# Scratch directory (passed to javac via -d further down in this cell).
os.makedirs("/tmp/java_linting", exist_ok=True)

hash_to_original = mapping  # hash -> original filename

hash_to_classfile = {}


def hash_file_content(path):
    """Compute the SHA-256 of a file and return it as a hex string.

    Reads the file in binary mode, 8192 bytes at a time.
    """
    sha = hashlib.sha256()
    with open(path, "rb") as handle:
        while True:
            piece = handle.read(8192)
            if not piece:  # empty read → end of file
                break
            sha.update(piece)
    return sha.hexdigest()

# For each renamed file, look up the original filename by content hash;
# a file whose hash is unknown (or maps to an empty value) maps to itself.
class_to_original = {
    name: hash_to_original.get(hash_file_content(os.path.join(directory, name))) or name
    for name in class_files
}

def add_error(file, errors):
    """Append one lint record for ``file`` to ``all_results``.

    Args:
        file: path of the compiled file as printed by javac; only its base
            name is used, to look up the original filename in
            ``class_to_original`` (falling back to the base name itself).
        errors: the javac output lines belonging to this error, header line
            first. Nothing is recorded when the list is empty.

    The record's ``line`` is taken from the ``<file>.java:<line>:`` prefix of
    the first error line when present (previously it was always ``None``),
    in line with the records of the other linters in this notebook.
    """
    if not errors:
        return

    base = os.path.basename(file)
    original_filename = class_to_original.get(base, base)  # fallback to base

    # javac headers look like "path/Name.java:12: error: ..."; pick out the 12.
    line_match = re.search(r"\.java:(\d+):", errors[0])

    all_results.append({
        "type": "syntax-error",
        "module": os.path.basename(original_filename) if original_filename else "unknown",
        "obj": "",
        "line": int(line_match.group(1)) if line_match else None,
        "column": None,
        "path": original_filename if original_filename else "",
        "symbol": "syntax-error",
        "message": "\n".join(errors),
        "message-id": "",
    })

# Compile all copies in a single javac run; class files go to the scratch dir.
# javac stops printing after 100 errors by default, which would silently
# truncate the report on a batch this size, so the limit is raised explicitly.
result = subprocess.run(
    ["javac", "-Xmaxerrs", "1000000", "-d", "/tmp/java_linting", *java_files],
    capture_output=True,
    text=True,
)

if result.returncode != 0:
    current_file = None
    current_errors = []

    for line in result.stderr.splitlines():
        if ".java:" in line and "error:" in line:
            # A new "<file>.java:<line>: error: ..." header starts a new
            # record; flush the one collected so far.
            if current_file is not None:
                add_error(current_file, current_errors)
            current_errors = [line]
            current_file = line.split(":")[0]
        else:
            # Continuation line (source excerpt, caret, notes): it belongs to
            # the current record.
            current_errors.append(line)
    # Flush the last record.
    if current_file is not None:
        add_error(current_file, current_errors)

with open(output_file, "w") as f:
    json.dump(all_results, f, indent=2)


### C

In [8]:
# Setup directories
directory = "files/c/codes/"
output_file = "results/c_lint.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Gather all .c files
c_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".c")]
all_results = []

# Run GCC in syntax-only mode on all files
result = subprocess.run(
    ["gcc", "-fsyntax-only", *c_files],
    capture_output=True,
    text=True,
)

# Parse output if there were errors
if result.returncode != 0:
    for line in result.stderr.splitlines():
        if ": error:" not in line:
            continue

        # Diagnostics look like "<file>:<line>:<col>: error: <message>".
        parts = line.split(":")
        file_path = parts[0]
        # Diagnostics without a position have non-numeric fields here; those
        # become None.
        line_no = int(parts[1]) if parts[1].isdigit() else None
        column_no = int(parts[2]) if parts[2].isdigit() else None

        # Split on the FIRST ": error:" only, so a message that itself
        # contains ": error:" is kept whole instead of being cut short.
        message = line.partition(": error:")[2].strip()

        # Decode any unicode punctuation to ASCII equivalents
        message = message.replace("\u2018", "'").replace("\u2019", "'")

        all_results.append({
            "type": "syntax-error",
            "module": os.path.basename(file_path),
            "obj": "",
            "line": line_no,
            "column": column_no,
            "path": file_path,
            "symbol": "syntax-error",
            "message": message,
            "message-id": "",
        })

# Write results to file
with open(output_file, "w") as f:
    json.dump(all_results, f, indent=2)

### PHP

In [9]:

# Setup directories
directory = "files/php/codes/"
output_file = "results/php_lint.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Gather all .php files
php_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".php")]
all_results = []

# Line number of a parse error, e.g. "... Parse error:  <text> in <file> on line 7".
# The "PHP " prefix is not required, and the pattern is applied to stderr and
# stdout together. The previous search looked at stdout only and demanded the
# prefix, so it could miss the line even though the error had been detected.
parse_error_line = re.compile(r"Parse error:\s*.+ on line (\d+)")

for file in php_files:
    with open(file, "r", encoding="utf-8") as f:
        raw = f.read()

    # Files without an opening tag are reported directly and not passed to php.
    if "<?php" not in raw:
        all_results.append({
            "type": "syntax-error",
            "module": os.path.basename(file),
            "obj": "",
            "line": None,
            "column": None,
            "path": file,
            "symbol": "Not valid php file",
            "message": "The file does not include <?php tag",
            "message-id": "",
        })
        continue

    result = subprocess.run(
        ["php", "-l", file],
        capture_output=True,
        text=True
    )
    combined_output = result.stderr + result.stdout
    if "Parse error" not in combined_output:
        continue

    match = parse_error_line.search(combined_output)
    # None (not 0) when no line can be extracted, matching how the other
    # records in this notebook mark an unknown line.
    line_no = int(match.group(1)) if match else None

    all_results.append({
        "type": "syntax-error",
        "module": os.path.basename(file),
        "obj": "",
        "line": line_no,
        "column": None,
        "path": file,
        "symbol": "syntax-error",
        "message": combined_output,
        "message-id": "",
    })

# Write results to file
with open(output_file, "w") as f:
    json.dump(all_results, f, indent=2)


### C#

In [10]:
import os
import json
import subprocess
from tqdm import tqdm

# Input folder with the C# sources and the JSON report to produce.
directory = "files/csharp/codes/"
output_file = "results/csharp_lint.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Collect the path of every *.cs entry directly inside `directory`.
csharp_files = []
for entry in os.listdir(directory):
    if entry.endswith(".cs"):
        csharp_files.append(os.path.join(directory, entry))

all_results = []
# Files per checker invocation; tune as needed (100–1000 is good)
batch_size = 500

def chunked(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``size``;
    an empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + size] for start in range(0, len(lst), size))

# Check the files batch by batch, with a tqdm progress bar over the batches.
for batch in tqdm(list(chunked(csharp_files, batch_size)), desc="Checking C# files"):
    result = subprocess.run(
        ["dotnet", "script", "utils/SyntaxCheck.csx", *batch],
        capture_output=True,
        text=True,
        # Use the PATH-extended environment prepared at the top of the
        # notebook (it adds ~/.dotnet/tools), as the dotnet-script version
        # check and the ESLint call do; previously this call relied on the
        # kernel's own PATH.
        env=env,
    )

    try:
        # The script's stdout is parsed as a JSON list of records for the batch.
        batch_results = json.loads(result.stdout)
        all_results.extend(batch_results)
    except json.JSONDecodeError:
        # Output was not JSON: record one placeholder entry carrying the raw
        # output so the failed batch stays visible in the report.
        all_results.append({
            "type": "syntax-error",
            "module": "batch",
            "obj": "",
            "line": None,
            "column": None,
            "path": None,
            "symbol": "syntax-error",
            "message": result.stderr + result.stdout,
            "message-id": "",
        })

print(f"Collected {len(all_results)} results")

# Save results
with open(output_file, "w") as f:
    json.dump(all_results, f, indent=2)


Checking C# files:   0%|          | 0/28 [00:00<?, ?it/s]

Checking C# files: 100%|██████████| 28/28 [00:23<00:00,  1.19it/s]


Collected 56798 results
