In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pandas import DataFrame
from datasets import load_dataset as hf_load_dataset
import json

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import re
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from collections import defaultdict


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
linting_paths = {
    "php": "results/php_lint.json",
    "csharp": "results/csharp_lint.json",
    "python": "results/python_lint.json",
    "c": "results/c_lint.json",
    "javascript": "results/js_lint.json",
    "java": "results/java_lint.json",
}

In [3]:
other_cleaned = []
with open("other_cleaned.json", "r", encoding="utf-8") as f:
    for line in f.readlines():
        other_cleaned.append(json.loads(line))

In [4]:
tracker = {
    "php": 0,
    "csharp": 0,
    "python": 0,
    "c": 0,
    "javascript": 0,
    "java": 0,
}
for row in other_cleaned:
    language = row["language"].lower()
    if language == "c++":
        language = "c"
    elif language == "c#":
        language = "csharp"
    if language not in tracker.keys():
        continue
    tracker[language] += 1


print(tracker)

{'php': 1935, 'csharp': 13893, 'python': 57371, 'c': 21510, 'javascript': 26442, 'java': 19257}


In [5]:
linting_results = {}

for key, value in linting_paths.items():
    with open(value, "r", encoding="utf-8") as f:
        linting_results[key] = json.load(f)

In [6]:
for key, value in linting_results.items():
    hashes = set()
    for row in value:
        hashes.add(row["module"])
    print(key, len(hashes))

php 1935
csharp 6517
python 19805
c 18323
javascript 15725
java 20


In [7]:
messages = []

for key, value in linting_results.items():
    for row in value:
        convo_hash = row["module"].split("_")[0]
        messages.append(row["message"])

In [12]:
model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")

Multiple OpenVINO files found in 'sentence-transformers/all-MiniLM-L6-v2': ['openvino/openvino_model.xml', 'openvino/openvino_model_qint8_quantized.xml'], defaulting to 'openvino/openvino_model.xml'. Please specify the desired file name via `model_kwargs={"file_name": "<file_name>"}`.


In [None]:
# Step 1: embed raw messages
embeddings = model.encode(messages, batch_size = 128)
np.save("tmp/embeddings.npy", embeddings)

In [None]:
embeddings[0]

In [8]:
embeddings = np.load("tmp/embeddings.npy")

In [11]:
error_categories = []
with open("utils/error_categories.json", "r", encoding="utf-8") as f:
    error_categories = json.load(f)

In [22]:
len(error_categories)

20

In [14]:
error_category_embeddings = {}
for error_category in error_categories:
    
    description = error_category["description"]
    embedding = model.encode([description])
    error_category_embeddings[error_category["category"]] = embedding


In [None]:
# Cosine similarity algorithm on the categories vs error messages

In [19]:
categories = list(error_category_embeddings.keys())
category_matrix = np.vstack([error_category_embeddings[c] for c in categories])

similarity = cosine_similarity(embeddings, category_matrix)

print(similarity.shape)

best_indices = np.argmax(similarity, axis=1)
best_categories = [categories[idx] for idx in best_indices]

results = []
for msg, cat_idx in zip(messages, best_indices):
    results.append({"message": msg, "predicted_category": categories[cat_idx]})

(177732, 20)
{'message': '; expected', 'predicted_category': 'Syntax Error'}


In [25]:
import random

# Step 1: assign each message to its best category, carrying the language key
category_to_lang_msgs = defaultdict(lambda: defaultdict(list))

# Flatten but keep language info
all_rows = []
for lang, rows in linting_results.items():
    for row, cat in zip(rows, best_categories[:len(rows)]):  
        # make sure alignment between rows and best_categories is correct
        all_rows.append((lang, row["message"], cat))

# Group into category → language → messages
for lang, msg, cat in all_rows:
    category_to_lang_msgs[cat][lang].append(msg)

# Step 2: sort categories by number of total messages
sorted_categories = sorted(
    category_to_lang_msgs.items(),
    key=lambda x: sum(len(msgs) for msgs in x[1].values()),
    reverse=True
)

# Step 3: pretty print with language distributions + random samples
for cat, lang_dict in sorted_categories:
    total_msgs = sum(len(msgs) for msgs in lang_dict.values())
    print(f"\nCategory {cat} ({total_msgs} messages):")
    
    for lang, msgs in sorted(lang_dict.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"  Language {lang}: {len(msgs)} messages")
        sample_msgs = random.sample(msgs, min(5, len(msgs)))
        for m in sample_msgs:
            print("    -", m.replace("\n", " ")[:120], "...")


Category Syntax Error (95432 messages):
  Language c: 48598 messages
    - expected '=', ',', ';', 'asm' or '__attribute__' before '.' token ...
    - expected '=', ',', ';', 'asm' or '__attribute__' before 'приведет' ...
    - unknown type name 'address' ...
    - expected declaration specifiers or '...' before '&' token ...
    - unknown type name 'обеспечение' ...
  Language csharp: 30032 messages
    - A get or set accessor expected ...
    - Identifier expected ...
    - ) expected ...
    - Invalid token '=' in class, record, struct, or interface member declaration ...
    - ; expected ...
  Language python: 9585 messages
    - invalid syntax (3dca8fb089f4ba02cab912b049b5edeb_4.py, line 1) ...
    - invalid syntax (662358933687754f576696979835b325_8.py, line 1) ...
    - invalid syntax (4af4d025fe2c5f1ba5802ac8f6d564ad_0.py, line 1) ...
    - invalid syntax (ff481342352594d81e5c3bfca838a604_0.py, line 1) ...
    - invalid syntax (a08540dcc16a8788a10bc8f0ccc2a0f5_2.py, line 1) ..

### Clustering (Not using this rn)

In [44]:
n_clusters = 5  

clustering = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=1024)
labels = clustering.fit_predict(embeddings)

In [45]:
labels

array([1, 1, 1, ..., 2, 2, 3], shape=(177732,), dtype=int32)

In [46]:
clusters = defaultdict(list)
for label, msg in zip(labels, messages):
    clusters[label].append(msg)

In [47]:
for cid in range(n_clusters):
    print(f"\nCluster {cid} ({len(clusters[cid])} messages):")
    for m in clusters[cid][:5]:  # preview first 5 per cluster
        print("  -", m.replace("\n", " ")[:120], "...")


Cluster 0 (44985 messages):
  - Identifier expected ...
  - Identifier expected ...
  - Identifier expected ...
  - Identifier expected ...
  - Identifier expected ...

Cluster 1 (40757 messages):
  - The file does not include <?php tag ...
  - The file does not include <?php tag ...
  - The file does not include <?php tag ...
  - The file does not include <?php tag ...
  - The file does not include <?php tag ...

Cluster 2 (12187 messages):
  - The modifier 'public' is not valid for this item ...
  - A namespace cannot directly contain members such as fields, methods or statements ...
  - The modifier 'private' is not valid for this item ...
  - Member modifier 'protected' must precede the member type and name ...
  - The modifier 'public' is not valid for this item ...

Cluster 3 (62712 messages):
  - Type or namespace definition, or end-of-file expected ...
  - Type or namespace definition, or end-of-file expected ...
  - Type or namespace definition, or end-of-file expected ...
  

In [48]:
from collections import Counter, defaultdict

# Attach labels to rows (messages, symbols, types, etc.)
all_rows = []
i = 0
for key, value in linting_results.items():
    for row in value:
        row_copy = dict(row)  # keep original fields
        row_copy["language"] = key
        row_copy["cluster"] = int(labels[i])  # add cluster label
        all_rows.append(row_copy)
        i += 1

print(f"Total rows with clusters: {len(all_rows)}")

# Group by cluster
clusters = defaultdict(list)
for row in all_rows:
    clusters[row["cluster"]].append(row)

# --- Analysis per cluster ---
for cid, rows in clusters.items():
    print("\n" + "="*60)
    print(f"Cluster {cid} (size={len(rows)})")
    print("="*60)

    # Count messages
    msg_counts = Counter([r["message"] for r in rows if "message" in r])
    print("Top 5 messages:")
    for msg, count in msg_counts.most_common(5):
        print(f"{count:5d}  {msg}")

    # Count symbols
    sym_counts = Counter([r["symbol"] for r in rows if "symbol" in r])
    print("Top symbols:")
    for sym, count in sym_counts.most_common(3):
        print(f"{count:5d}  {sym}")

    # Count types
    type_counts = Counter([r["type"] for r in rows if "type" in r])
    print("Top types:")
    for typ, count in type_counts.most_common(3):
        print(f"{count:5d}  {typ}")

    # Breakdown by language
    lang_counts = Counter([r["language"] for r in rows])
    print("Languages in cluster:")
    for lang, count in lang_counts.most_common():
        print(f"{lang:12s} {count}")


Total rows with clusters: 177732

Cluster 1 (size=40757)
Top 5 messages:
 9592  Parsing error: Unexpected token <
 4931  Syntax error, ',' expected
 4408  Invalid expression term '<'
 1996  Invalid token ':' in class, record, struct, or interface member declaration
 1935  The file does not include <?php tag
Top symbols:
38822  syntax-error
 1935  Not valid php file
Top types:
40757  syntax-error
Languages in cluster:
csharp       20112
javascript   15475
c            3232
php          1935
java         3

Cluster 4 (size=17091)
Top 5 messages:
12728  ; expected
 2568  } expected
  749  { expected
  739  ) expected
  305  Expected expression
Top symbols:
17091  syntax-error
Top types:
17091  syntax-error
Languages in cluster:
csharp       17091

Cluster 3 (size=62712)
Top 5 messages:
 2639  Type or namespace definition, or end-of-file expected
  846  unknown type name 'CREATE'
  844  unknown type name 'class'
  670  stray '#' in program
  568  Top-level statements must precede namespace