# Hybrid Code Clustering (GPU-Optimized) — CodeBERT + AST/Rule

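This notebook fine-tunes CodeBERT and computes code embeddings on CUDA when a GPU is available (falling back to CPU otherwise), combines the embeddings with AST/rule-based features, and clusters the result. RAPIDS cuML is optional and, when installed, is used for GPU dimensionality reduction and clustering.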

In [None]:

# === 0) REQUIREMENTS (run once if needed)
# !pip install -U sentence-transformers transformers accelerate scikit-learn pandas numpy tqdm
# Optional (GPU clustering with RAPIDS cuML; install per your CUDA/OS):
# !pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
# !pip install umap-learn hdbscan


In [None]:

# === 1) CONFIG
from pathlib import Path

# --- Input locations ---------------------------------------------------------
# CORRECT_DIR / INCORRECT_DIR are scanned for *.py files by the loading cell;
# DATA_ROOT is only echoed below.
DATA_ROOT = Path(r"D:\GitHub\dolap\files_dolab\files")
CORRECT_DIR = Path(r"C:\files_dolab\dolap\testfile")
INCORRECT_DIR = Path(r"C:\files_dolab\dolap\testvfile")

# --- Output location (created on first run, reused afterwards) ---------------
OUT_DIR = Path("outputs_hybrid_gpu")
OUT_DIR.mkdir(exist_ok=True)

# --- Embedding model ---------------------------------------------------------
BASE_MODEL = "microsoft/codebert-base"
FT_MODEL_DIR = OUT_DIR / "codebert_embed_contrastive_selfaug_gpu"
MAX_SEQ_LEN = 512

# --- Fine-tuning switches ----------------------------------------------------
DO_FINE_TUNE = True
EPOCHS = 2
BATCH_SIZE = 24

# --- Clustering backend: try RAPIDS cuML first when True ---------------------
USE_RAPIDS = True

print("DATA_ROOT:", DATA_ROOT)
print("correct dir exists:", CORRECT_DIR.exists(), "incorrect dir exists:", INCORRECT_DIR.exists())
print("Output dir:", OUT_DIR)


DATA_ROOT: C:\files_dolab\vfiles
correct dir exists: True incorrect dir exists: True
Output dir: outputs_hybrid_gpu


In [3]:

# === 2) GPU CHECK
import torch

# Default to the CPU and upgrade to CUDA only when PyTorch can see a GPU.
# Later cells pass DEVICE to the embedding model and the data loader.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Best-effort tuning knob: any failure here is deliberately ignored
# (presumably to tolerate PyTorch builds that lack this API — confirm).
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

# Enable the cuDNN benchmark flag only when we are actually running on CUDA.
if DEVICE == "cuda":
    torch.backends.cudnn.benchmark = True

print("Using device:", DEVICE)


Using device: cpu


In [None]:

# === 3) LOAD + AUGMENT
import re, random
from pathlib import Path

def load_py_with_paths(folder: Path):
    """Recursively read every ``*.py`` file under ``folder``.

    Args:
        folder: Directory to scan (sub-directories included).

    Returns:
        A list of ``{"path": str, "text": str}`` dicts, ordered by path.
        Files that cannot be read are skipped; undecodable bytes are dropped
        (``errors="ignore"``) rather than failing the whole file.
    """
    items = []
    # pathlib does not guarantee the order in which rglob yields entries, so sort:
    # embeddings, labels and feature rows are all aligned by this order, and a
    # stable order keeps the saved artifacts reproducible across runs/machines.
    for p in sorted(folder.rglob("*.py")):
        if not p.is_file():
            # e.g. a directory that happens to be named "something.py"
            continue
        try:
            txt = p.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file (permissions, vanished mid-scan, ...): skip it.
            continue
        items.append({"path": str(p), "text": txt})
    return items

# Single-pass scanner used by aug_minimal. At each position the alternatives are
# tried in order (triple-quoted string, ordinary one-line string, comment), so a
# '#' inside a string literal is never treated as a comment and a triple quote
# inside a comment never opens a bogus "docstring".
_AUG_TRIPLE = r'"{3}[\s\S]*?"{3}' + "|" + r"'{3}[\s\S]*?'{3}"
_AUG_STRING = r'"(?:\\.|[^"\\\n])*"' + "|" + r"'(?:\\.|[^'\\\n])*'"
_AUG_TOKEN_RE = re.compile("|".join([
    "(?P<triple>" + _AUG_TRIPLE + ")",
    "(?P<string>" + _AUG_STRING + ")",
    r"(?P<comment>#[^\n]*)",
]))

def aug_minimal(code: str) -> str:
    """Return a lightly normalised view of ``code`` for contrastive pairing.

    Steps:
      1. Drop ``#`` comments and every triple-quoted string (docstrings and
         any other triple-quoted literal), leaving ordinary one-line string
         literals untouched.
      2. Collapse each run of spaces/tabs to a single space. This also
         flattens indentation, so the result is a text view for embedding,
         not necessarily runnable Python.
      3. Collapse three or more consecutive newlines to two and strip the ends.

    Args:
        code: Raw source text.

    Returns:
        The normalised text.
    """
    # Keep ordinary string literals verbatim; delete comments and triple-quoted blocks.
    x = _AUG_TOKEN_RE.sub(lambda m: m.group("string") or "", code)
    x = re.sub(r"[ \t]+", " ", x)
    x = re.sub(r"\n{3,}", "\n\n", x).strip()
    return x

# (regex, replacement) rules applied in order; each regex matches a whole word.
COMMON_RENAMES = [
    (r"\btemp\b", "varA"),
    (r"\bdata\b", "varB"),
    (r"\bi\b", "idx"),
    (r"\bj\b", "jdx"),
    (r"\bresult\b", "outVal"),
]

def aug_rename_vars_light(code: str) -> str:
    """Apply every rule in COMMON_RENAMES to ``code`` and return the result.

    The substitution is purely textual (word-boundary regexes), so matches
    inside strings or comments are renamed as well.
    """
    renamed = code
    for rule in COMMON_RENAMES:
        pattern, replacement = rule
        renamed = re.sub(pattern, replacement, renamed)
    return renamed

# Load both labelled corpora; a directory that does not exist contributes no rows.
rows_correct = []
if CORRECT_DIR.exists():
    rows_correct = load_py_with_paths(CORRECT_DIR)

rows_incorrect = []
if INCORRECT_DIR.exists():
    rows_incorrect = load_py_with_paths(INCORRECT_DIR)

print(f"Loaded files — correct: {len(rows_correct)} | incorrect: {len(rows_incorrect)}")

# One positive pair per file: two different augmented views of the same source text.
pairs = [
    (aug_minimal(entry["text"]), aug_rename_vars_light(entry["text"]))
    for entry in rows_correct + rows_incorrect
]
print("Positive pairs prepared:", len(pairs))


In [None]:

# === 4) SELF-CONTRASTIVE FINE-TUNING (GPU)
# Fine-tunes BASE_MODEL with MultipleNegativesRankingLoss on the (view A, view B)
# pairs built above, then saves the result to FT_MODEL_DIR.
from datetime import datetime
if DO_FINE_TUNE and len(pairs) > 0:
    from sentence_transformers import SentenceTransformer, models, InputExample, losses
    from torch.utils.data import DataLoader

    on_gpu = (DEVICE == 'cuda')

    # Transformer encoder + mean pooling over token embeddings.
    word = models.Transformer(BASE_MODEL, max_seq_length=MAX_SEQ_LEN)
    pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)
    sbert = SentenceTransformer(modules=[word, pool], device=DEVICE)

    train_examples = [InputExample(texts=[a, b]) for a, b in pairs]

    # With fewer pairs than BATCH_SIZE, a fixed batch size plus drop_last=True would
    # produce zero batches and silently save an untrained model. Shrink the batch to
    # fit the data and only drop the ragged tail when there is more than one batch.
    train_batch_size = min(BATCH_SIZE, len(train_examples))
    train_dl = DataLoader(
        train_examples,
        batch_size=train_batch_size,
        shuffle=True,
        drop_last=len(train_examples) > train_batch_size,
        pin_memory=on_gpu,
    )
    loss_fn = losses.MultipleNegativesRankingLoss(sbert)
    warmup_steps = int(0.1 * len(train_dl) * EPOCHS)

    print("Starting fine-tune on", DEVICE, "...", datetime.now())
    sbert.fit(
        train_objectives=[(train_dl, loss_fn)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        # Mixed precision is a CUDA feature; requesting it on CPU is at best ignored
        # with warnings and may be rejected by newer sentence-transformers releases.
        use_amp=on_gpu,
        show_progress_bar=True,
    )
    sbert.save(str(FT_MODEL_DIR))
    print("Saved fine-tuned model →", FT_MODEL_DIR)
else:
    print("[SKIP] Fine-tune disabled or no data.")


In [None]:

# === 5) EMBEDDING (GPU)
# Encodes every loaded file and saves three row-aligned artifacts to OUT_DIR:
# embeddings.npy (float32, L2-normalised), labels.npy (1 = correct, 0 = incorrect)
# and paths.txt (one path per line).
import numpy as np
from sentence_transformers import SentenceTransformer, models
texts = [r["text"] for r in rows_correct + rows_incorrect]
labels = [1]*len(rows_correct) + [0]*len(rows_incorrect)
paths  = [r["path"] for r in rows_correct + rows_incorrect]
if len(texts) == 0:
    raise SystemExit("No .py files found.")

if FT_MODEL_DIR.exists():
    emb_model = SentenceTransformer(str(FT_MODEL_DIR), device=DEVICE)
    print("Loaded fine-tuned model:", FT_MODEL_DIR)
else:
    # Same architecture as the fine-tuning cell. The pooling width is read from the
    # encoder instead of being hard-coded, so changing BASE_MODEL keeps working.
    _base_encoder = models.Transformer(BASE_MODEL, max_seq_length=MAX_SEQ_LEN)
    _base_pooling = models.Pooling(_base_encoder.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)
    emb_model = SentenceTransformer(modules=[_base_encoder, _base_pooling], device=DEVICE)
    print("Loaded base model:", BASE_MODEL)

EMB = emb_model.encode(texts, normalize_embeddings=True, batch_size=64 if DEVICE=='cuda' else 16, show_progress_bar=True, device=DEVICE).astype("float32")
np.save(OUT_DIR/"embeddings.npy", EMB)
np.save(OUT_DIR/"labels.npy", np.array(labels, dtype=np.int32))
with open(OUT_DIR/"paths.txt", "w", encoding="utf-8") as f:
    for p in paths:
        # A real newline terminator: "\\n" would write a literal backslash + 'n',
        # collapsing the whole file into a single line.
        f.write(p + "\n")
print("Saved embeddings/labels/paths →", OUT_DIR)


In [None]:

# === 6) AST/RULE FEATURES (CPU)
import ast, json
from collections import Counter
import pandas as pd

def ast_counts(src: str):
    """Count AST node types in ``src``.

    Returns:
        ``(features, ok)`` where ``features`` maps ``"AST_<NodeType>"`` to the
        number of such nodes and ``ok`` is 1. If ``src`` cannot be parsed the
        result is ``({}, 0)``.
    """
    try:
        tree = ast.parse(src)
    except Exception:
        return {}, 0

    tally = Counter()
    # Explicit-stack, depth-first pre-order walk. Children are pushed in reverse
    # so they are popped in source order, which keeps the key insertion order
    # (and therefore downstream column order) stable.
    pending = [tree]
    while pending:
        node = pending.pop()
        tally[type(node).__name__] += 1
        pending.extend(reversed(list(ast.iter_child_nodes(node))))

    features = {}
    for node_type, n_seen in tally.items():
        features[f"AST_{node_type}"] = n_seen
    return features, 1

def rule_signals(src: str):
    """Compute cheap substring-based signals for one source file.

    All ``uses_*`` / ``has_*`` values are 0/1 flags obtained with plain
    substring checks, so they are heuristics rather than parsed facts. The
    checks run on a lower-cased copy of ``src``.

    Args:
        src: Raw source text.

    Returns:
        Dict with the flags plus ``len_chars`` (character count) and
        ``len_lines`` (newline count + 1, or 0 for empty input).
    """
    s = src.lower()
    return {
        "uses_match": int("match " in s),
        "uses_eval": int("eval(" in s),
        "uses_try": int("try:" in s),
        "uses_dict": int("{" in src and "}" in src and ":" in src),
        "has_zero_check_literal": int("== 0" in s or "==0" in s or "!= 0" in s or "!=0" in s),
        "has_zero_check_except": int("zerodivisionerror" in s),
        "len_chars": len(src),
        # Count real newline characters; "\\n" would look for a literal
        # backslash followed by 'n' and report ~1 line for every file.
        "len_lines": src.count("\n") + (1 if src else 0),
    }

# One feature row per file: the path first, then AST node counts, then rule signals.
struct_rows = []
for file_path, source in zip(paths, texts):
    node_counts, parsed_ok = ast_counts(source)  # parsed_ok is not used further here
    row = {"path": file_path}
    row.update(node_counts)
    row.update(rule_signals(source))
    struct_rows.append(row)

# Node types missing from a file become NaN in the frame; treat them as a count of 0.
df_struct = pd.DataFrame(struct_rows).fillna(0)
struct_csv_path = OUT_DIR/"struct_features.csv"
df_struct.to_csv(struct_csv_path, index=False)
print("Saved structural features →", struct_csv_path)


In [None]:

# === 7) HYBRID + (Optional) GPU CLUSTERING (RAPIDS) ==========================
# Pipeline: load saved embeddings + structural features -> z-score the structural
# block -> concatenate -> reduce (cuML UMAP or PCA) -> K-Means with k chosen by
# silhouette score -> save per-file cluster assignments.
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

K_MAX = 12  # largest number of clusters tried during model selection

# RAPIDS is optional: if it cannot be imported we fall back to scikit-learn,
# but say so instead of failing silently.
HAS_CUML = False
if USE_RAPIDS:
    try:
        from cuml.cluster import KMeans as cuKMeans
        from cuml.manifold import UMAP as cuUMAP
        HAS_CUML = True
    except Exception as exc:
        print("[INFO] RAPIDS cuML not usable, falling back to scikit-learn:", repr(exc))


def _pick_kmeans_labels(points, make_kmeans, k_max=K_MAX):
    """Fit K-Means for k = 2..k_max and keep the labelling with the best silhouette.

    Args:
        points: 2-D NumPy array, one row per sample.
        make_kmeans: Callable ``k -> estimator`` exposing ``fit_predict``.
        k_max: Upper bound on k (additionally capped at ``n_samples - 1``).

    Returns:
        ``(labels, k)``. If no candidate yields a valid silhouette score the
        result of a plain k=2 fit is returned.
    """
    n_points = points.shape[0]
    best_k, best_score, best_labels = None, -1.0, None
    for k in range(2, min(k_max, max(2, n_points - 1)) + 1):
        labs = np.asarray(make_kmeans(k).fit_predict(points))
        n_found = len(np.unique(labs))
        # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
        if n_found < 2 or n_found >= n_points:
            continue
        try:
            score = silhouette_score(points, labs)
        except ValueError:
            continue
        if score > best_score:
            best_score, best_k, best_labels = score, k, labs
    if best_labels is None:
        best_k = 2
        best_labels = np.asarray(make_kmeans(2).fit_predict(points))
    return best_labels, best_k


def _make_cpu_kmeans(k):
    """scikit-learn K-Means with the notebook's fixed seed."""
    return KMeans(n_clusters=k, n_init="auto", random_state=42)


def _make_gpu_kmeans(k):
    """cuML K-Means with the notebook's fixed seed, returning NumPy labels."""
    return cuKMeans(n_clusters=k, random_state=42, output_type="numpy")


E = np.load(OUT_DIR/"embeddings.npy")
df_struct = pd.read_csv(OUT_DIR/"struct_features.csv")
# Take the paths from the feature table itself: its rows are aligned with the
# structural features by construction, and CSV quoting keeps each path intact.
saved_paths = df_struct["path"].astype(str).tolist()

if E.shape[0] != len(df_struct):
    raise ValueError(
        f"Row mismatch: {E.shape[0]} embeddings vs {len(df_struct)} structural rows; "
        "re-run the embedding and feature cells."
    )
n_samples = E.shape[0]
if n_samples < 2:
    raise ValueError(f"Need at least 2 samples to cluster, got {n_samples}.")

num_cols = [c for c in df_struct.columns if c != "path"]
X_struct = df_struct[num_cols].to_numpy(dtype="float32")
X_struct = np.nan_to_num(X_struct, nan=0.0, posinf=1e6, neginf=-1e6)

scaler = StandardScaler()
X_struct_z = scaler.fit_transform(X_struct)

X = np.hstack([E, X_struct_z]).astype("float32")
np.save(OUT_DIR/"hybrid_features.npy", X)

# Reducer
if HAS_CUML and n_samples >= 5:
    # n_neighbors cannot usefully exceed the number of other samples.
    umap = cuUMAP(n_components=2, n_neighbors=min(15, n_samples - 1), min_dist=0.1, random_state=42, output_type="numpy")
    Xr = np.asarray(umap.fit_transform(X), dtype="float32")
    reducer_name = "cuML-UMAP(2D)"
else:
    # PCA requires n_components <= min(n_samples, n_features).
    pca = PCA(n_components=min(32, n_samples, X.shape[1]), random_state=42)
    Xr = pca.fit_transform(X)
    reducer_name = f"PCA({Xr.shape[1]})"

# Clustering: both backends use the same silhouette-based choice of k. (Inertia
# decreases as k grows, so "lowest inertia" would always pick the largest k.)
if HAS_CUML:
    labels_cluster, best_k = _pick_kmeans_labels(Xr, _make_gpu_kmeans)
    alg = f"cuML-KMeans(k={best_k}) + {reducer_name}"
else:
    labels_cluster, best_k = _pick_kmeans_labels(Xr, _make_cpu_kmeans)
    alg = f"CPU-KMeans(k={best_k}) + {reducer_name}"
best_labels = labels_cluster

print("Clustering algorithm:", alg)

df_clusters = pd.DataFrame({"path": saved_paths, "cluster": labels_cluster, "dim1": Xr[:,0], "dim2": Xr[:,1] if Xr.shape[1] > 1 else 0.0})
df_clusters.to_csv(OUT_DIR/"hybrid_clusters.csv", index=False)
print("Saved →", OUT_DIR/"hybrid_clusters.csv")
