In [None]:
!pip install kagglehub[pandas-datasets]
!pip install -U transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from tqdm import tqdm
import json

In [None]:
# Identifier of the pretrained checkpoint (presumably a Hugging Face Hub repo id);
# the tokenizer and the classifier below are both loaded from it.
model_name = 'philomath-1209/programming-language-identification'
# Tokenizer matching the checkpoint, later passed to classify_code.
loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model; classify_code reads its logits and config.id2label.
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the chosen device. Because this is the last expression in
# the cell, the returned module is also echoed as the cell's output.
loaded_model.to(device)

In [None]:
def classify_code(codes, model, tokenizer, device=None):
    """Predict a label for every code snippet in a batch.

    Args:
        codes: Iterable of code snippets. Entries that are not strings, or that
            are empty / whitespace-only, are replaced by a single space rather
            than dropped, so the result stays aligned one-to-one with the input.
        model: Classification model. It is called with the tokenizer output as
            keyword arguments; its result must expose ``.logits`` and the model
            must expose ``config.id2label``.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (called with ``return_tensors="pt"``, padding and
            truncation enabled).
        device: Device the inputs are moved to. When omitted, the device of the
            model's first parameter is used, so inputs always land where the
            model currently lives.

    Returns:
        A list with one ``config.id2label`` entry per input snippet, in input
        order. An empty input yields an empty list.
    """
    # Substitute a placeholder for unusable entries (keeps positions stable).
    codes = [c if isinstance(c, str) and c.strip() else " " for c in codes]

    # Nothing to classify: skip tokenization and the forward pass entirely.
    if not codes:
        return []

    # Resolve the device at call time instead of freezing a global at def time.
    if device is None:
        device = next(model.parameters()).device

    # Tokenize the whole batch at once, padded to a common length.
    inputs = tokenizer(
        codes,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Move inputs to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest-scoring class per row, mapped to its label.
    predicted_ids = torch.argmax(logits, dim=-1).tolist()
    return [model.config.id2label[i] for i in predicted_ids]


# Smoke test: two short snippets sent through classify_code as a single batch.
texts = [
    """public class HelloWorld { public static void Main(String[] args) { System.out.println("Hello World!"); } }""",
    """print("hello world")"""
]

# No device is passed, so classify_code falls back to its default.
labels = classify_code(texts, loaded_model, loaded_tokenizer)
print(labels)

In [5]:
# Load the snippet records. The code below indexes each record with
# ["language"], so the file is assumed to hold a JSON array of objects.
with open("../../tmp/code_snippets.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Positions in `data` of the records that have no language label yet.
unknown_indexes = [idx for idx, value in enumerate(data) if value["language"] is None]
# The matching records, derived from unknown_indexes so both lists always line
# up. These are the same dict objects that live in `data`, not copies.
unknown_rows = [data[idx] for idx in unknown_indexes]

In [None]:
# Number of snippets sent through the model per forward pass.
BATCH_SIZE = 8

for start_idx in tqdm(range(0, len(unknown_rows), BATCH_SIZE), desc="Processing batches"):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch_rows = unknown_rows[start_idx:start_idx + BATCH_SIZE]
    batch_codes = [row["code"] for row in batch_rows]

    # Predict languages (one label per snippet, in the same order).
    batch_predictions = classify_code(batch_codes, loaded_model, loaded_tokenizer)

    # Store each prediction on its own record. Rows are plain dicts held in a
    # list, so they are updated by key rather than by a (row, column) index.
    for row, pred in zip(batch_rows, batch_predictions):
        row["language"] = pred

In [None]:
# Put every processed row back into the slot of `data` it was taken from:
# the n-th entry of unknown_rows belongs at position unknown_indexes[n].
for position, target_idx in enumerate(unknown_indexes):
    data[target_idx] = unknown_rows[position]

In [None]:
# Overwrite the source file with the updated records. json.dump takes the
# object first and the file handle second, and returns None, so its result is
# not assigned (which would also discard `data`).
with open("../../tmp/code_snippets.json", "w", encoding="utf-8") as f:
    json.dump(data, f)