In [None]:
from transformers import pipeline
import torch

# Select the inference device: CUDA device 0 when a GPU is present, otherwise -1 (CPU).
if torch.cuda.is_available():
    device_id = 0
else:
    device_id = -1

# Build the CodeBERT feature-extraction pipeline on the selected device.
pipe = pipeline(
    task="feature-extraction",
    model="microsoft/codebert-base",
    device=device_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Pick GPU 0 when CUDA is available, otherwise -1 (CPU).
device_id = 0 if torch.cuda.is_available() else -1
# Pass the device explicitly so device_id is actually honored instead of
# leaving device placement to the library default.
pipe = pipeline("feature-extraction", model="microsoft/codebert-base", device=device_id)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

# Tokenizer and bare encoder are both loaded from the same CodeBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/codebert-base",
)
model = AutoModel.from_pretrained(
    pretrained_model_name_or_path="microsoft/codebert-base",
)

In [None]:
import pandas as pd


# Read the CodeComplex dataset (one JSON record per line) from the Hugging Face Hub.
df = pd.read_json(
    path_or_buf="hf://datasets/codeparrot/codecomplex/data.jsonl",
    lines=True,
)
# Print (rows, columns) as a quick sanity check on the load.
print(f"{df.shape}")


(4517, 4)


In [None]:
# Preview the first ten rows to inspect the columns and sample values.
df.head(n=10)


Unnamed: 0,src,complexity,problem,from
0,import java.io.*;\nimport java.math.BigInteger...,quadratic,1179_B. Tolik and His Uncle,CODEFORCES
1,import java.util.Scanner;\n \npublic class pil...,linear,1197_B. Pillars,CODEFORCES
2,import java.io.BufferedReader;\nimport java.io...,linear,1059_C. Sequence Transformation,CODEFORCES
3,import java.util.*;\n\nimport java.io.*;\npubl...,linear,1011_A. Stages,CODEFORCES
4,import java.io.OutputStream;\nimport java.io.I...,linear,1190_C. Tokitsukaze and Duel,CODEFORCES
5,import java.math.BigDecimal;\nimport java.math...,quadratic,527_B. Error Correct System,CODEFORCES
6,import java.util.*;\nimport java.io.*;\n\nimpo...,nlogn,913_D. Too Easy Problems,CODEFORCES
7,import java.io.*;\nimport java.util.*;\n\nimpo...,nlogn,1197_C. Array Splitting,CODEFORCES
8,\n// LM10: The next Ballon d'or\nimport java.u...,linear,1038_D. Slime,CODEFORCES
9,import java.util.*;\nimport java.io.*;\nimport...,constant,1028_B. Unnatural Conditions,CODEFORCES


In [None]:
# Pull the source snippets and their complexity classes out as plain Python lists.
codes = df["src"].to_list()
labels = df["complexity"].to_list()

In [None]:
# Collect the distinct complexity classes in alphabetical order.
# Despite its name, num_labels holds the set of label values, not a count.
sorted_labels = sorted(set(labels))
num_labels = set(sorted_labels)
print(f"Number of classes: {len(num_labels)}")
print(f"Labels in alphabetical order: {sorted_labels}")


Number of classes: 7
Labels in alphabetical order: ['constant', 'cubic', 'linear', 'logn', 'nlogn', 'np', 'quadratic']


In [None]:
from transformers import RobertaTokenizer

# Load the RoBERTa tokenizer published with the CodeBERT checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
# Tokenize every snippet in one call, with padding and truncation enabled,
# and get the result back as PyTorch tensors.
inputs = tokenizer(
    codes,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the full label list, then map every class name to its integer id.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

In [None]:
# Show the first ten integer-encoded labels as a spot check.
print("Encoded labels:", encoded_labels[0:10])


Encoded labels: [6 2 2 2 2 6 4 4 2 0]


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CodeDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable tensor whose first
            axis is the example index (presumably the tokenizer output built
            with ``return_tensors="pt"`` -- confirm against the caller).
        labels: Indexable sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        # Copy each field so the returned tensors do not alias the stored encodings.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        # Pin the label to int64 regardless of the container's integer width, so
        # the target dtype does not silently follow e.g. an int32 label array.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the full tokenized corpus and serve it in shuffled mini-batches of 8.
dataset = CodeDataset(encodings=inputs, labels=encoded_labels)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)


In [None]:
from torch.utils.data import random_split


In [None]:
# Split the dataset into training (80%) and validation (the remainder) subsets.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same examples land in each subset on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Shuffle only the training batches; validation batches keep dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [None]:
from transformers import RobertaForSequenceClassification

# Load CodeBERT with a sequence-classification head sized to the number of
# classes known to the label encoder.
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=len(label_encoder.classes_),
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.model_selection import train_test_split

# Split the raw snippets and labels 80/20 with a fixed seed.
train_codes, val_codes, train_labels, val_labels = train_test_split(
    codes, labels, test_size=0.2, random_state=42
)

# Tokenize each split separately.
train_inputs = tokenizer(train_codes, padding=True, truncation=True, return_tensors="pt")
val_inputs = tokenizer(val_codes, padding=True, truncation=True, return_tensors="pt")

# Reuse the encoder as already fitted on the full label list. Re-fitting on the
# training split alone could change the class-to-id mapping relative to the
# classifier head, which was sized from label_encoder.classes_.
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)

# Create datasets and dataloaders
train_dataset = CodeDataset(train_inputs, train_labels_encoded)
val_dataset = CodeDataset(val_inputs, val_labels_encoded)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)



In [None]:
import torch
from torch.optim import AdamW  # Use AdamW from torch.optim
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

# Prefer the GPU when CUDA is available; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto that device and put it in training mode.
model.to(device)
model.train()

# Define accuracy function
def compute_accuracy(preds, labels):
    """Return the accuracy of arg-max predictions against the true labels.

    Args:
        preds: Tensor of per-class scores; the predicted class for each row is
            the arg-max along dim 1.
        labels: Tensor of true class ids.

    Returns:
        The value computed by ``accuracy_score`` on the two arrays.
    """
    predicted_classes = torch.argmax(preds, dim=1)
    # accuracy_score is fed host-side NumPy arrays, so copy both tensors to the CPU.
    y_true = labels.cpu().numpy()
    y_pred = predicted_classes.cpu().numpy()
    return accuracy_score(y_true, y_pred)

# Set the learning rate
learning_rate = 1e-5

# Initialize optimizer with the specified learning rate
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training parameters
epochs = 10  # You can adjust this as needed
# NOTE(review): never updated in this cell, because no validation pass is run here.
best_val_accuracy = 0

for epoch in range(epochs):
    model.train()
    # Accumulate sample-weighted sums so a smaller final batch does not skew
    # the epoch-level averages.
    total_accuracy = 0
    total_loss = 0
    total_samples = 0

    for batch in train_dataloader:
        # Move batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass; the batch carries 'labels', and the loss is read from the output
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()

        # Compute accuracy. A batch-local name is used so the notebook-level
        # `labels` list is not overwritten by a tensor.
        batch_labels = batch['labels']
        batch_size = batch_labels.size(0)
        accuracy = compute_accuracy(logits, batch_labels)

        total_loss += loss.item() * batch_size
        total_accuracy += accuracy * batch_size
        total_samples += batch_size

    avg_loss = total_loss / total_samples
    avg_accuracy = total_accuracy / total_samples

    print(f"Epoch {epoch + 1}: Train Loss: {avg_loss:.4f}, Train Accuracy: {avg_accuracy:.4f}")


Epoch 1: Train Loss: 1.4535, Train Accuracy: 0.4271
Epoch 2: Train Loss: 0.7526, Train Accuracy: 0.7422
Epoch 3: Train Loss: 0.5042, Train Accuracy: 0.8317
Epoch 4: Train Loss: 0.3359, Train Accuracy: 0.8894
Epoch 5: Train Loss: 0.2206, Train Accuracy: 0.9311
Epoch 6: Train Loss: 0.1555, Train Accuracy: 0.9531
Epoch 7: Train Loss: 0.1194, Train Accuracy: 0.9652
Epoch 8: Train Loss: 0.1100, Train Accuracy: 0.9649
Epoch 9: Train Loss: 0.0817, Train Accuracy: 0.9732
Epoch 10: Train Loss: 0.0689, Train Accuracy: 0.9757


In [None]:
# Persist only the model weights (its state_dict); keep a filename ending in .pth or .pt.
torch.save(obj=model.state_dict(), f='/content/drive/MyDrive/skandaks/codebert.pth')
