In [1]:
!pip install transformers scikit-learn torch



In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


In [3]:
# Load the dataset: each non-empty line is expected to look like
# "<code snippet>:::::<integer label>".
code_snippets = []
labels = []
with open('datasets.csv', 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of materialising it with readlines().
    for line_number, raw_line in enumerate(file, start=1):
        # Blank lines (e.g. a trailing newline at EOF) carry no sample.
        if not raw_line.strip():
            continue
        # separate the data by ::::: delimiter
        fields = raw_line.split(':::::')
        if len(fields) < 2:
            raise ValueError(
                f"datasets.csv line {line_number}: missing ':::::' delimiter"
            )
        code_snippets.append(fields[0])
        # int() tolerates the trailing newline / surrounding whitespace.
        labels.append(int(fields[1]))

In [4]:
from sklearn.utils import shuffle
# Shuffle snippets and labels together with a fixed seed so the order is
# reproducible. NOTE(review): this rebinds `code_snippets` / `labels` from
# themselves, so re-running the cell shuffles the already shuffled lists again.
code_snippets, labels = shuffle(code_snippets, labels, random_state=0)

In [5]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    code_snippets,
    labels,
    test_size=0.2,
    random_state=32,
)


In [6]:
# Dataset wrapper that tokenizes samples on demand
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        texts: Indexable collection of raw text samples.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable invoked with the text and the keyword arguments
            shown in ``__getitem__``; its result must support
            ``['input_ids']`` and ``['attention_mask']`` lookups.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus the label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both with
            the first axis squeezed away) and ``'labels'`` (a ``torch.long``
            scalar tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenized = self.tokenizer(
            sample_text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )

        # The tokenizer output carries a leading axis of size 1; drop it so the
        # DataLoader can stack individual samples into a batch.
        item = {
            key: tokenized[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

In [7]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


In [8]:
# Training hyperparameters
batch_size = 16
max_length = 512      # token budget per snippet (used for truncation and padding)
learning_rate = 5e-5
num_epochs = 20       # upper bound on the number of training epochs

# Pretrained BERT tokenizer plus a classification head sized to the dataset.
# NOTE(review): len(set(labels)) equals the required head size only if the
# label ids are contiguous starting at 0 - confirm against the dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_classes = len(set(labels))
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Create DataLoader for training and validation sets.
# Training batches are reshuffled every epoch; validation order stays fixed.
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function.
# transformers.AdamW is deprecated in favour of the PyTorch implementation;
# eps and weight_decay are set explicitly to the values the transformers
# optimizer used by default so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# Kept for reference: no cell shown in this notebook uses `loss_fn`; the
# training loop reads the loss from the model output instead.
loss_fn = torch.nn.CrossEntropyLoss()




In [10]:
# Training loop: fine-tune for up to `num_epochs` epochs, validating after each
# one and stopping early once validation accuracy reaches the target.
TARGET_VAL_ACCURACY = 0.94

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately NOT named `labels`: that would overwrite the
        # notebook-level list of dataset labels that other cells read.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    val_preds = []
    val_true = []
    # No gradients are needed anywhere in the validation pass.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Move predictions back to CPU before handing them to NumPy.
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            val_preds.extend(preds)
            val_true.extend(batch['labels'].cpu().numpy())

    print("Epoch:", epoch + 1)
    print("Validation Report:")
    # zero_division=0 reports 0.0 for classes that were never predicted,
    # without emitting an UndefinedMetricWarning for each of them.
    print(classification_report(val_true, val_preds, zero_division=0))
    if accuracy_score(val_true, val_preds) >= TARGET_VAL_ACCURACY:
        break

Epoch: 1
Validation Report:
              precision    recall  f1-score   support

           0       0.38      1.00      0.55        19
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.38        50
   macro avg       0.05      0.12      0.07        50
weighted avg       0.14      0.38      0.21        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2
Validation Report:
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        19
           1       0.67      0.44      0.53         9
           2       0.50      0.29      0.36         7
           3       0.25      1.00      0.40         1
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.52        50
   macro avg       0.24      0.34      0.25        50
weighted avg       0.40      0.52      0.42        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3
Validation Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90        19
           1       1.00      0.89      0.94         9
           2       0.75      0.86      0.80         7
           3       0.50      1.00      0.67         1
           4       1.00      0.83      0.91         6
           5       0.75      1.00      0.86         3
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.84        50
   macro avg       0.60      0.70      0.63        50
weighted avg       0.77      0.84      0.80        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 4
Validation Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90        19
           1       1.00      0.89      0.94         9
           2       0.86      0.86      0.86         7
           3       0.50      1.00      0.67         1
           4       1.00      1.00      1.00         6
           5       0.75      1.00      0.86         3
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.86        50
   macro avg       0.62      0.72      0.65        50
weighted avg       0.79      0.86      0.82        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 5
Validation Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.90      1.00      0.95         9
           2       1.00      0.86      0.92         7
           3       0.33      1.00      0.50         1
           4       0.75      1.00      0.86         6
           5       1.00      1.00      1.00         3
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.86        50
   macro avg       0.62      0.73      0.65        50
weighted avg       0.82      0.86      0.83        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 6
Validation Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.90      1.00      0.95         9
           2       0.88      1.00      0.93         7
           3       1.00      1.00      1.00         1
           4       0.86      1.00      0.92         6
           5       0.75      1.00      0.86         3
           6       0.00      0.00      0.00         4
           7       1.00      1.00      1.00         1

    accuracy                           0.90        50
   macro avg       0.79      0.87      0.83        50
weighted avg       0.83      0.90      0.86        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 7
Validation Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         7
           3       0.50      1.00      0.67         1
           4       0.86      1.00      0.92         6
           5       1.00      1.00      1.00         3
           6       1.00      0.50      0.67         4
           7       1.00      1.00      1.00         1

    accuracy                           0.96        50
   macro avg       0.92      0.94      0.91        50
weighted avg       0.97      0.96      0.96        50



In [11]:
# 0: No vulnerability, 1: XSS, 2: Command/Code Injection, 3: Prototype Pollution, 4: File Inclusion, 5: SQL Injection, 6: Bypassing input validation, 7: Excessive data exposure

# Prediction
new_code_snippet = ["$a = $_GET['id']; $sql = `SELECT * FROM users WHERE id = ${a}`"]
encoding = tokenizer(new_code_snippet, add_special_tokens=True, truncation=True, max_length=max_length, padding='max_length', return_tensors='pt').to(device)

# Switch to inference mode explicitly (disables dropout) so this cell does not
# depend on whatever mode a previously executed cell left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

# argmax over the class axis yields one predicted class id per input snippet.
logits_cpu = logits.detach().cpu().numpy()
prediction = np.argmax(logits_cpu, axis=1)
print("Prediction:", prediction)

Prediction: [5]


In [12]:
# 0: No vulnerability, 1: XSS, 2: Command/Code Injection, 3: Prototype Pollution, 4: File Inclusion, 5: SQL Injection, 6: Bypassing input validation, 7: Excessive data exposure

# Prediction
# NOTE(review): this snippet builds its query with prepare() and `?`
# placeholders, so a class-5 (SQL Injection) prediction here would look like a
# false positive - worth confirming against the intended labelling.
new_code_snippet = ["$username = $_POST['username']; $password = $_POST['password']; $stmt = $connection->prepare(\"SELECT * FROM users WHERE username=? AND password=?\");"]
encoding = tokenizer(new_code_snippet, add_special_tokens=True, truncation=True, max_length=max_length, padding='max_length', return_tensors='pt').to(device)

# Switch to inference mode explicitly (disables dropout) so this cell does not
# depend on whatever mode a previously executed cell left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

# argmax over the class axis yields one predicted class id per input snippet.
logits_cpu = logits.detach().cpu().numpy()
prediction = np.argmax(logits_cpu, axis=1)
print("Prediction:", prediction)

Prediction: [5]


In [18]:
# 0: No vulnerability, 1: XSS, 2: Command/Code Injection, 3: Prototype Pollution, 4: File Inclusion, 5: SQL Injection, 6: Bypassing input validation, 7: Excessive data exposure

# Prediction
new_code_snippet = ["$a = $_GET['a']; echo '<p>test</p> ' . $a;"]
encoding = tokenizer(new_code_snippet, add_special_tokens=True, truncation=True, max_length=max_length, padding='max_length', return_tensors='pt').to(device)

# Switch to inference mode explicitly (disables dropout) so this cell does not
# depend on whatever mode a previously executed cell left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

# argmax over the class axis yields one predicted class id per input snippet.
logits_cpu = logits.detach().cpu().numpy()
prediction = np.argmax(logits_cpu, axis=1)
print("Prediction:", prediction)

Prediction: [1]


In [14]:
# 0: No vulnerability, 1: XSS, 2: Command/Code Injection, 3: Prototype Pollution, 4: File Inclusion, 5: SQL Injection, 6: Bypassing input validation, 7: Excessive data exposure

new_code_snippet = ["$a = $_GET['a']; shell_exec(\"cat flag.txt \" . escapeshellcmd($a));"]
encoding = tokenizer(new_code_snippet, add_special_tokens=True, truncation=True, max_length=max_length, padding='max_length', return_tensors='pt').to(device)

# Switch to inference mode explicitly (disables dropout) so this cell does not
# depend on whatever mode a previously executed cell left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

# argmax over the class axis yields one predicted class id per input snippet.
logits_cpu = logits.detach().cpu().numpy()
prediction = np.argmax(logits_cpu, axis=1)
print("Prediction:", prediction)

Prediction: [2]


In [15]:
# 0: No vulnerability, 1: XSS, 2: Command/Code Injection, 3: Prototype Pollution, 4: File Inclusion, 5: SQL Injection, 6: Bypassing input validation, 7: Excessive data exposure

new_code_snippet = ["include($_GET['s'] . \".php\");"]
encoding = tokenizer(new_code_snippet, add_special_tokens=True, truncation=True, max_length=max_length, padding='max_length', return_tensors='pt').to(device)

# Switch to inference mode explicitly (disables dropout) so this cell does not
# depend on whatever mode a previously executed cell left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

# argmax over the class axis yields one predicted class id per input snippet.
logits_cpu = logits.detach().cpu().numpy()
prediction = np.argmax(logits_cpu, axis=1)
print("Prediction:", prediction)

Prediction: [4]


In [19]:
# Save the tokenizer next to the fine-tuned weights so the exported directory
# can be reloaded on its own. The tokenizer is written first so that the cell's
# last expression stays the model save and the cell produces no output.
tokenizer.save_pretrained("bloom_model")
model.save_pretrained("bloom_model")

In [20]:
import shutil

# Zip the contents of the saved model directory; the call evaluates to the
# path of the archive it created, which the notebook displays as cell output.
shutil.make_archive(
    base_name="bloom_model",
    format="zip",
    root_dir="bloom_model",
)

'/content/bloom_model.zip'

In [21]:
# Colab-only step: `google.colab` is importable only inside a Google Colab
# runtime. Presumably this hands the zipped model to the browser as a file
# download - it will not work in a plain Jupyter environment.
from google.colab import files
files.download('bloom_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>