In [1]:
import pandas as pd
# Read the tab-separated SQLiV3 file into a frame and show it as the cell output.
df = pd.read_csv("SQLiV3.tsv", delimiter="\t")
df

Unnamed: 0,payload,label
0,SELECT AVG ( gradually ) FROM load SELECT SUM ...,0
1,"guardia, la",0
2,SELECT * FROM ( SELECT highway FROM walk ),0
3,1'+ ( select 'wyxu' where 2555 = 2555 unio...,1
4,SELECT AVG ( war ) FROM layers SELECT SUM ( me...,0
...,...,...
30604,1' ) ) ( select ( case when ( ...,1
30605,SELECT * FROM rest ORDER BY constantly,0
30606,1' and 6510 = ( select count ( * ) f...,1
30607,-6720' ) ) ) or 5023 = ctxsys.dri...,1


In [2]:
import re
# Payload normalisation applied before tokenisation:
#   1. convert to str and lower-case,
#   2. replace every run of digits with a single "0",
#   3. surround each of the characters < > ! = with spaces.
_DIGIT_RUN = re.compile(r'\d+')
_COMPARISON_CHAR = re.compile(r'([<>!=])')


def clean_query(query):
    """Return the normalised form of one payload.

    Args:
        query: Raw payload; any object is accepted and converted with ``str``.

    Returns:
        str: Lower-cased text with digit runs collapsed to "0" and the
        characters ``< > ! =`` padded with one space on each side.
    """
    text = str(query).lower()
    text = _DIGIT_RUN.sub('0', text)
    return _COMPARISON_CHAR.sub(r' \1 ', text)


query_list = [clean_query(query) for query in df["payload"].tolist()]
# Assign the list itself. Wrapping it in pd.DataFrame made the assignment align on
# index labels, which only lines up with the rows while df has a default 0..n-1 index.
df["Cleaned_Query"] = query_list

In [3]:
# Count the rows whose label equals 0 by summing the boolean mask.
count_label_0 = int((df["label"] == 0).sum())
print(count_label_0)

19268


In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Pick the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
BASE_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BASE_CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Store 512 as max_length on the model config. The save_pretrained / push_to_hub
# outputs in this notebook report it as a non-default generation parameter
# ({'max_length': 512}).
# NOTE(review): the tokenised sequence length is set separately through the
# tokenizer's own max_length argument — confirm this config value is still needed.
model.config.max_length = 512

In [6]:
# Extract the cleaned queries and their labels from the frame as plain Python lists.
texts, labels = (df[column].tolist() for column in ("Cleaned_Query", "label"))

In [7]:
from torch.utils.data import Dataset
class SQLInjectionDataset(Dataset):
    """Map-style dataset that tokenises one query per item.

    Each item is a dict holding ``input_ids``, ``attention_mask`` and ``labels``
    tensors. The tokenizer is asked to pad/truncate to ``max_length``.

    Args:
        texts: Sequence of query strings.
        labels: Sequence of class ids, one per entry of ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``
            whose result exposes ``input_ids`` and ``attention_mask``.
        max_length: Token length requested from the tokenizer. Defaults to 512,
            the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return its tensors as a dict."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        return {
            # reshape(-1) keeps each encoding 1-D whether the tokenizer returned a
            # flat list or a single-row nested list, and, unlike squeeze(0), does
            # not collapse a length-1 sequence to a scalar.
            "input_ids": torch.tensor(encoded.input_ids, dtype=torch.long).reshape(-1),
            "attention_mask": torch.tensor(encoded.attention_mask, dtype=torch.long).reshape(-1),
            # Explicit int64 so consumers that pass the labels straight to the
            # model's loss do not depend on the Python type of the stored label.
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
        

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import DataLoader

# Single stratified 80/20 shuffle split with a fixed seed.
labels_tensor = torch.tensor(labels)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the splitter yields exactly one (train, test) pair of index arrays.
train_idx, test_idx = next(sss.split(texts, labels_tensor))

# Materialise each side of the split as parallel text/label lists.
train_texts, train_labels = [texts[i] for i in train_idx], [labels[i] for i in train_idx]
test_texts, test_labels = [texts[i] for i in test_idx], [labels[i] for i in test_idx]

train_dataset = SQLInjectionDataset(train_texts, train_labels, tokenizer)
test_dataset = SQLInjectionDataset(test_texts, test_labels, tokenizer)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [16]:
from tqdm import tqdm
from torch.optim import AdamW
# Fine-tune the classifier on train_loader and print the mean batch loss per epoch.
num_epochs = 1

# Use the GPU when one is available instead of assuming "cuda" exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# from_pretrained() returns the model in evaluation mode; switch to training mode
# so that dropout is active while fine-tuning.
model.train()

for i in tqdm(range(num_epochs)):
    epoch_loss = []  # per-batch losses of the current epoch
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(torch.long).to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = output.loss
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())
    print(f"Epoch {i}, Loss = {sum(epoch_loss)/len(epoch_loss)}")
        
        

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(

  0%|                                                                                 | 1/1531 [00:01<46:03,  1.81s/it][A
  0%|                                                                                 | 2/1531 [00:02<28:25,  1.12s/it][A
  0%|▏                                                                                | 3/1531 [00:03<31:01,  1.22s/it][A
  0%|▏                                                                                | 4/1531 [00:05<32:20,  1.27s/it][A
  0%|▎                                                                                | 5/1531 [00:06<33:10,  1.30s/it][A
  0%|▎                                                                                | 6/1531 [00:07<33:41,  1.33s/it][A
  0%|▎                                                                     

Epoch 0, Loss = 0.03269311216795991





In [1]:
from huggingface_hub import login
# Start the Hugging Face Hub login; in the notebook this renders the login widget
# shown in the cell output.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import torch
from transformers import  BertForSequenceClassification

# Reload the fine-tuned classifier from the local directory that the
# save_pretrained cell writes to.
# NOTE(review): that save cell appears further down in the notebook, so this cell
# only works after it has been run.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
SAVED_MODEL_DIR = "./sql_injection_detection_bert_2"
model = BertForSequenceClassification.from_pretrained(SAVED_MODEL_DIR)

In [3]:
# Upload the currently loaded model to the Hugging Face Hub repository named below.
model.push_to_hub("suhaibrashid17/SQL-Injection-Detection")

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Non-default generation parameters: {'max_length': 512}


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suhaibrashid17/SQL-Injection-Detection/commit/e832332be85680dbe25227559b76ee8e26e8a824', commit_message='Upload BertForSequenceClassification', commit_description='', oid='e832332be85680dbe25227559b76ee8e26e8a824', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# Mean of the per-batch losses that the training loop collected in epoch_loss.
sum(epoch_loss)/len(epoch_loss)

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the classifier on the held-out split: loss, accuracy, precision, recall, F1.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Use the GPU when one is available instead of assuming "cuda" exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

total_loss = 0
correct_predictions = 0
total_samples = 0
all_predictions = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_labels so the notebook-level `labels` list (built from
        # df["label"] and used by the split cell) is not overwritten by a tensor.
        batch_labels = batch["labels"].to(torch.long).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)

        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())

        correct_predictions += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

# Loss is averaged over batches; accuracy over individual samples.
avg_loss = total_loss / len(test_loader)
accuracy = correct_predictions / total_samples

precision = precision_score(all_labels, all_predictions, average='binary')
recall = recall_score(all_labels, all_predictions, average='binary')
f1 = f1_score(all_labels, all_predictions, average='binary')

print(f"Test Loss: {avg_loss}")
print(f"Test Accuracy: {accuracy * 100}%")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Test Loss: 0.012525553654086252
Test Accuracy: 99.68964390721986%
Precision: 0.9960299955888839
Recall: 0.9955908289241623
F1 Score: 0.9958103638368246


In [19]:
# Save the model to a local directory; the reload cell in this notebook reads the
# same path with from_pretrained.
model.save_pretrained("./sql_injection_detection_bert_2")

Non-default generation parameters: {'max_length': 512}


In [1]:
import gradio as gr
import torch
import re
from transformers import BertTokenizer, BertForSequenceClassification
import sqlvalidator
# Inference setup: choose the device, then load the tokenizer and the published classifier.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("suhaibrashid17/SQL-Injection-Detection")
model.config.max_length = 512
# Module.to() and Module.eval() each return the module, so the two calls chain.
model = model.to(device).eval()
def _normalize_query(sql_query):
    """Lower-case the query, collapse digit runs to "0" and space out < > ! =."""
    normalized = sql_query.lower()
    normalized = re.sub(r'\d+', '0', normalized)
    return re.sub(r'([<>!=])', r' \1 ', normalized)


# Predicted class index -> message shown to the user.
_VERDICTS = {0: "Not SQL Injection", 1: "SQL Injection"}


def detect_sql_injection(sql_query):
    """Classify a query string with the fine-tuned model.

    Args:
        sql_query: Text entered by the user.

    Returns:
        str: "SQL Query Not Valid" for blank input or input rejected by
        sqlvalidator, "Not SQL Injection" / "SQL Injection" for model classes
        0 / 1, "Unknown Classification" for any other class index, or
        "Error: <message>" if anything raises along the way.
    """
    try:
        # Blank or missing input has nothing to parse or classify.
        if sql_query is None or not sql_query.strip():
            return "SQL Query Not Valid"

        # NOTE(review): anything sqlvalidator rejects never reaches the model. The
        # injection rows in the training preview are fragments such as
        # "1' and 6510 = ( select ..." — confirm whether such payloads should be
        # rejected here or passed on to the classifier.
        if not sqlvalidator.parse(sql_query).is_valid():
            return "SQL Query Not Valid"

        # A single sequence needs no padding: the attention mask already hides pad
        # positions, so skipping them avoids computing over up to 512 tokens.
        encoded = tokenizer(
            _normalize_query(sql_query),
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        with torch.no_grad():
            output = model(
                input_ids=encoded.input_ids.to(device),
                attention_mask=encoded.attention_mask.to(device),
            )

        # Softmax does not change which entry is largest, so argmax on the raw
        # logits selects the same class.
        class_index = torch.argmax(output.logits, dim=1).item()
        return _VERDICTS.get(class_index, "Unknown Classification")
    except Exception as e:
        # Deliberate best-effort: surface the failure as text in the UI.
        return f"Error: {str(e)}"

# Build the Gradio UI: one text box in, one label out, wired to detect_sql_injection.
query_box = gr.Textbox(label="Enter SQL Query")
verdict_label = gr.Label(label="Classification")
interface = gr.Interface(
    fn=detect_sql_injection,
    inputs=query_box,
    outputs=verdict_label,
    title="SQL Injection Detection",
    description="Enter an SQL query to determine if it's a potential SQL injection attempt.",
)

# share=True additionally requests a public link (see "Running on public URL" in the output).
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7860


--------


Running on public URL: https://e036021b69c213239a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


