# Import the necessary modules

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
import pickle

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


In [2]:
def custom_tokenizer(text):
    patterns = {
        "SQL_Keywords": r'\b(SELECT|FROM|WHERE|INSERT INTO|VALUES|UPDATE|SET|AND|OR)\b',
        "Malicious_Patterns": r'\b(utl_inaddr.get_host_address|SLEEP\()',
        "Specific_Functions": r'\b(dbms_pipe.receive_message)\b',
    }

    pattern_counts = {pattern_name: len(re.findall(pattern, text, re.IGNORECASE)) for pattern_name, pattern in patterns.items()}
    
    return " ".join([f"{name}:{count}" for name, count in pattern_counts.items()])



# OCC

## One Class SVM

In [3]:
df = pd.read_csv("imbalanced_dataset.tsv",delimiter='\t')

df = df.dropna()

X = df['Query']
y = df['Label']

In [4]:
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=False)
X = vectorizer.fit_transform(df['Query'])



In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
model = OneClassSVM(gamma='scale', nu=0.1)
X_train = X_train[y_train==0]
model.fit(X_train)


In [34]:
y_pred = model.predict(X_test)
y_test[y_test == 0] = 1
y_test[y_test == 1] = -1

In [8]:
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

In [9]:
sensitivity= TN/(TN+TP);
print("Sensitivity:",sensitivity)
precision = TP / float(TP + FP)
recall = TP / float(TP + FN)
print ("Precision:",precision)
print ("Recall:", recall)
F1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: %.3f" % F1_score)

Sensitivity: 0.9910313901345291
Precision: 0.22764227642276422
Recall: 0.9032258064516129
F1 Score: 0.364


## Isolation forest

In [10]:
df = pd.read_csv("imbalanced_dataset.tsv",delimiter='\t')

df = df.dropna()

X = df['Query']
y = df['Label']

In [11]:
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=False)
X = vectorizer.fit_transform(df['Query'])



In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [13]:
model = IsolationForest(contamination=0.1)
X_train = X_train[y_train==0]
model.fit(X_train)


In [14]:
y_pred = model.predict(X_test)
y_test[y_test == 0] = 1
y_test[y_test == 1] = -1

In [15]:
cm = confusion_matrix(y_test, y_pred)

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

In [16]:
sensitivity= TN/(TN+TP);
print("Sensitivity:",sensitivity)
precision = TP / float(TP + FP)
recall = TP / float(TP + FN)
print ("Precision:",precision)
print ("Recall:", recall)
F1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: %.3f" % F1_score)

Sensitivity: 0.9893169310456459
Precision: 0.20245398773006135
Recall: 0.9705882352941176
F1 Score: 0.335


## Local Outlier Factor

In [17]:
df = pd.read_csv("imbalanced_dataset.tsv",delimiter='\t')

df = df.dropna()

X = df['Query']
y = df['Label']

In [18]:
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=False)
X = vectorizer.fit_transform(df['Query'])



In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
model = LocalOutlierFactor(contamination=0.0003,novelty=True)

In [21]:
X_train = X_train[y_train==0]
model.fit(X_train)

In [22]:
y_pred = model.predict(X_test)
y_test[y_test == 0] = 1
y_test[y_test == 1] = -1

In [23]:
cm = confusion_matrix(y_test, y_pred)

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

In [24]:
sensitivity= TN/(TN+TP);
print("Sensitivity:",sensitivity)
precision = TP / float(TP + FP)
recall = TP / float(TP + FN)
print ("Precision:",precision)
print ("Recall:", recall)
F1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: %.3f" % F1_score)

Sensitivity: 0.9875156054931336
Precision: 0.03541912632821724
Recall: 1.0
F1 Score: 0.068


# Saving the best performing model

In [25]:
# save the model to disk
model_filename = 'sqli_model.h5'
pickle.dump(model, open(model_filename, 'wb'))

In [26]:
# saving the vocabulary to the disk
voc_output_filename = 'sqli_vocabulary.pkl'
pickle.dump(vectorizer.vocabulary_,open(voc_output_filename,'wb'))

In [27]:
# load the model from disk
loaded_model = pickle.load(open(model_filename, 'rb'))

In [28]:
user_input_array=["	' AND 1 = utl_inaddr.get_host_address  (  (  SELECT SYS.DATABASE_NAME FROM DUAL  )  )   AND 'i' = 'i "]

In [29]:
input_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_)
vectorized_input = input_vectorizer.fit_transform(user_input_array).toarray()




In [30]:
vectorized_input=vectorized_input.reshape(1,-1)

In [31]:
predictions=loaded_model.predict(vectorized_input)

In [32]:
predictions[0]

-1

In [33]:
input = ["carte"]
vect_input = vectorizer.transform(input)
print(vect_input.toarray());

[[2 3 0 0 0 0 0 0 0 0 0 3 1 1 1 1 1 1 2 3 2 4 1 3 1 5 1 3 3 1 2 4 3 2 1 1]]
