In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from tensorflow.keras.layers import Bidirectional

In [86]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
df=pd.read_csv("/content/sqli.csv", encoding='utf-16')
df.head()

Unnamed: 0,Sentence,Label
0,select * from users where id = 1 or ( \ ) =...,1
1,select * from users where id = 1 or ( \. ) ...,1
2,select * from users where id = 1 or ( \+ ) ...,1
3,select * from users where id = 1 or ( 1 ) =...,1
4,select * from users where id = 1 or ( \+ ) ...,1


In [88]:
df = df.dropna()

In [89]:
X = df['Sentence'].values
y = df['Label']

In [90]:
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

In [91]:

X_pad = pad_sequences(X_seq, padding='post')

In [92]:
input_length = X.shape[0]  # Get the size of the first (and only) dimension

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.3, random_state=42,shuffle=True)

In [94]:
vocab_size = len(tokenizer.word_index) + 1  # Ensure vocab size includes padding and OOV token
embedding_dim = 100

In [102]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))



In [103]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [104]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/5
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 3s/step - accuracy: 0.7916 - loss: 0.5086 - val_accuracy: 0.9420 - val_loss: 0.1392
Epoch 2/5
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 3s/step - accuracy: 0.9457 - loss: 0.1179 - val_accuracy: 0.9585 - val_loss: 0.0790
Epoch 3/5
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 3s/step - accuracy: 0.9660 - loss: 0.0677 - val_accuracy: 0.9690 - val_loss: 0.0611
Epoch 4/5
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 3s/step - accuracy: 0.9775 - loss: 0.0491 - val_accuracy: 0.9717 - val_loss: 0.0649
Epoch 5/5
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 3s/step - accuracy: 0.9843 - loss: 0.0432 - val_accuracy: 0.9750 - val_loss: 0.0500


<keras.src.callbacks.history.History at 0x7c77cc574970>

In [105]:
# Evaluate on the test data
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)  # Convert predictions to 0 or 1

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 558ms/step


In [106]:
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.9749670619235836


In [107]:
y_pred_train = (model.predict(X_train) > 0.5)

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 544ms/step


In [108]:
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Train Accuracy: {test_accuracy}")

Train Accuracy: 0.9749670619235836


In [109]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1124
           1       0.99      0.92      0.95       394

    accuracy                           0.97      1518
   macro avg       0.98      0.96      0.97      1518
weighted avg       0.98      0.97      0.97      1518



In [110]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1119,    5],
       [  33,  361]])

In [111]:
from sklearn.metrics import classification_report

print("Training Classification Report:")
print(classification_report(y_train, y_pred_train))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred))

Training Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2684
           1       1.00      0.96      0.98       856

    accuracy                           0.99      3540
   macro avg       0.99      0.98      0.98      3540
weighted avg       0.99      0.99      0.99      3540

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1124
           1       0.99      0.92      0.95       394

    accuracy                           0.97      1518
   macro avg       0.98      0.96      0.97      1518
weighted avg       0.98      0.97      0.97      1518



In [112]:
overlap = np.intersect1d(y_train.index, y_test.index)
if len(overlap) > 0:
    print(f"Overlap found: {overlap}")
else:
    print("No overlap between training and testing datasets.")

No overlap between training and testing datasets.
