In [2]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [None]:
# Read the UTF-16 encoded CSV and discard every row that has a missing value,
# in a single expression so re-running the cell always starts from the file.
df = pd.read_csv(
    "/content/sqli.csv",
    encoding='utf-16',
).dropna()


In [None]:
# Raw query text (features) and its label (target), as NumPy arrays.
X, y = df['Sentence'].values, df['Label'].values

In [None]:
def tokenize_sql_query(query):
    """Split a query into whitespace-separated tokens.

    Args:
        query: The query text. Non-string values (for example a cell that the
            CSV parser turned into a number) are converted with ``str`` first.
            ``None`` and float NaN are treated as "no text".

    Returns:
        list[str]: The tokens in their original order; an empty list when the
        input is empty, whitespace-only, ``None`` or NaN.
    """
    if query is None:
        return []
    if not isinstance(query, str):
        # NaN is the only float that compares unequal to itself.
        if isinstance(query, float) and query != query:
            return []
        query = str(query)
    return query.split()

In [None]:
# One token list per query, in the same order as X.
X_tokens = list(map(tokenize_sql_query, X))

In [None]:
# Train token embeddings on the tokenized queries and persist them to disk.
word2vec_model = Word2Vec(
    sentences=X_tokens,
    vector_size=100,  # embedding dimensionality
    window=5,
    min_count=1,      # keep every token, even ones seen only once
    workers=4,
)
word2vec_model.save("word2vec_sqli.model")


In [None]:
def get_average_word2vec(tokens_list, model, embedding_dim=100):
    """Average the embedding vectors of the tokens known to ``model``.

    Args:
        tokens_list: Iterable of tokens for one query.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` (the trained Word2Vec model in this notebook).
        embedding_dim: Length of the zero vector returned when none of the
            tokens has an embedding.

    Returns:
        numpy.ndarray: Element-wise mean of the found vectors, or a zero
        vector of length ``embedding_dim`` if no token was found.
    """
    found = []
    for token in tokens_list:
        # Tokens without an embedding are skipped rather than raising.
        if token in model.wv:
            found.append(model.wv[token])

    if not found:
        return np.zeros(embedding_dim)  # nothing to average
    return np.mean(found, axis=0)

# Feature matrix: one averaged embedding vector per query.
X_word2vec = np.array([
    get_average_word2vec(query_tokens, word2vec_model)
    for query_tokens in X_tokens
])


In [None]:
# 'balanced' weights are inversely proportional to class frequency; map each
# label to its weight so the dict can be handed to Keras via `class_weight`.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {
    label: weight
    for label, weight in zip(np.unique(y), class_weights)
}


In [None]:
def create_model(input_shape, learning_rate=6.204036987017244e-05, l2_strength=0.01):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    Architecture: three sequence-returning Bidirectional LSTM layers
    (64, 96, 128 units), each followed by dropout, then a final Bidirectional
    LSTM (128 units) that returns only its last output, a 32-unit ReLU dense
    layer and a single sigmoid output unit.

    Args:
        input_shape: Shape of one sample without the batch axis, as a
            ``(timesteps, features)`` tuple.
        learning_rate: Step size for the Adam optimizer. The default is the
            value that used to be hard-coded here (presumably the result of a
            hyperparameter search -- TODO confirm).
        l2_strength: L2 penalty factor applied to the kernel weights of every
            LSTM layer and of the hidden dense layer.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        reporting accuracy.
    """
    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` through the Bidirectional wrapper (deprecated in Keras 3).
    model.add(Input(shape=input_shape))

    # (units, dropout rate) for the LSTM layers that keep the full sequence.
    for units, dropout_rate in ((64, 0.2), (96, 0.2), (128, 0.1)):
        model.add(Bidirectional(
            LSTM(units, return_sequences=True, kernel_regularizer=l2(l2_strength))
        ))
        model.add(Dropout(dropout_rate))

    # Last recurrent layer collapses the sequence into a single vector.
    model.add(Bidirectional(LSTM(128, kernel_regularizer=l2(l2_strength))))

    model.add(Dense(32, activation='relu', kernel_regularizer=l2(l2_strength)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model


In [None]:
# Both callbacks below monitor `val_loss`, which Keras only computes when
# validation data is supplied. Hold out a stratified 20% split so the metric
# exists; otherwise early stopping and learning-rate reduction never trigger.
# A stratified, shuffled split is used instead of `validation_split`, which
# takes the trailing rows as-is and could yield a single-class validation set
# if the CSV is ordered by label.
X_train, X_val, y_train, y_val = train_test_split(
    X_word2vec, y, test_size=0.2, stratify=y, random_state=42
)

# Each embedding dimension is fed to the LSTM as one timestep with one feature.
model = create_model((X_word2vec.shape[1], 1))
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6),
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ],
    class_weight=class_weight_dict,
    verbose=1,
)


In [None]:
# Predicted probability of the positive class for every query.
# NOTE(review): X_word2vec contains the samples the model was fitted on, so
# anything derived from these scores is not a held-out estimate.
y_pred_prob = model.predict(x=X_word2vec)

In [None]:
# ROC curve points for every candidate threshold, and the area under it.
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred_prob)
roc_auc = auc(x=fpr, y=tpr)


In [None]:
# Youden's J statistic (TPR - FPR) per threshold; pick the threshold where it
# is largest (first occurrence on ties).
j_scores = np.subtract(tpr, fpr)
optimal_threshold_index = j_scores.argmax()
optimal_threshold = thresholds[optimal_threshold_index]
print(f"Optimal Threshold: {optimal_threshold}")


In [None]:
# roc_curve defines each threshold as "predict positive when
# score >= threshold", so the cut-off must be inclusive. A strict `>` would
# drop the samples that sit exactly on the chosen threshold and the resulting
# predictions would no longer match the ROC point that was selected.
y_pred = (y_pred_prob >= optimal_threshold).astype(int)

In [None]:
# Fraction of queries whose thresholded prediction matches the label.
accuracy = accuracy_score(y_true=y, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall, F1 and support as a text table.
class_report = classification_report(y_true=y, y_pred=y_pred)
print(f"Classification Report:\n{class_report}")


In [None]:
# Persist the trained network to disk; the `.h5` extension makes Keras write
# the legacy HDF5 format.
model.save(filepath='trained_lstm_model.h5')


