In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score



# New Section

In [2]:
# Load dataset (ensure y is binary: 0 or 1)
df = pd.read_csv("Combined_dataset.csv")
df.head()



Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1.0
1,create user name identified by pass123 tempora...,1.0
2,AND 1 = utl_inaddr.get_host_address ( ...,1.0
3,select * from users where id = '1' or @ @1 ...,1.0
4,"select * from users where id = 1 or 1#"" ( ...",1.0


In [3]:
df = df.dropna()

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Query'])


In [4]:
y=df['Label']
y.head()

Unnamed: 0,Label
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [5]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)




In [6]:
# Define base learners
gbm = GradientBoostingClassifier(n_estimators=100)
adaboost = AdaBoostClassifier(n_estimators=100)
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss")
lgbm = LGBMClassifier(n_estimators=100)



In [7]:
# Stacking Classifier for binary classification
stacking_clf = StackingClassifier(
    estimators=[
        ('gbm', gbm),
        ('adaboost', adaboost),
        ('xgb', xgb),
        ('lgbm', lgbm)
    ],
    final_estimator=LogisticRegression(solver='liblinear'),  # Binary classifier
    stack_method='predict_proba'  # Use probability outputs for better performance
)



In [8]:
# Train and evaluate
stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)
y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]  # Get probability for positive class



Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 9986, number of negative: 15629
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18065
[LightGBM] [Info] Number of data points in the train set: 25615, number of used features: 557
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389850 -> initscore=-0.447944
[LightGBM] [Info] Start training from score -0.447944


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 7989, number of negative: 12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14745
[LightGBM] [Info] Number of data points in the train set: 20492, number of used features: 423
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389859 -> initscore=-0.447903
[LightGBM] [Info] Start training from score -0.447903




[LightGBM] [Info] Number of positive: 7989, number of negative: 12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14766
[LightGBM] [Info] Number of data points in the train set: 20492, number of used features: 416
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389859 -> initscore=-0.447903
[LightGBM] [Info] Start training from score -0.447903




[LightGBM] [Info] Number of positive: 7989, number of negative: 12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14764
[LightGBM] [Info] Number of data points in the train set: 20492, number of used features: 419
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389859 -> initscore=-0.447903
[LightGBM] [Info] Start training from score -0.447903




[LightGBM] [Info] Number of positive: 7989, number of negative: 12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14823
[LightGBM] [Info] Number of data points in the train set: 20492, number of used features: 428
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389859 -> initscore=-0.447903
[LightGBM] [Info] Start training from score -0.447903




[LightGBM] [Info] Number of positive: 7988, number of negative: 12504
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14758
[LightGBM] [Info] Number of data points in the train set: 20492, number of used features: 422
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389811 -> initscore=-0.448108
[LightGBM] [Info] Start training from score -0.448108




In [9]:
# Metrics for binary classification
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)



In [10]:
print(f"Stacking Ensemble Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"F1 Score: {f1:.4f}")

Stacking Ensemble Accuracy: 0.9963
ROC AUC Score: 0.9987
F1 Score: 0.9952


In [12]:
# Take user input
user_input = input("Enter an SQL query: ")

# Transform using the same vectorizer
input_vector = vectorizer.transform([user_input])

# Predict using the trained model
prediction = stacking_clf.predict(input_vector)[0]
prediction_proba = stacking_clf.predict_proba(input_vector)[0][1]

# Output prediction
if prediction == 1:
    print(f"\n🔴 Predicted: SQL Injection (Risk Score: {prediction_proba:.4f})")
else:
    print(f"\n🟢 Predicted: Safe Query (Risk Score: {prediction_proba:.4f})")

Enter an SQL query: "SELECT * FROM users WHERE username = 'admin' AND CAST('abc' AS int) = 1"; 1

🔴 Predicted: SQL Injection (Risk Score: 0.9972)




In [None]:

import os
import pickle

# Ensure the trained_models directory exists
os.makedirs("trained_models", exist_ok=True)

# Save the trained stacking model
with open("trained_models/model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

# Save the TfidfVectorizer
with open("trained_models/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully.")
