In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
from tensorflow.keras import models, layers
import warnings

warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv("C:\\Users\\ASUS\\Downloads\\sqli.csv", encoding='utf-16')

X = df['Sentence']
y = df['Label']



In [7]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
vectorizer = CountVectorizer(min_df = 2, max_df = 0.8, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(X.values.astype('U')).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3360, 4717)
(3360,)
(840, 4717)
(840,)


In [9]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
print(f"Accuracy of Logistic Regression on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of Logistic Regression on test set : {f1_score(y_pred, y_test)}")
confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

sensitivity = TP / float(FN + TP)

print("sensitivity=",sensitivity)
specificity = TN / (TN + FP)
print("specificity=",specificity)


Precision = TP / float(TP + FP)
Recall = TP / float(TP + FN)
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)
print ("Recall=", Recall)
print ("FMeasure", F1)


Accuracy of Logistic Regression on test set : 0.9273809523809524
F1 Score of Logistic Regression on test set : 0.8478802992518704
sensitivity= 0.748898678414097
specificity= 0.9934747145187602
Precision= 0.9770114942528736
Recall= 0.748898678414097
FMeasure 0.8478802992518704


In [10]:
from sklearn.ensemble import AdaBoostClassifier

In [11]:
lr_clf = AdaBoostClassifier(n_estimators=100)
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
print(f"Accuracy of AdaBoost Regression on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of AadaBoost Regression on test set : {f1_score(y_pred, y_test)}")
confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

sensitivity = TP / float(FN + TP)

print("sensitivity=",sensitivity)
specificity = TN / (TN + FP)
print("specificity=",specificity)


Precision = TP / float(TP + FP)
Recall = TP / float(TP + FN)
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)
print ("Recall=", Recall)
print ("FMeasure", F1)

Accuracy of AdaBoost Regression on test set : 0.9035714285714286
F1 Score of AadaBoost Regression on test set : 0.7885117493472584
sensitivity= 0.6651982378854625
specificity= 0.9918433931484503
Precision= 0.967948717948718
Recall= 0.6651982378854625
FMeasure 0.7885117493472584


In [12]:
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(f"Accuracy of Random Forest on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of Random Forest on test set : {f1_score(y_pred, y_test)}")
confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

sensitivity = TP / float(FN + TP)

print("sensitivity=",sensitivity)
specificity = TN / (TN + FP)
print("specificity=",specificity)


Precision = TP / float(TP + FP)
Recall = TP / float(TP + FN)
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)
print ("Recall=", Recall)
print ("FMeasure", F1)

Accuracy of Random Forest on test set : 0.9285714285714286
F1 Score of Random Forest on test set : 0.8832684824902725
sensitivity= 1.0
specificity= 0.9021207177814029
Precision= 0.7909407665505227
Recall= 1.0
FMeasure 0.8832684824902725


In [14]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
print(f"Accuracy of Naive Bayes on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of Naive Bayes on test set : {f1_score(y_pred, y_test)}")

confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

sensitivity = TP / float(FN + TP)

print("sensitivity=",sensitivity)
specificity = TN / (TN + FP)
print("specificity=",specificity)


Precision = TP / float(TP + FP)
print ("Precision=",Precision)

Accuracy of Naive Bayes on test set : 0.9833333333333333
F1 Score of Naive Bayes on test set : 0.9700854700854701
sensitivity= 1.0
specificity= 0.9771615008156607
Precision= 0.941908713692946


In [18]:
import xgboost as xgb

In [19]:
nb_clf = xgb.XGBClassifier(n_estimators=100)
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
print(f"Accuracy of XGBoost on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of XGBoost on test set : {f1_score(y_pred, y_test)}")
confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

sensitivity = TP / float(FN + TP)

print("sensitivity=",sensitivity)
specificity = TN / (TN + FP)
print("specificity=",specificity)


Precision = TP / float(TP + FP)
Recall = TP / float(TP + FN)
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)
print ("Recall=", Recall)
print ("FMeasure", F1)

Accuracy of XGBoost on test set : 0.8964285714285715
F1 Score of XGBoost on test set : 0.7667560321715818
sensitivity= 0.6299559471365639
specificity= 0.9951060358890701
Precision= 0.9794520547945206
Recall= 0.6299559471365639
FMeasure 0.7667560321715818
