# SQL Injection Detection using NLP

**Project by RSK World**  
**Founder:** Molla Samser  
**Designer & Tester:** Rima Khatun  
**Contact:** help@rskworld.in | support@rskworld.in  
**Phone:** +91 93305 39277  
**Location:** Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147  
**Website:** https://rskworld.in

## Project Description

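This notebook walks through building a machine learning model that detects SQL injection attempts using natural language processing techniques: exploring the training data, extracting features from a sample query, training and comparing candidate models, and running the saved detector on example queries.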


In [None]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# NOTE: with no category/module filter this silences *every* warning for the
# rest of the session, not just noisy library ones.
warnings.filterwarnings('ignore')

# Add parent directory to path
# '..' is resolved against the current working directory, so the `src`
# imports below only work when the notebook is run from a folder that sits
# next to `src/` (presumably a notebooks/ directory -- confirm).
sys.path.append('..')

# Import project modules
# These must come after the sys.path change above.
from src.feature_extractor import SQLFeatureExtractor
from src.model_trainer import SQLInjectionModelTrainer
from src.detector import SQLInjectionDetector

print("Libraries imported successfully!")


## 1. Load and Explore Data


In [None]:
# Read the training samples into a DataFrame
df = pd.read_csv('../data/training_data.csv')

# Report the overall size, then how many rows carry each label
print(f"Dataset shape: {df.shape}")
print("\nLabel distribution:")
print(df['label'].value_counts())

# Leave the preview as the cell's final expression so the notebook renders it
print("\nFirst few samples:")
df.head(10)


## 2. Feature Extraction


In [None]:
# Create the extractor used to turn a query string into feature values
feature_extractor = SQLFeatureExtractor()

# Demonstrate it on a query containing an always-true `OR 1=1` condition
sample_query = "SELECT * FROM users WHERE id = 1 OR 1=1"
features = feature_extractor.extract_all_features(sample_query)
feature_names = feature_extractor.get_feature_names()

print(f"Sample Query: {sample_query}")
print(f"\nExtracted {len(features)} features:")

# List only the features whose value is non-zero for this query
for name, value in zip(feature_names, features):
    if value == 0:
        continue
    print(f"  {name}: {value}")


## 3. Train Model


In [None]:
# Create the trainer object used for loading, splitting and fitting
trainer = SQLInjectionModelTrainer()

# Load inputs/labels from the CSV, then obtain the train and test partitions
X, y = trainer.load_data('../data/training_data.csv')
X_train, X_test, y_train, y_test = trainer.prepare_data(X, y)

# Show the shape of each partition, one per line
print(
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    sep="\n",
)


In [None]:
# Train models
# Fits the trainer's candidate models on the training partition; the shape of
# the returned `models` object is defined by the trainer and not visible here.
models = trainer.train_models(X_train, y_train)

# Evaluate models
# Scores the trained models on the held-out test partition. The plotting cell
# later reads `results[name]['accuracy']` and `results[name]['f1_score']`, so
# `results` is expected to be keyed by model name.
results = trainer.evaluate_models(models, X_test, y_test)


## 4. Model Evaluation and Visualization


In [None]:
# Collect the per-model metrics in a fixed model order
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
f1_scores = [results[name]['f1_score'] for name in model_names]

# Two side-by-side panels: accuracy on the left, F1 on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, values, bar colour, title, y-axis label) for each panel
panels = (
    (ax1, accuracies, 'skyblue', 'Model Accuracy Comparison', 'Accuracy'),
    (ax2, f1_scores, 'lightcoral', 'Model F1-Score Comparison', 'F1-Score'),
)
for ax, scores, bar_color, title, y_label in panels:
    ax.bar(model_names, scores, color=bar_color)
    ax.set_title(title)
    ax.set_ylabel(y_label)
    # Both metrics are plotted on a shared 0..1 scale so the panels compare directly
    ax.set_ylim([0, 1])
    # Tilt the model names so longer labels do not overlap
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## 5. Save Best Model and Test Detection System


In [None]:
# Artifact locations shared by the save step and the detector
model_path = '../models/sql_injection_model.pkl'
scaler_path = '../models/scaler.pkl'

# Persist the model chosen by the trainer, together with the scaler file
best_model = trainer.select_best_model(results)
trainer.save_model(best_model, model_path, scaler_path)

# Build the detector from the artifacts that were just written
detector = SQLInjectionDetector(model_path, scaler_path)

# Sample queries: two plain lookups, one with `OR 1=1`, one with a trailing `--`
test_queries = [
    "SELECT * FROM users WHERE id = 1",
    "SELECT * FROM users WHERE id = 1 OR 1=1",
    "SELECT * FROM users WHERE name = 'admin'--",
    "SELECT * FROM products WHERE price > 100",
]

rule = "-" * 70

print("Testing Detection System:")
print("=" * 70)
for query in test_queries:
    result = detector.detect(query)
    # Map the detector's boolean verdict to a human-readable status
    if result['is_injection']:
        status = "⚠️ INJECTION"
    else:
        status = "✅ SAFE"
    print(f"\nQuery: {query}")
    print(f"Status: {status}")
    print(f"Confidence: {result['confidence']:.2%}")
    print(rule)
