In [1]:
# ---- Import libraries ----
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import urllib.parse
import pickle

In [2]:
# Suspicious keywords; ExtractFeatures counts their occurrences (case-insensitively)
# in both the decoded URL path and the decoded request body.
badwords = [
    'sleep',
    'uid',
    'select',
    'waitfor',
    'delay',
    'system',
    'union',
    'order by',
    'group by',
    'admin',
    'drop',
    'script',
]

In [3]:
# Feature extraction function
def _to_text(value):
    """Return ``value`` as a string, mapping missing values (None / NaN) to ''."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself. Without this guard
    # str(NaN) yields the literal text "nan" (e.g. for an empty CSV cell loaded
    # by pandas), which would be counted as three real characters.
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def ExtractFeatures(path, body, bad_words=None):
    """Turn one HTTP request (path + body) into a fixed-length numeric feature list.

    Parameters
    ----------
    path : object
        Request path / URL. Converted to text; ``None`` and NaN count as empty.
    body : object
        Request body. Converted to text; ``None`` and NaN count as empty.
    bad_words : iterable of str, optional
        Lower-case keywords to count in the decoded text. When omitted, the
        module-level ``badwords`` list is used.

    Returns
    -------
    list of int
        Twelve values, in this order:
        single quotes, double quotes, ``--`` sequences, opening parentheses,
        spaces, raw ``%`` count (0 unless more than 3), semicolons,
        angle brackets (``<`` and ``>``), special characters (``$``, ``&``, ``|``),
        decoded path length, decoded body length, bad-word occurrences.
        All counts except the raw ``%`` count are taken after URL-decoding.
    """
    path, body = _to_text(path), _to_text(body)
    words = badwords if bad_words is None else bad_words

    # Percent signs are counted on the raw (still encoded) text; a handful is
    # normal, so the count only becomes a signal above three occurrences.
    raw_percentages = (path + body).count("%")
    raw_percentages_count = raw_percentages if raw_percentages > 3 else 0

    # unquote_plus also turns '+' into a space, so spaces are counted post-decoding.
    path_decoded = urllib.parse.unquote_plus(path)
    body_decoded = urllib.parse.unquote_plus(body)
    decoded_parts = (path_decoded, body_decoded)

    def occurrences(token):
        # Non-overlapping occurrences of ``token`` across path and body.
        return sum(part.count(token) for part in decoded_parts)

    single_q = occurrences("'")
    double_q = occurrences("\"")
    dashes = occurrences("--")
    braces = occurrences("(")
    spaces = occurrences(" ")
    semicolons = occurrences(";")
    angle_brackets = occurrences("<") + occurrences(">")
    special_chars = sum(occurrences(c) for c in '$&|')

    # Lower-case once instead of once per keyword.
    lowered_parts = tuple(part.lower() for part in decoded_parts)
    badwords_count = sum(text.count(word) for word in words for text in lowered_parts)

    path_length = len(path_decoded)
    body_length = len(body_decoded)

    return [single_q, double_q, dashes, braces, spaces, raw_percentages_count,
            semicolons, angle_brackets, special_chars, path_length, body_length, badwords_count]

In [4]:
# Load dataset
http = pd.read_csv('/content/combined_dataset.csv')

# Fail fast, naming the first required column that is absent from the file.
required_columns = ['path', 'body', 'class']
first_missing = next((name for name in required_columns if name not in http.columns), None)
if first_missing is not None:
    raise ValueError(f"Dataset missing required column: {first_missing}")

In [5]:
# Extract features: one feature list per request row, stored in a 'features' column
def _row_features(row):
    """Build the feature list for a single dataset row from its path and body."""
    return ExtractFeatures(row['path'], row['body'])


http['features'] = http.apply(_row_features, axis=1)

In [6]:
# Prepare data: stack the per-row feature lists into one array, take labels from 'class'
feature_rows = http['features'].tolist()
X = np.array(feature_rows)

label_column = http['class']
y = label_column.values

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Train Random Forest model
# Hyper-parameters are gathered in one place so they are easy to review and tune.
rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestClassifier(**rf_params)

In [9]:
# Fit the forest on the training split
model.fit(X=X_train, y=y_train)

In [10]:
# Saving the trained model
# A context manager guarantees the file is flushed and closed even if pickling
# fails; a bare open() passed to pickle.dump leaves the handle open.
with open('training_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [11]:
# Evaluating Trained Model
# Score the held-out split, then print overall accuracy and the per-class report.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:", report_text, sep="\n")

Accuracy: 0.9733333333333334
Classification Report:
              precision    recall  f1-score   support

         bad       0.95      0.98      0.96        56
        good       0.99      0.97      0.98        94

    accuracy                           0.97       150
   macro avg       0.97      0.98      0.97       150
weighted avg       0.97      0.97      0.97       150

