In [1]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------
# File paths for logs and annotations
# ----------------------
bot_log_file = 'web_bot_detection_dataset/phase1/data/web_logs/bots/access_moderate_bots.log'
human_log_file_1 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_1.log'
human_log_file_2 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_2.log'
human_log_file_3 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_3.log'
human_log_file_4 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_4.log'
human_log_file_5 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_5.log'

train_annotation_file = 'web_bot_detection_dataset/phase1/annotations/humans_and_moderate_bots/train'
test_annotation_file  = 'web_bot_detection_dataset/phase1/annotations/humans_and_moderate_bots/test'

# ----------------------
# Initialize session_features dictionary
# ----------------------
# We also store timestamps, HTML requests count, list of depths, and page request counts.
session_features = defaultdict(lambda: {
    'total_requests': 0,
    'total_bytes': 0,
    'GET': 0,
    'POST': 0,
    'http_3xx': 0,
    'http_4xx': 0,
    'image_requests': 0,
    'html_requests': 0,
    'depths': [],
    'page_requests': {},
    'timestamps': []
})

# ----------------------
# Regular expressions for parsing and resource type matching.
# ----------------------
log_pattern = re.compile(
    r'- - \[(.*?)\]\s+"(\w+)\s+([^"]+)\s+HTTP/[\d.]+"\s+(\d{3})\s+(\d+)\s+"([^"]+)"\s+([^\s]+)\s+"([^"]+)"'
)
image_pattern = re.compile(r'\.(jpg|jpeg|png|gif|ico)$', re.IGNORECASE)
css_pattern   = re.compile(r'\.css$', re.IGNORECASE)
js_pattern    = re.compile(r'\.js$', re.IGNORECASE)

def process_log_file(file_path):
    """Process a log file and update session_features."""
    with open(file_path, 'r') as f:
        for line in f:
            match = log_pattern.match(line)
            if match:
                timestamp_str = match.group(1)
                try:
                    timestamp = datetime.strptime(timestamp_str, "%d/%b/%Y:%H:%M:%S %z")
                except Exception:
                    continue
                method = match.group(2)
                url = match.group(3)
                status = int(match.group(4))
                bytes_count = int(match.group(5))
                session_id = match.group(7)
                if session_id == "-":
                    continue
                feat = session_features[session_id]
                feat['total_requests'] += 1
                feat['total_bytes'] += bytes_count
                if method in ['GET', 'POST']:
                    feat[method] += 1
                if 300 <= status < 400:
                    feat['http_3xx'] += 1
                if 400 <= status < 500:
                    feat['http_4xx'] += 1
                if image_pattern.search(url):
                    feat['image_requests'] += 1
                # Heuristic: if not CSS, JS, or image, treat as HTML.
                if not (css_pattern.search(url) or js_pattern.search(url) or image_pattern.search(url)):
                    feat['html_requests'] += 1
                depth = len(url.strip().split('/')) - 1
                feat['depths'].append(depth)
                feat['page_requests'][url] = feat['page_requests'].get(url, 0) + 1
                feat['timestamps'].append(timestamp)

# Process all log files.
process_log_file(bot_log_file)
process_log_file(human_log_file_1)
process_log_file(human_log_file_2)
process_log_file(human_log_file_3)
process_log_file(human_log_file_4)
process_log_file(human_log_file_5)

# ----------------------
# Compute derived features.
# ----------------------
SEQUENTIAL_THRESHOLD = 2.0  # seconds

for session_id, feat in session_features.items():
    # Image Requests Percent.
    feat['image_requests_percent'] = (feat['image_requests'] / feat['total_requests'] * 100
                                        if feat['total_requests'] > 0 else 0)
    # HTML-to-Image Ratio.
    feat['html_to_image_ratio'] = (feat['html_requests'] / feat['image_requests']
                                    if feat['image_requests'] > 0 else feat['html_requests'])
    # Depth Standard Deviation.
    feat['depth_std'] = np.std(feat['depths']) if feat['depths'] else 0
    # Max Requests Per Page.
    feat['max_requests_per_page'] = max(feat['page_requests'].values()) if feat['page_requests'] else 0
    
    # Timing-based features.
    timestamps = sorted(feat['timestamps'])
    if timestamps:
        session_duration = (timestamps[-1] - timestamps[0]).total_seconds()
    else:
        session_duration = 0
    feat['session_duration'] = session_duration
    if len(timestamps) > 1:
        diffs = [(timestamps[i+1] - timestamps[i]).total_seconds() for i in range(len(timestamps)-1)]
        feat['inter_request_avg'] = np.mean(diffs)
        sequential_count = sum(1 for diff in diffs if diff < SEQUENTIAL_THRESHOLD)
        feat['sequential_req_percent'] = (sequential_count / len(diffs)) * 100
    else:
        feat['inter_request_avg'] = 0
        feat['sequential_req_percent'] = 0
    # Browsing Speed.
    feat['browsing_speed'] = (feat['total_requests'] / session_duration
                              if session_duration > 0 else feat['total_requests'])

# ----------------------
# Convert session_features to DataFrame.
# ----------------------
df_features = pd.DataFrame.from_dict(session_features, orient='index')
df_features.index.name = 'session_id'
df_features.reset_index(inplace=True)

def read_annotation(file_path):
    """Read an annotation file with session_id and label into a DataFrame."""
    annotations = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                session_id, label = parts
                annotations.append({'session_id': session_id.strip(), 'label': label.strip().lower()})
    return pd.DataFrame(annotations)

df_train_annot = read_annotation(train_annotation_file)
df_test_annot  = read_annotation(test_annotation_file)

# ----------------------
# Merge features with annotations using concat.
# ----------------------
df_features.set_index('session_id', inplace=True)
df_train_annot.set_index('session_id', inplace=True)
df_test_annot.set_index('session_id', inplace=True)
df_train = pd.concat([df_features, df_train_annot], axis=1, join='inner').reset_index()
df_test  = pd.concat([df_features, df_test_annot], axis=1, join='inner').reset_index()

# Map labels: 'human' -> 0, 'moderate_bot'/'bot' -> 1.
label_map = {'human': 0, 'moderate_bot': 1, 'bot': 1}
df_train['label'] = df_train['label'].map(label_map)
df_test['label']  = df_test['label'].map(label_map)

print("Training distribution:")
print("Human:", df_train[df_train['label'] == 0].shape[0])
print("Bot:", df_train[df_train['label'] == 1].shape[0])

# ----------------------
# Define feature columns and normalize inputs.
# ----------------------
feature_cols = [
    'total_requests', 
    'total_bytes', 
    'GET', 
    'POST', 
    'http_3xx', 
    'http_4xx', 
    'image_requests_percent',
    'html_to_image_ratio',
    'depth_std',
    'max_requests_per_page',
    'sequential_req_percent',
    'browsing_speed',
    'session_duration',
    'inter_request_avg'
]

X_train = df_train[feature_cols]
y_train = df_train['label']
X_test  = df_test[feature_cols]
y_test  = df_test['label']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# ----------------------
# Train multiple classifiers.
# ----------------------
classifiers = {
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "KNN": KNeighborsClassifier()
}

results = {}
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    y_prob = clf.predict_proba(X_test_scaled)[:, 1] if hasattr(clf, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "y_pred": y_pred, "y_prob": y_prob}
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix for each classifier.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Human", "Bot"], yticklabels=["Human", "Bot"])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"{name} Confusion Matrix")
    plt.tight_layout()
    plt.savefig(f"phase_1/{name.replace(' ', '_').lower()}_confusion_matrix.png")
    plt.close()

# ----------------------
# Plot a bar chart comparing classifier metrics.

# 1. Training Distribution.
plt.figure(figsize=(6, 4))
sns.countplot(x="label", data=df_train)
plt.xticks([0, 1], ["Human", "Bot"])
plt.xlabel("Label")
plt.ylabel("Count")
plt.title("Training Distribution")
plt.tight_layout()
plt.savefig("phase_1/training_distribution.png")
plt.close()

# 2. Feature Correlation Heatmap.
plt.figure(figsize=(10, 8))
corr = df_train[feature_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig("phase_1/feature_correlation_heatmap.png")
plt.close()
# ----------------------
metrics = ["accuracy", "precision", "recall", "f1"]
metric_values = {metric: [results[clf][metric] for clf in results] for metric in metrics}
x = np.arange(len(classifiers))
width = 0.2

plt.figure(figsize=(10,6))
for i, metric in enumerate(metrics):
    plt.bar(x + i*width, metric_values[metric], width, label=metric.title())
plt.xticks(x + width*1.5, list(classifiers.keys()))
plt.xlabel("Classifier")
plt.ylabel("Metric Value")
plt.title("Classifier Performance Comparison")
plt.legend()
plt.tight_layout()
plt.savefig("phase_1/classifier_performance_comparison.png")
plt.close()

# ----------------------
# Plot ROC curves for classifiers that support probability estimates.
# ----------------------
plt.figure(figsize=(8,6))
for name in classifiers:
    if results[name]["y_prob"] is not None:
        fpr, tpr, _ = roc_curve(y_test, results[name]["y_prob"])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("phase_1/roc_curve_comparison.png")
plt.close()

# ----------------------
# Additional Graphs on Feature Distributions and Scatter Plots.
# ----------------------
# Feature distributions (raw values)
features_to_plot = ['total_requests', 'session_duration', 'browsing_speed', 'inter_request_avg']
for feature in features_to_plot:
    plt.figure(figsize=(6,4))
    sns.histplot(df_train[feature], bins=20, kde=True)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.title(f"Distribution of {feature.replace('_', ' ').title()}")
    plt.tight_layout()
    plt.savefig(f"phase_1/{feature}_distribution.png")
    plt.close()

# Scatter Plot: Session Duration vs. Browsing Speed.
plt.figure(figsize=(6,4))
sns.scatterplot(x="session_duration", y="browsing_speed", hue="label", data=df_train, palette="Set1")
plt.xlabel("Session Duration (s)")
plt.ylabel("Browsing Speed (req/s)")
plt.title("Session Duration vs. Browsing Speed")
plt.legend(title="Label", labels=["Human", "Bot"])
plt.tight_layout()
plt.savefig("phase_1/session_duration_vs_browsing_speed.png")
plt.close()

print("All graphs have been saved as PNG files.")


Training distribution:
Human: 35
Bot: 35

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90        15
           1       0.93      0.87      0.90        15

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        15
           1       0.80      0.80      0.80        15

    accuracy                           0.80        30
   macro avg       0.80      0.80      0.80        30
weighted avg       0.80      0.80      0.80        30


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        15
           1       0.87      0.87      0.87        15

    accuracy                   

In [5]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------
# File paths for logs and annotations
# ----------------------
bot_log_file = 'web_bot_detection_dataset/phase1/data/web_logs/bots/access_advanced_bots.log'
human_log_file_1 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_1.log'
human_log_file_2 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_2.log'
human_log_file_3 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_3.log'
human_log_file_4 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_4.log'
human_log_file_5 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_5.log'

train_annotation_file = 'web_bot_detection_dataset/phase1/annotations/humans_and_advanced_bots/train'
test_annotation_file  = 'web_bot_detection_dataset/phase1/annotations/humans_and_advanced_bots/test'

# ----------------------
# Initialize session_features dictionary
# ----------------------
# We also store timestamps, HTML requests count, list of depths, and page request counts.
session_features = defaultdict(lambda: {
    'total_requests': 0,
    'total_bytes': 0,
    'GET': 0,
    'POST': 0,
    'http_3xx': 0,
    'http_4xx': 0,
    'image_requests': 0,
    'html_requests': 0,
    'depths': [],
    'page_requests': {},
    'timestamps': []
})

# ----------------------
# Regular expressions for parsing and resource type matching.
# ----------------------
log_pattern = re.compile(
    r'- - \[(.*?)\]\s+"(\w+)\s+([^"]+)\s+HTTP/[\d.]+"\s+(\d{3})\s+(\d+)\s+"([^"]+)"\s+([^\s]+)\s+"([^"]+)"'
)
image_pattern = re.compile(r'\.(jpg|jpeg|png|gif|ico)$', re.IGNORECASE)
css_pattern   = re.compile(r'\.css$', re.IGNORECASE)
js_pattern    = re.compile(r'\.js$', re.IGNORECASE)

def process_log_file(file_path):
    """Process a log file and update session_features."""
    with open(file_path, 'r') as f:
        for line in f:
            match = log_pattern.match(line)
            if match:
                timestamp_str = match.group(1)
                try:
                    timestamp = datetime.strptime(timestamp_str, "%d/%b/%Y:%H:%M:%S %z")
                except Exception:
                    continue
                method = match.group(2)
                url = match.group(3)
                status = int(match.group(4))
                bytes_count = int(match.group(5))
                session_id = match.group(7)
                if session_id == "-":
                    continue
                feat = session_features[session_id]
                feat['total_requests'] += 1
                feat['total_bytes'] += bytes_count
                if method in ['GET', 'POST']:
                    feat[method] += 1
                if 300 <= status < 400:
                    feat['http_3xx'] += 1
                if 400 <= status < 500:
                    feat['http_4xx'] += 1
                if image_pattern.search(url):
                    feat['image_requests'] += 1
                # Heuristic: if not CSS, JS, or image, treat as HTML.
                if not (css_pattern.search(url) or js_pattern.search(url) or image_pattern.search(url)):
                    feat['html_requests'] += 1
                depth = len(url.strip().split('/')) - 1
                feat['depths'].append(depth)
                feat['page_requests'][url] = feat['page_requests'].get(url, 0) + 1
                feat['timestamps'].append(timestamp)

# Process all log files.
process_log_file(bot_log_file)
process_log_file(human_log_file_1)
process_log_file(human_log_file_2)
process_log_file(human_log_file_3)
process_log_file(human_log_file_4)
process_log_file(human_log_file_5)

# ----------------------
# Compute derived features.
# ----------------------
SEQUENTIAL_THRESHOLD = 2.0  # seconds

for session_id, feat in session_features.items():
    # Image Requests Percent.
    feat['image_requests_percent'] = (feat['image_requests'] / feat['total_requests'] * 100
                                        if feat['total_requests'] > 0 else 0)
    # HTML-to-Image Ratio.
    feat['html_to_image_ratio'] = (feat['html_requests'] / feat['image_requests']
                                    if feat['image_requests'] > 0 else feat['html_requests'])
    # Depth Standard Deviation.
    feat['depth_std'] = np.std(feat['depths']) if feat['depths'] else 0
    # Max Requests Per Page.
    feat['max_requests_per_page'] = max(feat['page_requests'].values()) if feat['page_requests'] else 0
    
    # Timing-based features.
    timestamps = sorted(feat['timestamps'])
    if timestamps:
        session_duration = (timestamps[-1] - timestamps[0]).total_seconds()
    else:
        session_duration = 0
    feat['session_duration'] = session_duration
    if len(timestamps) > 1:
        diffs = [(timestamps[i+1] - timestamps[i]).total_seconds() for i in range(len(timestamps)-1)]
        feat['inter_request_avg'] = np.mean(diffs)
        sequential_count = sum(1 for diff in diffs if diff < SEQUENTIAL_THRESHOLD)
        feat['sequential_req_percent'] = (sequential_count / len(diffs)) * 100
    else:
        feat['inter_request_avg'] = 0
        feat['sequential_req_percent'] = 0
    # Browsing Speed.
    feat['browsing_speed'] = (feat['total_requests'] / session_duration
                              if session_duration > 0 else feat['total_requests'])

# ----------------------
# Convert session_features to DataFrame.
# ----------------------
df_features = pd.DataFrame.from_dict(session_features, orient='index')
df_features.index.name = 'session_id'
df_features.reset_index(inplace=True)

def read_annotation(file_path):
    """Read an annotation file with session_id and label into a DataFrame."""
    annotations = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                session_id, label = parts
                annotations.append({'session_id': session_id.strip(), 'label': label.strip().lower()})
    return pd.DataFrame(annotations)

df_train_annot = read_annotation(train_annotation_file)
df_test_annot  = read_annotation(test_annotation_file)

# ----------------------
# Merge features with annotations using concat.
# ----------------------
df_features.set_index('session_id', inplace=True)
df_train_annot.set_index('session_id', inplace=True)
df_test_annot.set_index('session_id', inplace=True)
df_train = pd.concat([df_features, df_train_annot], axis=1, join='inner').reset_index()
df_test  = pd.concat([df_features, df_test_annot], axis=1, join='inner').reset_index()

# Map labels: 'human' -> 0, 'moderate_bot'/'bot' -> 1.
label_map = {'human': 0, 'moderate_bot': 1, 'bot': 1, 'advanced_bot': 1}
df_train['label'] = df_train['label'].map(label_map)
df_test['label']  = df_test['label'].map(label_map)

print("Training distribution:")
print("Human:", df_train[df_train['label'] == 0].shape[0])
print("Bot:", df_train[df_train['label'] == 1].shape[0])

# ----------------------
# Define feature columns and normalize inputs.
# ----------------------
feature_cols = [
    'total_requests', 
    'total_bytes', 
    'GET', 
    'POST', 
    'http_3xx', 
    'http_4xx', 
    'image_requests_percent',
    'html_to_image_ratio',
    'depth_std',
    'max_requests_per_page',
    'sequential_req_percent',
    'browsing_speed',
    'session_duration',
    'inter_request_avg'
]

X_train = df_train[feature_cols]
y_train = df_train['label']
X_test  = df_test[feature_cols]
y_test  = df_test['label']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# ----------------------
# Train multiple classifiers.
# ----------------------
classifiers = {
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "KNN": KNeighborsClassifier()
}

results = {}
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    y_prob = clf.predict_proba(X_test_scaled)[:, 1] if hasattr(clf, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "y_pred": y_pred, "y_prob": y_prob}
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix for each classifier.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Human", "Bot"], yticklabels=["Human", "Bot"])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"{name} Confusion Matrix")
    plt.tight_layout()
    plt.savefig(f"phase_1_with_advance_bot/{name.replace(' ', '_').lower()}_confusion_matrix.png")
    plt.close()

# ----------------------
# Plot a bar chart comparing classifier metrics.

# 1. Training Distribution.
plt.figure(figsize=(6, 4))
sns.countplot(x="label", data=df_train)
plt.xticks([0, 1], ["Human", "Bot"])
plt.xlabel("Label")
plt.ylabel("Count")
plt.title("Training Distribution")
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/training_distribution.png")
plt.close()

# 2. Feature Correlation Heatmap.
plt.figure(figsize=(10, 8))
corr = df_train[feature_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/feature_correlation_heatmap.png")
plt.close()
# ----------------------
metrics = ["accuracy", "precision", "recall", "f1"]
metric_values = {metric: [results[clf][metric] for clf in results] for metric in metrics}
x = np.arange(len(classifiers))
width = 0.2

plt.figure(figsize=(10,6))
for i, metric in enumerate(metrics):
    plt.bar(x + i*width, metric_values[metric], width, label=metric.title())
plt.xticks(x + width*1.5, list(classifiers.keys()))
plt.xlabel("Classifier")
plt.ylabel("Metric Value")
plt.title("Classifier Performance Comparison")
plt.legend()
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/classifier_performance_comparison.png")
plt.close()

# ----------------------
# Plot ROC curves for classifiers that support probability estimates.
# ----------------------
plt.figure(figsize=(8,6))
for name in classifiers:
    if results[name]["y_prob"] is not None:
        fpr, tpr, _ = roc_curve(y_test, results[name]["y_prob"])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/roc_curve_comparison.png")
plt.close()

# ----------------------
# Additional Graphs on Feature Distributions and Scatter Plots.
# ----------------------
# Feature distributions (raw values)
features_to_plot = ['total_requests', 'session_duration', 'browsing_speed', 'inter_request_avg']
for feature in features_to_plot:
    plt.figure(figsize=(6,4))
    sns.histplot(df_train[feature], bins=20, kde=True)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.title(f"Distribution of {feature.replace('_', ' ').title()}")
    plt.tight_layout()
    plt.savefig(f"phase_1_with_advance_bot/{feature}_distribution.png")
    plt.close()

# Scatter Plot: Session Duration vs. Browsing Speed.
plt.figure(figsize=(6,4))
sns.scatterplot(x="session_duration", y="browsing_speed", hue="label", data=df_train, palette="Set1")
plt.xlabel("Session Duration (s)")
plt.ylabel("Browsing Speed (req/s)")
plt.title("Session Duration vs. Browsing Speed")
plt.legend(title="Label", labels=["Human", "Bot"])
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/session_duration_vs_browsing_speed.png")
plt.close()

print("All graphs have been saved as PNG files.")


Training distribution:
Human: 35
Bot: 35

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       1.00      0.67      0.80        15

    accuracy                           0.83        30
   macro avg       0.88      0.83      0.83        30
weighted avg       0.88      0.83      0.83        30


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.73      0.67        15
           1       0.67      0.53      0.59        15

    accuracy                           0.63        30
   macro avg       0.64      0.63      0.63        30
weighted avg       0.64      0.63      0.63        30


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        15
           1       1.00      0.73      0.85        15

    accuracy                   

In [None]:
!pip install pytorch-tabnet

In [3]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# For TabNet
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

# ----------------------
# File paths for logs and annotations
# ----------------------
bot_log_file = 'web_bot_detection_dataset/phase1/data/web_logs/bots/access_advanced_bots.log'
human_log_file_1 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_1.log'
human_log_file_2 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_2.log'
human_log_file_3 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_3.log'
human_log_file_4 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_4.log'
human_log_file_5 = 'web_bot_detection_dataset/phase1/data/web_logs/humans/access_5.log'

train_annotation_file = 'web_bot_detection_dataset/phase1/annotations/humans_and_advanced_bots/train'
test_annotation_file  = 'web_bot_detection_dataset/phase1/annotations/humans_and_advanced_bots/test'

# ----------------------
# Initialize session_features dictionary
# ----------------------
# We also store timestamps, HTML requests count, list of depths, and page request counts.
session_features = defaultdict(lambda: {
    'total_requests': 0,
    'total_bytes': 0,
    'GET': 0,
    'POST': 0,
    'http_3xx': 0,
    'http_4xx': 0,
    'image_requests': 0,
    'html_requests': 0,
    'depths': [],
    'page_requests': {},
    'timestamps': []
})

# ----------------------
# Regular expressions for parsing and resource type matching.
# ----------------------
log_pattern = re.compile(
    r'- - \[(.*?)\]\s+"(\w+)\s+([^"]+)\s+HTTP/[\d.]+"\s+(\d{3})\s+(\d+)\s+"([^"]+)"\s+([^\s]+)\s+"([^"]+)"'
)
image_pattern = re.compile(r'\.(jpg|jpeg|png|gif|ico)$', re.IGNORECASE)
css_pattern   = re.compile(r'\.css$', re.IGNORECASE)
js_pattern    = re.compile(r'\.js$', re.IGNORECASE)

def process_log_file(file_path):
    """Process a log file and update session_features."""
    with open(file_path, 'r') as f:
        for line in f:
            match = log_pattern.match(line)
            if match:
                timestamp_str = match.group(1)
                try:
                    timestamp = datetime.strptime(timestamp_str, "%d/%b/%Y:%H:%M:%S %z")
                except Exception:
                    continue
                method = match.group(2)
                url = match.group(3)
                status = int(match.group(4))
                bytes_count = int(match.group(5))
                session_id = match.group(7)
                if session_id == "-":
                    continue
                feat = session_features[session_id]
                feat['total_requests'] += 1
                feat['total_bytes'] += bytes_count
                if method in ['GET', 'POST']:
                    feat[method] += 1
                if 300 <= status < 400:
                    feat['http_3xx'] += 1
                if 400 <= status < 500:
                    feat['http_4xx'] += 1
                if image_pattern.search(url):
                    feat['image_requests'] += 1
                # Heuristic: if not CSS, JS, or image, treat as HTML.
                if not (css_pattern.search(url) or js_pattern.search(url) or image_pattern.search(url)):
                    feat['html_requests'] += 1
                depth = len(url.strip().split('/')) - 1
                feat['depths'].append(depth)
                feat['page_requests'][url] = feat['page_requests'].get(url, 0) + 1
                feat['timestamps'].append(timestamp)

# Process all log files.
process_log_file(bot_log_file)
process_log_file(human_log_file_1)
process_log_file(human_log_file_2)
process_log_file(human_log_file_3)
process_log_file(human_log_file_4)
process_log_file(human_log_file_5)

# ----------------------
# Compute derived features.
# ----------------------
SEQUENTIAL_THRESHOLD = 2.0  # seconds

for session_id, feat in session_features.items():
    # Image Requests Percent.
    feat['image_requests_percent'] = (feat['image_requests'] / feat['total_requests'] * 100
                                        if feat['total_requests'] > 0 else 0)
    # HTML-to-Image Ratio.
    feat['html_to_image_ratio'] = (feat['html_requests'] / feat['image_requests']
                                    if feat['image_requests'] > 0 else feat['html_requests'])
    # Depth Standard Deviation.
    feat['depth_std'] = np.std(feat['depths']) if feat['depths'] else 0
    # Max Requests Per Page.
    feat['max_requests_per_page'] = max(feat['page_requests'].values()) if feat['page_requests'] else 0
    
    # Timing-based features.
    timestamps = sorted(feat['timestamps'])
    if timestamps:
        session_duration = (timestamps[-1] - timestamps[0]).total_seconds()
    else:
        session_duration = 0
    feat['session_duration'] = session_duration
    if len(timestamps) > 1:
        diffs = [(timestamps[i+1] - timestamps[i]).total_seconds() for i in range(len(timestamps)-1)]
        feat['inter_request_avg'] = np.mean(diffs)
        sequential_count = sum(1 for diff in diffs if diff < SEQUENTIAL_THRESHOLD)
        feat['sequential_req_percent'] = (sequential_count / len(diffs)) * 100
    else:
        feat['inter_request_avg'] = 0
        feat['sequential_req_percent'] = 0
    # Browsing Speed.
    feat['browsing_speed'] = (feat['total_requests'] / session_duration
                              if session_duration > 0 else feat['total_requests'])

# ----------------------
# Convert session_features to DataFrame.
# ----------------------
df_features = pd.DataFrame.from_dict(session_features, orient='index')
df_features.index.name = 'session_id'
df_features.reset_index(inplace=True)

def read_annotation(file_path):
    """Read an annotation file with session_id and label into a DataFrame."""
    annotations = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                session_id, label = parts
                annotations.append({'session_id': session_id.strip(), 'label': label.strip().lower()})
    return pd.DataFrame(annotations)

df_train_annot = read_annotation(train_annotation_file)
df_test_annot  = read_annotation(test_annotation_file)

# ----------------------
# Merge features with annotations using concat.
# ----------------------
df_features.set_index('session_id', inplace=True)
df_train_annot.set_index('session_id', inplace=True)
df_test_annot.set_index('session_id', inplace=True)
df_train = pd.concat([df_features, df_train_annot], axis=1, join='inner').reset_index()
df_test  = pd.concat([df_features, df_test_annot], axis=1, join='inner').reset_index()

# Map labels: 'human' -> 0, 'moderate_bot'/'bot'/'advanced_bot' -> 1.
label_map = {'human': 0, 'moderate_bot': 1, 'bot': 1, 'advanced_bot': 1}
df_train['label'] = df_train['label'].map(label_map)
df_test['label']  = df_test['label'].map(label_map)

print("Training distribution:")
print("Human:", df_train[df_train['label'] == 0].shape[0])
print("Bot:", df_train[df_train['label'] == 1].shape[0])

# ----------------------
# Define feature columns and normalize inputs.
# ----------------------
feature_cols = [
    'total_requests', 
    'total_bytes', 
    'GET', 
    'POST', 
    'http_3xx', 
    'http_4xx', 
    'image_requests_percent',
    'html_to_image_ratio',
    'depth_std',
    'max_requests_per_page',
    'sequential_req_percent',
    'browsing_speed',
    'session_duration',
    'inter_request_avg'
]

X_train = df_train[feature_cols]
y_train = df_train['label']
X_test  = df_test[feature_cols]
y_test  = df_test['label']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# ----------------------
# Train multiple classifiers, including TabNet.
# ----------------------
classifiers = {
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "KNN": KNeighborsClassifier()
}

# Add TabNet as an additional classifier.
classifiers["TabNet"] = TabNetClassifier(
    n_d=8, n_a=8, n_steps=3, gamma=1.5,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',  # alternative: "sparsemax"
    verbose=0,
    seed=42
)

results = {}
for name, clf in classifiers.items():
    if name == "TabNet":
        clf.fit(
            X_train_scaled, y_train,
            eval_set=[(X_test_scaled, y_test)],
            eval_name=['test'],
            eval_metric=['auc'],
            max_epochs=100,
            patience=10,
            batch_size=2,
            virtual_batch_size=128
        )
        y_pred = clf.predict(X_test_scaled)
        y_prob = clf.predict_proba(X_test_scaled)[:, 1]
    else:
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
        y_prob = clf.predict_proba(X_test_scaled)[:, 1] if hasattr(clf, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "y_pred": y_pred, "y_prob": y_prob}
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix for each classifier.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Human", "Bot"], yticklabels=["Human", "Bot"])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"{name} Confusion Matrix")
    plt.tight_layout()
    plt.savefig(f"phase_1_with_advance_bot/{name.replace(' ', '_').lower()}_confusion_matrix.png")
    plt.close()

# ----------------------
# Plot a bar chart comparing classifier metrics.
# ----------------------
metrics = ["accuracy", "precision", "recall", "f1"]
metric_values = {metric: [results[clf][metric] for clf in results] for metric in metrics}
x = np.arange(len(classifiers))
width = 0.2

plt.figure(figsize=(10,6))
for i, metric in enumerate(metrics):
    plt.bar(x + i*width, metric_values[metric], width, label=metric.title())
plt.xticks(x + width*1.5, list(classifiers.keys()))
plt.xlabel("Classifier")
plt.ylabel("Metric Value")
plt.title("Classifier Performance Comparison")
plt.legend()
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/classifier_performance_comparison.png")
plt.close()

# ----------------------
# Plot ROC curves for classifiers that support probability estimates.
# ----------------------
plt.figure(figsize=(8,6))
for name in classifiers:
    if results[name]["y_prob"] is not None:
        fpr, tpr, _ = roc_curve(y_test, results[name]["y_prob"])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/roc_curve_comparison.png")
plt.close()

# ----------------------
# Additional Graphs on Feature Distributions and Scatter Plots.
# ----------------------
features_to_plot = ['total_requests', 'session_duration', 'browsing_speed', 'inter_request_avg']
for feature in features_to_plot:
    plt.figure(figsize=(6,4))
    sns.histplot(df_train[feature], bins=20, kde=True)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.title(f"Distribution of {feature.replace('_', ' ').title()}")
    plt.tight_layout()
    plt.savefig(f"phase_1_with_advance_bot/{feature}_distribution.png")
    plt.close()

plt.figure(figsize=(6,4))
sns.scatterplot(x="session_duration", y="browsing_speed", hue="label", data=df_train, palette="Set1")
plt.xlabel("Session Duration (s)")
plt.ylabel("Browsing Speed (req/s)")
plt.title("Session Duration vs. Browsing Speed")
plt.legend(title="Label", labels=["Human", "Bot"])
plt.tight_layout()
plt.savefig("phase_1_with_advance_bot/session_duration_vs_browsing_speed.png")
plt.close()

print("All graphs have been saved as PNG files.")


Training distribution:
Human: 35
Bot: 35

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       1.00      0.67      0.80        15

    accuracy                           0.83        30
   macro avg       0.88      0.83      0.83        30
weighted avg       0.88      0.83      0.83        30


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.73      0.67        15
           1       0.67      0.53      0.59        15

    accuracy                           0.63        30
   macro avg       0.64      0.63      0.63        30
weighted avg       0.64      0.63      0.63        30


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        15
           1       1.00      0.73      0.85        15

    accuracy                   




TabNet Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73        15
           1       0.73      0.73      0.73        15

    accuracy                           0.73        30
   macro avg       0.73      0.73      0.73        30
weighted avg       0.73      0.73      0.73        30

All graphs have been saved as PNG files.
