# Botnet Detection Analysis

<!--
Project: Botnet Detection with Machine Learning
Category: ML Projects
Developer: RSK World
Founder: Molla Samser
Designer & Tester: Rima Khatun
Contact: help@rskworld.in, support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147
Website: https://rskworld.in
-->

This notebook provides a comprehensive analysis of network traffic data for botnet detection.


## 1. Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import sys
import os

# Make the project's helper modules importable. The path is relative, so it is
# resolved against the kernel's current working directory rather than against
# this notebook's location.
sys.path.append('../scripts')
from feature_extractor import FeatureExtractor
from data_processor import DataProcessor  # NOTE(review): not used in the cells shown here

# Plot defaults for the figures below: seaborn's 'whitegrid' theme and a
# 12x6 inch default figure size (individual cells pass their own figsize).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


## 2. Load and Explore Data


In [None]:
# Read the pre-processed training set into a DataFrame.
data_path = '../data/processed/training_data.csv'
df = pd.read_csv(data_path)

# Report the table dimensions and column names; the blank line between the
# two parts comes from the leading "\n" in the second string.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)

# Final expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Print a concise summary of the DataFrame: per-column dtype and non-null
# count, plus overall memory usage. Useful for spotting missing values early.
df.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# columns pandas summarises by default. This is the cell's final expression,
# so the notebook renders the resulting table.
df.describe()


In [None]:
# Check class distribution.
# Assumes 'is_botnet' is encoded as 0 (normal) / 1 (botnet), matching the
# y == 0 / y == 1 filters used in the feature-distribution cell.
if 'is_botnet' in df.columns:
    print("Class Distribution:")
    print(df['is_botnet'].value_counts())
    print(f"\nBotnet ratio: {df['is_botnet'].mean():.2%}")

    # value_counts() orders its rows by frequency, not by label, so the bar
    # order would flip whenever botnet rows outnumber normal ones while the
    # colours and tick labels below stay fixed. Pin the order to [0, 1] so
    # every bar matches its colour and label; a class that is absent from the
    # data is drawn as a zero-height bar instead of shifting the other one.
    class_counts = df['is_botnet'].value_counts().reindex([0, 1], fill_value=0)

    plt.figure(figsize=(8, 6))
    class_counts.plot(kind='bar', color=['green', 'red'])
    plt.title('Class Distribution')
    plt.xlabel('Is Botnet')
    plt.ylabel('Count')
    plt.xticks([0, 1], ['Normal', 'Botnet'], rotation=0)
    plt.show()


## 3. Feature Extraction


In [None]:
# Run the project's FeatureExtractor over the raw table.
feature_extractor = FeatureExtractor()
df_features = feature_extractor.extract_features(df)

# Compare the column count before and after extraction.
print(
    f"Original features: {len(df.columns)}",
    f"Extracted features: {len(df_features.columns)}",
    sep="\n",
)


In [None]:
# Split the extracted table into a feature matrix X and a target y.
# The cells below treat y as optional (it may be None).
X, y = feature_extractor.select_features(df_features)

# Resolve what to show for the target up front so the report stays flat.
if y is None:
    target_shape = 'None'
else:
    target_shape = y.shape

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {target_shape}")
print(f"\nFeatures: {list(X.columns)}")


## 4. Feature Analysis


In [None]:
# Correlation heatmap of every feature against every other feature and the
# target. Skipped entirely when no target is available.
if y is not None:
    # Attach the target to a copy so X itself is left untouched.
    X_with_target = X.copy()
    X_with_target['is_botnet'] = y
    correlation = X_with_target.corr()

    # Diverging palette centred on zero: sign and strength are both visible.
    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': 'coolwarm',
        'center': 0,
    }

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, **heatmap_options)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()


In [None]:
# Per-class histograms for (at most) the first four features, drawn on a
# 2x2 grid with the two classes overlaid semi-transparently.
if y is not None and len(X.columns) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.ravel()

    # Row masks for the two classes; they do not depend on the feature.
    is_normal = y == 0
    is_botnet = y == 1

    for panel, col in zip(axes, X.columns[:4]):
        feature_values = X[col]

        panel.hist(feature_values[is_normal], alpha=0.5, label='Normal', bins=30)
        panel.hist(feature_values[is_botnet], alpha=0.5, label='Botnet', bins=30)
        panel.set_xlabel(col)
        panel.set_ylabel('Frequency')
        panel.set_title(f'Distribution of {col}')
        panel.legend()

    plt.tight_layout()
    plt.show()


## 5. Model Training


In [None]:
# Build the train/test split used by the modelling cells.
if y is None:
    # No labels available: fall back to a synthetic target that marks rows
    # whose first feature lies above that feature's median.
    # NOTE: this target is a direct function of the first feature, so metrics
    # obtained with it only show how well the model recovers that rule.
    first_feature = X.iloc[:, 0]
    y = (first_feature > first_feature.median()).astype(int)

# Keep only rows that have no missing feature value and no missing target.
mask = ~X.isnull().any(axis=1) & ~y.isnull()
X_clean, y_clean = X[mask], y[mask]

# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
    stratify=y_clean,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")


In [None]:
# Fit a Random Forest (100 trees, depth capped at 10, fixed seed) on the
# training split; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test)

# Precision, recall and F1 share the same settings: a support-weighted
# average over the classes, scoring 0 where the metric would divide by zero.
weighted_avg = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

print("Model Performance:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")


In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
# Assumes the 0 (normal) / 1 (botnet) encoding used throughout this notebook.
#
# The label order is pinned to [0, 1] so the matrix is always 2x2, with
# row/column 0 = Normal and row/column 1 = Botnet. Without `labels`,
# scikit-learn only includes the classes that actually occur in y_test or
# y_pred, so a single-class split would produce a 1x1 matrix that no longer
# matches the two hard-coded tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Botnet'],
            yticklabels=['Normal', 'Botnet'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
# Rank the features by the importance scores of the fitted model.
# Only runs for estimators that expose `feature_importances_`.
if hasattr(model, 'feature_importances_'):
    importance_columns = {
        'feature': X_train.columns,
        'importance': model.feature_importances_,
    }
    feature_importance = pd.DataFrame(importance_columns)
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    # The same ten rows feed both the chart and the printed table.
    top_features = feature_importance.head(10)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title('Top 10 Feature Importance')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(top_features)


## 6. Summary

This analysis demonstrates:
- Network traffic data exploration
- Feature extraction and engineering
- Machine learning model training
- Performance evaluation
- Feature importance analysis
