In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, average_precision_score,
                           classification_report, confusion_matrix)
from sklearn.ensemble import IsolationForest
from torch.utils.data import DataLoader, TensorDataset, Dataset

import warnings
warnings.filterwarnings('ignore')

# Seed PyTorch and NumPy's global generator with one shared value so that
# stochastic steps give the same results on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
# Release cached GPU memory and reset the peak-memory counters so later
# measurements start from a clean baseline.
# Guarded because reset_peak_memory_stats() needs an initialised CUDA runtime
# and fails on machines without a CUDA device; on such machines there is
# nothing to clear anyway.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

In [3]:
# Report the PyTorch build and the CUDA runtime visible to it.
environment_report = (
    ("Torch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),
)
for caption, detail in environment_report:
    print(caption, detail)

Torch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1


In [None]:
import pandas as pd

# Load the flows and take a first look at the schema and label balance.
df = pd.read_parquet("cicdarknet2020.parquet", engine="fastparquet")
df.info()

# A notebook only auto-displays the LAST expression of a cell, so the preview
# is printed explicitly; a bare `df.head()` in the middle of a cell shows nothing.
print(df.head())

# Kept as the final expression (no statement after it) so the label
# distribution is rendered as the cell's output.
df['Label'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103121 entries, 0 to 103120
Data columns (total 79 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   Protocol                    103121 non-null  int8    
 1   Flow Duration               103121 non-null  int32   
 2   Total Fwd Packet            103121 non-null  int32   
 3   Total Bwd packets           103121 non-null  int32   
 4   Total Length of Fwd Packet  103121 non-null  int32   
 5   Total Length of Bwd Packet  103121 non-null  int32   
 6   Fwd Packet Length Max       103121 non-null  int32   
 7   Fwd Packet Length Min       103121 non-null  int16   
 8   Fwd Packet Length Mean      103121 non-null  float32 
 9   Fwd Packet Length Std       103121 non-null  float32 
 10  Bwd Packet Length Max       103121 non-null  int32   
 11  Bwd Packet Length Min       103121 non-null  int16   
 12  Bwd Packet Length Mean      103121 non-null  float32 
 13 

Label
Non-Tor    64804
NonVPN     20216
VPN        16922
Tor         1179
Name: count, dtype: int64

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_parquet("cicdarknet2020.parquet", engine="fastparquet")

print("=== INITIAL DATA INSPECTION ===")
print(f"DataFrame shape: {df.shape}")
print(f"Memory usage: {df.memory_usage().sum() / 1024 / 1024:.2f} MB")


def _describe_label_column(frame, column):
    """Print dtype diagnostics for one label column.

    Args:
        frame: DataFrame that contains ``column``.
        column: Name of the column to describe.
    """
    dtype = frame[column].dtype
    print(f"{column} column info:")
    print(f"  dtype: {dtype}")
    print(f"  type of dtype: {type(dtype)}")
    # isinstance() replaces pd.api.types.is_categorical_dtype, which is
    # deprecated since pandas 2.1 (the blanket warning filter was hiding that).
    print(f"  is categorical? {isinstance(dtype, pd.CategoricalDtype)}")
    print(f"  is string? {pd.api.types.is_string_dtype(frame[column])}")


# Check the actual dtypes of both label columns
print("\n=== DATA TYPES DETAILED ===")
_describe_label_column(df, 'Label')
print()
_describe_label_column(df, 'Label.1')

# Let's see what values are actually in these columns
print("\n=== LABEL VALUES INSPECTION ===")
print("First few rows of Label:")
print(df['Label'].head(10).tolist())
print("\nFirst few rows of Label.1:")
print(df['Label.1'].head(10).tolist())

# List the distinct values of each label column
print("\n=== CHECKING FOR NUMERIC LABELS ===")
print("Sample of unique Label values:")
print(df['Label'].drop_duplicates().head(20).tolist())

print("\nSample of unique Label.1 values:")
print(df['Label.1'].drop_duplicates().head(20).tolist())

# Full class distributions of both label columns
print("\n=== FULL LABEL DISTRIBUTIONS ===")
print("Label column value counts (full):")
print(df['Label'].value_counts())

print("\nLabel.1 column value counts (full):")
print(df['Label.1'].value_counts())

# Investigate how the two label columns relate to each other
print("\n=== INVESTIGATING LABEL MAPPINGS ===")
print(f"Label has {df['Label'].nunique()} unique values")
print(f"Label.1 has {df['Label.1'].nunique()} unique values")

# Create a cross-tab to see the relationship
if df['Label'].nunique() < 20 and df['Label.1'].nunique() < 20:
    print("\nCross-tabulation of Label vs Label.1:")
    print(pd.crosstab(df['Label'], df['Label.1']))
else:
    print("\nToo many unique values for cross-tab (showing sample)")
    # Seeded so that re-running the cell shows the same sample.
    sample_df = df[['Label', 'Label.1']].sample(min(1000, len(df)), random_state=42)
    print(pd.crosstab(sample_df['Label'], sample_df['Label.1']))

# DECISION TIME: Which label to use?
# The summary is derived from the data instead of being hard-coded, so it
# cannot drift away from what the columns really contain.
print("\n=== DECIDING WHICH LABEL TO USE ===")
for position, column in enumerate(['Label', 'Label.1'], start=1):
    observed_values = sorted(df[column].astype(str).unique())
    print(f"{position}. {column} column: {len(observed_values)} classes -> {observed_values}")

# Show which (Label, Label.1) pairs occur in a seeded random sample.
# Only distinct pairs are printed: one line per sampled row would produce up
# to 100 mostly repeated lines.
print("\nChecking if Label is already encoded...")
sample_size = min(100, len(df))
sample = df[['Label', 'Label.1']].sample(sample_size, random_state=42)
for label_value, sub_label_value in sample.drop_duplicates().itertuples(index=False, name=None):
    print(f"Label: {label_value} -> Label.1: {sub_label_value}")

# Let's create a proper mapping
print("\n=== CREATING PROPER LABEL MAPPING ===")
# For every Label value: the most frequent Label.1 value (a true mode, not just
# the first row of the group), the number of distinct Label.1 values, and up to
# five examples. observed=True skips categories that have no rows.
label_mapping_df = df.groupby('Label', observed=True)['Label.1'].agg(
    most_common=lambda s: s.value_counts().idxmax(),
    num_unique='nunique',
    sample_values=lambda s: list(s.unique())[:5],
)
print(label_mapping_df)

# Label.1 is the column used as the classification target further down
print("\nBased on analysis, I recommend using Label.1 as it has the actual class names")

# Clean up the data: report duplicated column names, drop constant columns,
# then turn infinities into NaN and impute numeric NaNs with the column median.
print("\n=== DATA CLEANING ===")
print("Checking for duplicate columns...")
duplicate_columns = df.columns[df.columns.duplicated()]
print(f"Duplicate columns: {list(duplicate_columns)}")

# A column is constant when it holds exactly one distinct non-null value
distinct_per_column = df.nunique()
constant_columns = distinct_per_column.index[distinct_per_column == 1].tolist()
print(f"Constant columns: {constant_columns}")

if len(constant_columns) > 0:
    df = df.drop(columns=constant_columns)
    print(f"Dropped constant columns: {constant_columns}")

# Handle missing/infinite values
print("\n=== HANDLING MISSING/INFINITE VALUES ===")
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Infinities become NaN so that the imputation below treats them as missing
infinite_values = [np.inf, -np.inf]
for numeric_col in numeric_cols:
    df[numeric_col] = df[numeric_col].replace(infinite_values, np.nan)

nan_counts = df.isnull().sum()
if (nan_counts > 0).any():
    nan_cols = nan_counts[nan_counts > 0].index.tolist()
    print(f"Columns with NaN: {nan_cols}")

    # Only numeric columns are imputed; the median is taken per column
    numeric_cols_with_nan = [name for name in numeric_cols if nan_counts[name] > 0]
    for numeric_col in numeric_cols_with_nan:
        column_values = df[numeric_col]
        df[numeric_col] = column_values.fillna(column_values.median())

# Build the training target from Label.1: keep the raw text, normalise it
# (lower-case, stripped) and encode the normalised names as integers.
print("\n=== LABEL PROCESSING ===")
df['Label_original'] = df['Label.1'].astype(str)
df['Label_cleaned'] = df['Label_original'].str.lower().str.strip()

# Distribution of the normalised labels, most frequent first
print("\nCleaned label distribution:")
cleaned_counts = df['Label_cleaned'].value_counts()
for class_name, class_size in cleaned_counts.items():
    share = class_size / len(df) * 100
    print(f"  '{class_name}': {class_size} samples ({share:.2f}%)")

# Encode labels
label_encoder = LabelEncoder()
df['Label_encoded'] = label_encoder.fit_transform(df['Label_cleaned'])

print("\n=== FINAL LABEL ENCODING ===")
print("Class mapping:")
for class_id, class_name in enumerate(label_encoder.classes_):
    # Every encoder class is a value of Label_cleaned, so its size is
    # already available in cleaned_counts
    class_size = cleaned_counts.loc[class_name]
    share = class_size / len(df) * 100
    print(f"  {class_id}: '{class_name}' - {class_size} samples ({share:.2f}%)")

# Integer class id -> normalised class name
label_mapping = dict(enumerate(label_encoder.classes_))

# Prepare features
print("\n=== FEATURE PREPARATION ===")
# Exclude all label-related columns
label_cols = ['Label', 'Label.1', 'Label_original', 'Label_cleaned', 'Label_encoded']
exclude_cols = [col for col in label_cols if col in df.columns]

feature_cols = [col for col in df.columns if col not in exclude_cols]

X = df[feature_cols]
y = df['Label_encoded']

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Split data BEFORE scaling. Fitting the scaler on the full matrix lets the
# test rows influence the mean/std applied to the training rows (data leakage).
print("\n=== DATA SPLITTING ===")
class_counts = y.value_counts()
print("Class distribution:")
for class_id, count in class_counts.items():
    class_name = label_encoder.inverse_transform([class_id])[0]
    proportion = count / len(y) * 100
    print(f"  {class_id}: '{class_name}' - {count} samples ({proportion:.2f}%)")

# Use stratification if possible
if class_counts.min() >= 2:
    print("\nUsing stratified split")
    stratify_labels = y
else:
    print("\nUsing random split (some classes have < 2 samples)")
    stratify_labels = None

# Same test_size / random_state / stratify arguments as before; only the
# feature values handed to the split are now unscaled.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

print(f"\nTraining set: {X_train_raw.shape[0]} samples")
print(f"Testing set: {X_test_raw.shape[0]} samples")

# Scale features: statistics come from the training rows only and are then
# applied unchanged to the test rows and to the full matrix.
# NOTE(review): the median imputation in the earlier cleaning step still uses
# the full frame; move it after the split too if strict isolation is required.
print("\n=== FEATURE SCALING ===")
scaler = StandardScaler()
numeric_features = X.select_dtypes(include=[np.number]).columns

X_train = X_train_raw.copy()
X_test = X_test_raw.copy()
# Full feature matrix in the same (train-fitted) scale, kept under its
# original name for any later cell that reads it.
X_scaled = X.copy()

if len(numeric_features) > 0:
    X_train[numeric_features] = scaler.fit_transform(X_train_raw[numeric_features])
    X_test[numeric_features] = scaler.transform(X_test_raw[numeric_features])
    X_scaled[numeric_features] = scaler.transform(X[numeric_features])
    print(f"Scaled {len(numeric_features)} numeric features")
else:
    print("No numeric features to scale")

# Every row must land in exactly one of the two partitions
assert len(X_train) + len(X_test) == len(X_scaled), "train/test split lost or duplicated rows"

# Persist the split data together with the fitted scaler and label encoder,
# write a small CSV sample for manual inspection, and print a run summary.
print("\n=== SAVING DATA ===")
import pickle
from datetime import datetime

# Counts reused by the artefact dictionary and by both summaries below
n_classes = len(label_encoder.classes_)
n_train, n_features = X_train.shape
n_test = X_test.shape[0]

# The file name carries the creation time, so each run writes a new pickle
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'cicdarknet_preprocessed_{timestamp}.pkl'

preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'scaler': scaler,
    'label_encoder': label_encoder,
    'feature_names': X_train.columns.tolist(),
    'label_mapping': label_mapping,
    'num_classes': n_classes
}

with open(filename, 'wb') as pickle_file:
    pickle.dump(preprocessed_data, pickle_file)

print(f"Saved to: {filename}")
print("\n=== SUMMARY ===")
print(f"Original data: {df.shape}")
print(f"Features: {n_features}")
print(f"Classes: {n_classes}")
print(f"Training samples: {n_train}")
print(f"Test samples: {n_test}")

# Also save a CSV for inspection: the first 100 training rows with the
# encoded label and its decoded name
print("\nSaving sample for inspection...")
head_labels = y_train.iloc[:100]
sample_data = X_train.iloc[:100].assign(
    label=head_labels.values,
    label_name=label_encoder.inverse_transform(head_labels),
)
sample_data.to_csv('preprocessed_sample.csv', index=False)
print("Sample saved to 'preprocessed_sample.csv'")

# Quick summary
print("\n=== PREPROCESSING SUMMARY ===")
print(f"Original dataset shape: {df.shape}")
print(f"Number of classes: {n_classes}")
print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Training samples: {n_train}")
print(f"Testing samples: {n_test}")
print(f"Number of features: {n_features}")

=== INITIAL DATA INSPECTION ===
DataFrame shape: (103121, 79)
Memory usage: 23.60 MB

=== DATA TYPES DETAILED ===
Label column info:
  dtype: category
  type of dtype: <class 'pandas.core.dtypes.dtypes.CategoricalDtype'>
  is categorical? True
  is string? True

Label.1 column info:
  dtype: category
  type of dtype: <class 'pandas.core.dtypes.dtypes.CategoricalDtype'>
  is categorical? True
  is string? True

=== LABEL VALUES INSPECTION ===
First few rows of Label:
['Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor', 'Non-Tor']

First few rows of Label.1:
['AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING', 'AUDIO-STREAMING']

=== CHECKING FOR NUMERIC LABELS ===
Sample of unique Label values:
['Non-Tor', 'NonVPN', 'Tor', 'VPN']

Sample of unique Label.1 values:
['AUDIO-STREAMING', 'Browsing', 'Chat', 'Email', 'File-Transf