In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# 2. Load dataset
# The path is relative to the working directory the notebook is run from.
df = pd.read_csv('../data/sample_logs.csv')
print(f"Dataset shape: {df.shape}")

# 3. Data overview
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray
# "None" line after the summary.
df.info()
print(df.describe())

# 4. Check class distribution
# Bar chart of the number of rows per distinct value of the 'label' column.
# The whole section is skipped when the dataset has no such column.
if 'label' in df.columns:
    class_counts = df['label'].value_counts()

    plt.figure(figsize=(10, 6))
    # Series.plot draws onto the current axes of the figure created above
    # and hands those axes back, so the labels can be set on them directly.
    class_ax = class_counts.plot(kind='bar')
    class_ax.set_title('Distribution of Normal vs Threat Events')
    class_ax.set_xlabel('Class')
    class_ax.set_ylabel('Count')
    plt.show()

# 5. Feature correlation analysis
# Pairwise correlation between every numeric column, rendered as a heatmap.
# `numeric_cols` and `correlation_matrix` stay at module level because later
# sections reuse them.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

plt.figure(figsize=(12, 10))
# Diverging colormap centred on zero so positive and negative correlations
# are visually symmetric; cell annotations are left off.
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.show()

# 6. Temporal analysis (if timestamp exists)
# Line plot of how many events occur in each hour of the day. Adds an 'hour'
# column to `df` as a side effect.
if 'timestamp' in df.columns:
    df['hour'] = pd.to_datetime(df['timestamp']).dt.hour

    # groupby().size() only yields hours that actually occur in the data.
    # Reindex over the full 0-23 range so that quiet hours are plotted as
    # zero instead of being skipped (which would make the line interpolate
    # straight across the gap and hide the absence of events).
    events_per_hour = (
        df.groupby('hour')
        .size()
        .reindex(range(24), fill_value=0)
    )

    plt.figure(figsize=(12, 6))
    events_per_hour.plot()
    plt.title('Event Frequency by Hour')
    plt.xlabel('Hour of Day')
    plt.ylabel('Number of Events')
    plt.grid(True)
    plt.show()

# 7. Threat patterns visualization
# Stacked histograms comparing normal vs threat rows for a handful of key
# features, laid out on a 2x2 grid.
if 'label' in df.columns:
    # NOTE(review): assumes labels are encoded as 1 (threat) / 0 (normal);
    # with any other encoding both subsets are empty — confirm against the data.
    threat_data = df[df['label'] == 1]
    normal_data = df[df['label'] == 0]

    # Compare distributions of key features. Only columns that actually exist
    # are plotted, so a dataset missing one of them no longer raises KeyError.
    key_features = ['duration', 'src_bytes', 'dst_bytes', 'count']
    available_features = [col for col in key_features if col in df.columns]

    # Skip the figure entirely when none of the key features are present.
    if available_features:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        # ravel() walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
        flat_axes = axes.ravel()

        for ax, col in zip(flat_axes, available_features):
            ax.hist([normal_data[col].dropna(), threat_data[col].dropna()],
                    bins=30, alpha=0.7, label=['Normal', 'Threat'], stacked=True)
            ax.set_title(f'Distribution of {col}')
            ax.legend()
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide grid cells left without a feature so no empty frames are drawn.
        for unused_ax in flat_axes[len(available_features):]:
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()

# 8. Outlier detection visualization
# Flags anomalous rows with an Isolation Forest and shows them on a 2-D PCA
# projection of the numeric features. Adds an 'outlier_score' column to `df`.
from sklearn.ensemble import IsolationForest

# Exclude the ground-truth 'label' so the unsupervised detector and the
# projection are not fed the answer, and exclude 'outlier_score' so a
# previous run's result is not reused as an input when the notebook is
# re-executed.
excluded_cols = ('label', 'outlier_score')
feature_cols = [col for col in numeric_cols if col not in excluded_cols]

# Build the NaN-filled feature matrix once; both models consume the same data.
features = df[feature_cols].fillna(0)

iso_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = iso_forest.fit_predict(features)

# fit_predict returns a class flag per row (-1 = outlier, 1 = inlier), not a
# continuous score; the column name is kept for compatibility.
df['outlier_score'] = outliers

# Visualize outliers in 2D (first two principal components)
from sklearn.decomposition import PCA

# Seeded like the Isolation Forest so the plot is reproducible even when
# scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(features)

plt.figure(figsize=(10, 8))
plt.scatter(pca_result[:, 0], pca_result[:, 1],
            c=df['outlier_score'], cmap='coolwarm', alpha=0.6)
plt.title('PCA Visualization of Outliers')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Outlier Score')
plt.show()