# Lab 02: Malware Sample Clustering

Use unsupervised learning to cluster malware samples by their static file features (entropy, imports, sections, size) and identify families.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab02_malware_clustering.ipynb)

## Learning Objectives
- Feature extraction from malware samples
- K-Means and DBSCAN clustering (hierarchical clustering is left as a follow-up exercise)
- Dimensionality reduction (PCA, t-SNE)
- Cluster evaluation metrics

In [None]:
# Install dependencies (uncomment for Colab)
# !pip install scikit-learn pandas numpy matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style to every figure below
plt.style.use('seaborn-v0_8-whitegrid')
# Make 'husl' the default seaborn colour palette
sns.set_palette('husl')
# Seed NumPy's global RNG so the synthetic data is reproducible across runs
np.random.seed(42)

## 1. Load and Explore Malware Features

In [None]:
# Sample malware feature dataset (synthetic: values are drawn from random
# distributions, no real binaries are parsed)
np.random.seed(42)
n_samples = 200

# Generate synthetic malware samples with different families
families = ['Emotet', 'TrickBot', 'Ryuk', 'Dridex', 'QBot']
data = {
    'sha256': [f'sample_{i:04d}' for i in range(n_samples)],
    'family': np.random.choice(families, n_samples),
    'file_size': np.random.lognormal(12, 1.5, n_samples).astype(int),
    'entropy': np.random.uniform(5.0, 8.0, n_samples),
    'num_imports': np.random.randint(10, 500, n_samples),
    'num_sections': np.random.randint(3, 12, n_samples),
    'has_debug': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
    'has_signature': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
}
df = pd.DataFrame(data)

# Add family-specific characteristics.
# Each family's rows are overwritten in one masked assignment. The draws pass
# an explicit size so they come back as ndarrays: a scalar draw such as
# np.random.lognormal(14, 0.5) is a plain Python float and has no .astype().
is_emotet = (df['family'] == 'Emotet').to_numpy()
n_emotet = int(is_emotet.sum())
df.loc[is_emotet, 'entropy'] = np.random.uniform(7.0, 7.8, n_emotet)
df.loc[is_emotet, 'num_imports'] = np.random.randint(200, 400, n_emotet)

is_ryuk = (df['family'] == 'Ryuk').to_numpy()
n_ryuk = int(is_ryuk.sum())
df.loc[is_ryuk, 'file_size'] = np.random.lognormal(14, 0.5, n_ryuk).astype(int)
df.loc[is_ryuk, 'entropy'] = np.random.uniform(7.5, 7.99, n_ryuk)

print(f"Dataset shape: {df.shape}")
print("\nFamily distribution:")
print(df['family'].value_counts())

In [None]:
# Overlay per-family histograms of four numeric features on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
hist_features = ['entropy', 'num_imports', 'file_size', 'num_sections']

for ax, feature in zip(axes.flatten(), hist_features):
    # One semi-transparent histogram per family so overlapping ranges stay visible
    for family in families:
        in_family = df['family'] == family
        ax.hist(df.loc[in_family, feature], alpha=0.5, label=family, bins=20)
    ax.set_title(f'{feature} Distribution by Family')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.legend()

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
# Build the numeric feature matrix used by every clustering step below

# file_size is highly skewed, so cluster on its log1p transform instead
df['log_file_size'] = np.log1p(df['file_size'])

# Column order defines the column order of X / X_scaled
feature_cols = [
    'entropy',
    'num_imports',
    'num_sections',
    'has_debug',
    'has_signature',
    'log_file_size',
]

# Raw feature matrix as a NumPy array
X = df[feature_cols].to_numpy()

# Standardize each column with StandardScaler before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Features: {feature_cols}")

## 3. Dimensionality Reduction

In [None]:
# Project the standardized features down to 2-D twice (PCA and t-SNE) for plotting

# Linear projection onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total variance explained: {sum(variance_ratio):.2%}")

# Non-linear t-SNE embedding; random_state is fixed so reruns give the same layout
tsne = TSNE(perplexity=30, n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

In [None]:
# Scatter both 2-D projections side by side, coloured by the true family label
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, coordinates, title, x-label, y-label) for each panel
projections = [
    (axes[0], X_pca, 'PCA Projection', 'PC1', 'PC2'),
    (axes[1], X_tsne, 't-SNE Projection', 't-SNE 1', 't-SNE 2'),
]

for ax, coords, title, xlabel, ylabel in projections:
    # One scatter call per family so each gets its own legend entry
    for family in families:
        mask = df['family'] == family
        ax.scatter(coords[mask, 0], coords[mask, 1], label=family, alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

## 4. Clustering with K-Means

In [None]:
# Sweep k = 2..10, recording K-Means inertia (elbow method) and silhouette score
k_range = range(2, 11)
inertias, silhouettes = [], []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    labels = kmeans.labels_
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, labels))

# Plot both curves against k
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axis, y-values, line format, y-label, title) for each panel
panels = [
    (axes[0], inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (axes[1], silhouettes, 'go-', 'Silhouette Score', 'Silhouette Score'),
]
for ax, scores, line_fmt, ylabel, title in panels:
    ax.plot(k_range, scores, line_fmt)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score wins
best_index = int(np.argmax(silhouettes))
optimal_k = k_range[best_index]
print(f"Optimal k based on silhouette score: {optimal_k}")

In [None]:
# Apply K-Means with one cluster per known malware family.
# NOTE: k is tied to the number of families (5 here), not to the
# silhouette-based optimal_k, so the clusters can be compared against the
# ground-truth family labels.
n_clusters = len(families)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['kmeans_cluster'] = kmeans.fit_predict(X_scaled)

print("K-Means Cluster Distribution:")
print(df['kmeans_cluster'].value_counts().sort_index())

## 5. Clustering with DBSCAN

In [None]:
# Density-based clustering; DBSCAN labels points it cannot assign as -1 (noise)
dbscan = DBSCAN(eps=0.8, min_samples=5).fit(X_scaled)
df['dbscan_cluster'] = dbscan.labels_

cluster_sizes = df['dbscan_cluster'].value_counts().sort_index()
print("DBSCAN Cluster Distribution:")
print(cluster_sizes)

noise_count = df['dbscan_cluster'].eq(-1).sum()
print(f"\nNoise points (label=-1): {noise_count}")
print(f"\nNoise points (label=-1): {(df['dbscan_cluster'] == -1).sum()}")

## 6. Evaluate Clustering Results

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode true family names as integers for the label-based metric (ARI)
le = LabelEncoder()
true_labels = le.fit_transform(df['family'])

# K-Means metrics: silhouette (internal) and ARI (against the true families)
kmeans_silhouette = silhouette_score(X_scaled, df['kmeans_cluster'])
kmeans_ari = adjusted_rand_score(true_labels, df['kmeans_cluster'])

# DBSCAN (excluding noise)
dbscan_mask = df['dbscan_cluster'] != -1
keep = dbscan_mask.to_numpy()  # plain boolean array for indexing NumPy arrays
dbscan_labels = df.loc[dbscan_mask, 'dbscan_cluster']
n_clustered = int(keep.sum())
n_dbscan_clusters = dbscan_labels.nunique()

# ARI only needs at least two labelled points
if n_clustered > 1:
    dbscan_ari = adjusted_rand_score(true_labels[keep], dbscan_labels)
else:
    dbscan_ari = 0

# silhouette_score raises ValueError unless 2 <= n_clusters <= n_samples - 1,
# so checking the point count alone is not enough: DBSCAN can return a single
# non-noise cluster. Fall back to 0 when the score is undefined.
if 2 <= n_dbscan_clusters <= n_clustered - 1:
    dbscan_silhouette = silhouette_score(X_scaled[keep], dbscan_labels)
else:
    dbscan_silhouette = 0

print("Clustering Evaluation:")
print("=" * 40)
print(f"K-Means Silhouette Score: {kmeans_silhouette:.3f}")
print(f"K-Means Adjusted Rand Index: {kmeans_ari:.3f}")
print(f"\nDBSCAN Silhouette Score: {dbscan_silhouette:.3f}")
print(f"DBSCAN Adjusted Rand Index: {dbscan_ari:.3f}")

In [None]:
# Visualize clustering results on the t-SNE embedding:
# true families, K-Means labels and DBSCAN labels side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# True labels (one scatter call per family so each gets a legend entry)
for family in families:
    mask = df['family'] == family
    axes[0].scatter(X_tsne[mask, 0], X_tsne[mask, 1], label=family, alpha=0.7)
axes[0].set_title('True Malware Families')
axes[0].legend()

# K-Means clusters; k in the title is read from the fitted labels so it cannot
# drift from the value actually used for clustering
n_kmeans_clusters = df['kmeans_cluster'].nunique()
scatter = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=df['kmeans_cluster'], cmap='viridis', alpha=0.7)
axes[1].set_title(f'K-Means Clusters (k={n_kmeans_clusters})')
plt.colorbar(scatter, ax=axes[1])

# DBSCAN clusters (label -1, the noise points, takes the lowest colour on the scale)
scatter = axes[2].scatter(X_tsne[:, 0], X_tsne[:, 1], c=df['dbscan_cluster'], cmap='viridis', alpha=0.7)
axes[2].set_title('DBSCAN Clusters')
plt.colorbar(scatter, ax=axes[2])

plt.tight_layout()
plt.show()

## 7. Cluster Analysis

In [None]:
# Report, for each K-Means cluster, its family mix and two feature averages
print("Cluster Composition (K-Means):")
print("=" * 50)

# groupby iterates the clusters in ascending label order
for cluster_id, cluster_data in df.groupby('kmeans_cluster'):
    family_counts = cluster_data['family'].value_counts()
    avg_entropy = cluster_data['entropy'].mean()
    avg_imports = cluster_data['num_imports'].mean()

    print(f"\nCluster {cluster_id} ({len(cluster_data)} samples):")
    print(family_counts.to_string())
    print(f"  Avg Entropy: {avg_entropy:.2f}")
    print(f"  Avg Imports: {avg_imports:.0f}")

## Summary

In this lab, we:
- Built a feature set for (synthetic) malware samples: entropy, imports, sections, file size, and debug/signature flags
- Applied dimensionality reduction (PCA, t-SNE) for visualization
- Clustered samples using K-Means and DBSCAN
- Evaluated clustering quality with silhouette score and ARI

### Key Insights:
- **High entropy** often indicates packed/encrypted malware
- **Import patterns** can distinguish malware families
- **t-SNE** provides better visual separation than PCA
- **DBSCAN** can identify outliers (noise points)

### Next Steps:
1. Add more features (strings, API calls, PE headers)
2. Try hierarchical clustering for dendrogram visualization
3. Build a classification model using cluster labels