<a href="https://colab.research.google.com/github/shiri9/non-iid/blob/main/label_skew_statitistical-dirichlet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install TensorFlow and all dependencies explicitly compatible with TFF 0.87.0
%pip install tensorflow==2.15.0
%pip install tensorflow-federated==0.81.0
%pip install tensorflow-privacy==0.9.0
%pip install tensorflow-model-optimization==0.7.5
%pip install jax==0.4.14 jaxlib==0.4.14
%pip install google-vizier==0.1.11
%pip install dp-accounting==0.4.3
%pip install portpicker==1.6.0
%pip install scipy==1.9.3
%pip install numpy==1.25.2
%pip install protobuf==3.20.3
%pip install typing-extensions==4.7.1
%pip install googleapis-common-protos==1.61.0
%pip install dm-tree==0.1.8

In [1]:
!python --version

Python 3.11.12


In [1]:
!rm -rf /usr/local/lib/python3.11/dist-packages/jax_plugins

In [2]:
# Verify
import tensorflow as tf
import tensorflow_federated as tff

print("TF version:", tf.__version__)
print("TFF version:", tff.__version__)



TF version: 2.14.1
TFF version: 0.81.0


In [3]:
#cell 1
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from google.colab import drive

# Mount Google Drive to access data files
drive.mount('/content/drive')

# Load datasets
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kdd_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kdd_test.csv')

# Define label mapping for attack categories (including all labels from train and test sets)
attack_mapping = {
    'normal': 0, 'neptune': 1, 'land': 1, 'back': 1, 'teardrop': 1, 'pod': 1, 'smurf': 1,
    'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'satan': 2,
    'mailbomb': 1, 'apache2': 1, 'processtable': 1,  # Missing DoS labels in test set
    'phf': 3, 'multihop': 3, 'warezclient': 3, 'warezmaster': 3, 'spy': 3, 'ftp_write': 3,
    'guess_passwd': 3, 'imap': 3,
    'buffer_overflow': 4, 'loadmodule': 4, 'perl': 4, 'rootkit': 4,
    # Ensure all test labels are included
    'mscan': 2, 'saint': 2, 'snmpgetattack': 3, 'snmpguess': 3, 'xlock': 3, 'xsnoop': 3,
    'httptunnel': 3, 'ps': 4, 'xterm': 4,
    'sendmail': 3, 'named': 3  # Missing labels in test set
}

# Apply the label mapping
df_train['labels'] = df_train['labels'].replace(attack_mapping)
df_test['labels'] = df_test['labels'].replace(attack_mapping)

# Verify the unique labels after mapping
print("Unique labels in train set:", df_train['labels'].unique())
print("Unique labels in test set:", df_test['labels'].unique())

# Dropping the irrelevant column 'num_outbound_cmds'
df_train = df_train.drop('num_outbound_cmds', axis=1)
df_test = df_test.drop('num_outbound_cmds', axis=1)

# Encoding categorical columns: 'protocol_type', 'service', 'flag'
categorical_columns = ['protocol_type', 'service', 'flag']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])  # Important: use transform for test set, not fit_transform

# Scaling numerical columns
numerical_columns = [
    'duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_srv_diff_host_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'hot', 'num_compromised', 'num_root'
]

scaler = MinMaxScaler()
df_train[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])
df_test[numerical_columns] = scaler.transform(df_test[numerical_columns])

# Convert to NumPy arrays and enforce correct types for TensorFlow
X_train = np.array(df_train.drop('labels', axis=1)).astype(np.float32)
y_train = np.array(df_train['labels']).astype(np.int32)

X_test = np.array(df_test.drop('labels', axis=1)).astype(np.float32)
y_test = np.array(df_test['labels']).astype(np.int32)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

# Check dataset shapes
print("Train dataset shape:", X_train.shape, y_train.shape)
print("Test dataset shape:", X_test.shape, y_test.shape)


Mounted at /content/drive


  df_train['labels'] = df_train['labels'].replace(attack_mapping)
  df_test['labels'] = df_test['labels'].replace(attack_mapping)


Unique labels in train set: [0 1 3 2 4]
Unique labels in test set: [0 2 1 3 4]
Train dataset shape: (125973, 40) (125973,)
Test dataset shape: (22544, 40) (22544,)


In [4]:
#cell2
import numpy as np
import pandas as pd
import tensorflow as tf

# Number of partitions (clients)
num_partitions = 10

# Define minimum number of samples to ensure visibility for each class
min_samples_per_class = 50  # Set this to a value that ensures visibility

# Get unique classes/labels in the dataset (assuming 5 classes)
unique_labels_noniid = df_train['labels'].unique()
num_classes = len(unique_labels_noniid)

# Assign classes to each client. Adjusted for 5 classes
# Each client will observe a subset of the available classes
client_class_map = {
    0: [0, 1],  # Client 1 observes classes 0 and 1
    1: [0, 2],  # Client 2 observes classes 0 and 2
    2: [0, 3],  # Client 3 observes classes 0 and 3
    3: [0, 4],  # Client 4 observes classes 0 and 4
    4: [1, 2],  # Client 5 observes classes 1 and 2
    5: [1, 3],  # Client 6 observes classes 1 and 3
    6: [1, 4],  # Client 7 observes classes 1 and 4
    7: [2, 3],  # Client 8 observes classes 2 and 3
    8: [2, 4],  # Client 9 observes classes 2 and 4
    9: [3, 4]   # Client 10 observes classes 3 and 4
}

# Initialize list to store partitions
data_partitions = []

# For each partition (client), we only sample data from the assigned classes
for i in range(num_partitions):
    partition = pd.DataFrame()

    # Get the classes this client is supposed to observe
    client_classes = client_class_map[i]

    # Iterate through each class assigned to the current client
    for label in client_classes:
        class_data = df_train[df_train['labels'] == label]

        # Generate a random proportion for the current partition and class
        proportion = np.random.uniform(0.05, 0.5)  # Random proportion between 5% and 50%

        # Ensure at least `min_samples_per_class` samples are included for each class
        num_samples = max(min_samples_per_class, int(len(class_data) * proportion))

        # Randomly sample this number of instances from the class data
        sampled_data = class_data.sample(n=num_samples, replace=False)

        # Append the sampled data to the current partition
        partition = pd.concat([partition, sampled_data])

    # Shuffle the partition data and reset the index
    partition = partition.sample(frac=1).reset_index(drop=True)

    # Add the partition to the list
    data_partitions.append(partition)

    # Display class distribution in this partition
    print(f"Partition {i+1} class distribution:")
    print(partition['labels'].value_counts())
    print()


Partition 1 class distribution:
labels
0    27046
1     3766
Name: count, dtype: int64

Partition 2 class distribution:
labels
0    26311
2     5139
Name: count, dtype: int64

Partition 3 class distribution:
labels
0    4915
3     165
Name: count, dtype: int64

Partition 4 class distribution:
labels
0    14793
4       50
Name: count, dtype: int64

Partition 5 class distribution:
labels
1    15258
2     3878
Name: count, dtype: int64

Partition 6 class distribution:
labels
1    18415
3       57
Name: count, dtype: int64

Partition 7 class distribution:
labels
1    8050
4      50
Name: count, dtype: int64

Partition 8 class distribution:
labels
2    4521
3     491
Name: count, dtype: int64

Partition 9 class distribution:
labels
2    2226
4      50
Name: count, dtype: int64

Partition 10 class distribution:
labels
3    95
4    50
Name: count, dtype: int64



In [5]:

# ## Cell3: Create Label-Skew Partitions

import numpy as np
import pandas as pd

# Configuration
NUM_CLIENTS = 10
CLASS_MAPPING = {'Benign': 0, 'DoS': 1, 'Probe': 2, 'U2R': 3, 'R2L': 4}
MIN_SAMPLES_PER_CLASS = 50  # Prevent class starvation

# Label distribution per client (matches your paper's setup)
client_class_map = {
    0: ['Benign', 'DoS'],
    1: ['Benign', 'Probe'],
    2: ['Benign', 'U2R'],
    3: ['Benign', 'R2L'],
    4: ['DoS', 'Probe'],
    5: ['DoS', 'U2R'],
    6: ['DoS', 'R2L'],
    7: ['Probe', 'U2R'],
    8: ['Probe', 'R2L'],
    9: ['U2R', 'R2L']
}

data_partitions = []
for client_id in range(NUM_CLIENTS):
    client_partition = pd.DataFrame()
    classes = client_class_map[client_id]

    for class_name in classes:
        label = CLASS_MAPPING[class_name]
        class_data = df_train[df_train['labels'] == label]

        # Dynamic sampling with minimum guarantee
        proportion = np.random.uniform(0.1, 0.4)  # 10-40% of class data
        num_samples = max(MIN_SAMPLES_PER_CLASS, int(len(class_data) * proportion))

        client_partition = pd.concat([
            client_partition,
            class_data.sample(n=num_samples, random_state=42+client_id)
        ])

    # Shuffle and store
    data_partitions.append(
        client_partition.sample(frac=1, random_state=42).reset_index(drop=True)
    )

    # Verification
    print(f"\nClient {client_id+1} Distribution:")
    print(client_partition['labels'].value_counts().sort_index())


Client 1 Distribution:
labels
0    15098
1    17091
Name: count, dtype: int64

Client 2 Distribution:
labels
0    20927
2     2774
Name: count, dtype: int64

Client 3 Distribution:
labels
0    7598
3     205
Name: count, dtype: int64

Client 4 Distribution:
labels
0    7355
4      50
Name: count, dtype: int64

Client 5 Distribution:
labels
1    8762
2    4268
Name: count, dtype: int64

Client 6 Distribution:
labels
1    12575
3      389
Name: count, dtype: int64

Client 7 Distribution:
labels
1    7528
4      50
Name: count, dtype: int64

Client 8 Distribution:
labels
2    2738
3     166
Name: count, dtype: int64

Client 9 Distribution:
labels
2    2232
4      50
Name: count, dtype: int64

Client 10 Distribution:
labels
3    131
4     50
Name: count, dtype: int64


In [6]:

# ## Cell4 : Create TensorFlow Datasets (Final Corrected Version)

import numpy as np
import tensorflow as tf

# Configuration
batch_size = 32
SEED = 42  # For reproducible shuffling

train_datasets = []
val_datasets = []

for client_id, partition in enumerate(data_partitions):
    # ========== Exact 90/10 Split ==========
    total_samples = len(partition)
    train_samples = (total_samples // 10) * 9  # Exact 90%
    val_samples = total_samples - train_samples  # Exact 10%

    # Shuffle with client-specific seed
    shuffled_partition = partition.sample(frac=1, random_state=SEED+client_id).reset_index(drop=True)

    # Split into train/val
    train_part = shuffled_partition.iloc[:train_samples]
    val_part = shuffled_partition.iloc[train_samples:]

    # ========== Feature/Label Conversion ==========
    # Training data
    train_features = train_part.drop(columns=['labels']).values.astype(np.float32)
    train_labels = train_part['labels'].values.astype(np.int32)

    # Validation data
    val_features = val_part.drop(columns=['labels']).values.astype(np.float32)
    val_labels = val_part['labels'].values.astype(np.int32)

    # ========== Dataset Creation ==========
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_features, train_labels)
    ).batch(batch_size)

    val_dataset = tf.data.Dataset.from_tensor_slices(
        (val_features, val_labels)
    ).batch(batch_size)

    # Store datasets
    train_datasets.append(train_dataset)
    val_datasets.append(val_dataset)

    # ========== Verification ==========
    print(f"Client {client_id+1}:")
    print(f"  Train: {len(train_part)} samples | Classes: {np.unique(train_labels)}")
    print(f"  Val: {len(val_part)} samples | Classes: {np.unique(val_labels)}\n")

# ========== Test Dataset ==========
test_features = df_test.drop(columns=['labels']).values.astype(np.float32)
test_labels = df_test['labels'].values.astype(np.int32)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_features, test_labels)
).batch(batch_size)

print("=== Final Verification ===")
print(f"Total training clients: {len(train_datasets)}")
print(f"Test samples: {len(test_labels)}")
print(f"Test features shape: {test_features.shape}")

Client 1:
  Train: 28962 samples | Classes: [0 1]
  Val: 3227 samples | Classes: [0 1]

Client 2:
  Train: 21330 samples | Classes: [0 2]
  Val: 2371 samples | Classes: [0 2]

Client 3:
  Train: 7020 samples | Classes: [0 3]
  Val: 783 samples | Classes: [0 3]

Client 4:
  Train: 6660 samples | Classes: [0 4]
  Val: 745 samples | Classes: [0 4]

Client 5:
  Train: 11727 samples | Classes: [1 2]
  Val: 1303 samples | Classes: [1 2]

Client 6:
  Train: 11664 samples | Classes: [1 3]
  Val: 1300 samples | Classes: [1 3]

Client 7:
  Train: 6813 samples | Classes: [1 4]
  Val: 765 samples | Classes: [1 4]

Client 8:
  Train: 2610 samples | Classes: [2 3]
  Val: 294 samples | Classes: [2 3]

Client 9:
  Train: 2052 samples | Classes: [2 4]
  Val: 230 samples | Classes: [2 4]

Client 10:
  Train: 162 samples | Classes: [3 4]
  Val: 19 samples | Classes: [3 4]

=== Final Verification ===
Total training clients: 10
Test samples: 22544
Test features shape: (22544, 40)


In [7]:

# ## Cell5: Centralized Training (Label Skew Version)

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

def centralized_training(seed=42):
    tf.keras.utils.set_random_seed(seed)

    # 1. Combine all client partitions (NEW FOR LABEL SKEW)
    full_train = pd.concat(data_partitions).sample(frac=1, random_state=seed)
    full_train_features = full_train.drop('labels', axis=1).values.astype(np.float32)
    full_train_labels = full_train['labels'].values.astype(np.int32)

    # 2. Create model (same architecture as FL)
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(40,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')
    ])

    # 3. Training with metrics tracking
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])

    history = model.fit(
        full_train_features, full_train_labels,
        epochs=30,
        batch_size=32,
        validation_data=(test_features, test_labels),  # Use your existing test data
        verbose=0
    )

    # 4. Final evaluation
    y_pred = np.argmax(model.predict(test_features), axis=1)

    return {
        'seed': seed,
        'train_loss': history.history['loss'],
        'val_loss': history.history['val_loss'],
        'test_accuracy': history.history['val_accuracy'][-1],
        'test_precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'test_recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'test_f1': f1_score(y_test, y_pred, average='macro', zero_division=0)
    }

# Run with multiple seeds
results_cl = [centralized_training(seed=s) for s in [42, 123, 456]]

# Generate report
report_cl = pd.DataFrame(results_cl)
print("\nCentralized Training Results (Label Skew Scenario):")
display(report_cl[['seed', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']])


Centralized Training Results (Label Skew Scenario):


Unnamed: 0,seed,test_accuracy,test_precision,test_recall,test_f1
0,42,0.926056,0.933264,0.68701,0.737093
1,123,0.924015,0.865954,0.685259,0.717707
2,456,0.925169,0.852854,0.696172,0.717019


In [9]:
# ## Cell6: Federated Learning with Statistical Significance
import tensorflow as tf
import tensorflow_federated as tff
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import collections
from scipy import stats

# ======================
# 1. Data Preprocessing
# ======================

def preprocess(dataset):
    def batch_format_fn(features, labels):
        return collections.OrderedDict(
            x=tf.reshape(features, [-1, 40]),  # Flatten features
            y=tf.reshape(labels, [-1])  # Reshape labels
        )
    padded_shapes = ([None, 40], [None])
    return dataset.padded_batch(32, padded_shapes=padded_shapes).map(batch_format_fn).prefetch(tf.data.experimental.AUTOTUNE)

# ======================
# 2. Core FL Functions
# ======================

def create_keras_model():
    return tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(40,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')
    ])

def make_federated_data(client_data, client_ids):
    """Global function to create federated datasets"""
    return [
        preprocess(client_data[i])
        for i in client_ids
        if len(list(client_data[i])) > 0  # Filter empty datasets
    ]

def run_fl_trial(seed=42, num_rounds=30):
    """Run complete FL pipeline with specified seed"""
    # Set all seeds
    tf.keras.utils.set_random_seed(seed)
    np.random.seed(seed)

    # Model function
    def model_fn():
        return tff.learning.models.from_keras_model(
            create_keras_model(),
            input_spec=federated_train_data[0].element_spec,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

    # Build training process
    training_process = tff.learning.algorithms.build_weighted_fed_avg(
        model_fn,
        client_optimizer_fn=lambda: tf.keras.optimizers.Adam(0.001),
        server_optimizer_fn=lambda: tf.keras.optimizers.Adam(0.01)
    )

    # Training loop
    state = training_process.initialize()
    for _ in range(num_rounds):
        state = training_process.next(state, federated_train_data).state

    # Evaluation
    eval_model = create_keras_model()
    eval_model.set_weights(list(training_process.get_model_weights(state).trainable))
    y_pred = np.argmax(eval_model.predict(test_features), axis=1)


    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f1': f1_score(y_test, y_pred, average='macro', zero_division=0)
    }

# ======================
# 3. Execution & Analysis
# ======================

# Configuration
NUM_CLIENTS = 10
SEEDS = [42, 123, 456]

# Create federated data (ensure train_datasets exists)
federated_train_data = make_federated_data(train_datasets, list(range(NUM_CLIENTS)))

# Run trials
fl_results = [run_fl_trial(seed=s) for s in SEEDS]

# Statistical comparison with centralized results (assuming results_cl exists)
def print_stat_comparison(cl_results, fl_results):
    for metric in ['accuracy', 'precision', 'recall', 'f1']:
        cl_values = [r[f'test_{metric}'] for r in cl_results]
        fl_values = [r[metric] for r in fl_results]
        t_stat, p_value = stats.ttest_ind(cl_values, fl_values)

        print(f"\n{metric.upper():<10} CL: {np.mean(cl_values):.4f} ± {np.std(cl_values):.4f}")
        print(f"{'FL:':<10} {np.mean(fl_values):.4f} ± {np.std(fl_values):.4f}")
        print(f"{'p-value:':<10} {p_value:.4e}{'*' if p_value < 0.05 else ''}")

print("\n=== Statistical Significance ===")
print_stat_comparison(results_cl, fl_results)


=== Statistical Significance ===

ACCURACY   CL: 0.9251 ± 0.0008
FL:        0.8724 ± 0.0328
p-value:   8.5748e-02

PRECISION  CL: 0.8840 ± 0.0352
FL:        0.5449 ± 0.0063
p-value:   1.7914e-04*

RECALL     CL: 0.6895 ± 0.0048
FL:        0.4811 ± 0.0665
p-value:   1.1487e-02*

F1         CL: 0.7239 ± 0.0093
FL:        0.4791 ± 0.0770
p-value:   1.1147e-02*


In [10]:
import numpy as np
unique_labels, counts = np.unique(y_test, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f"Class {label}: {count} samples ({count/len(y_test):.1%})")

Class 0: 11245 samples (49.9%)
Class 1: 8095 samples (35.9%)
Class 2: 2157 samples (9.6%)
Class 3: 1009 samples (4.5%)
Class 4: 38 samples (0.2%)
