In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data from output.csv
data = pd.read_csv('/home/kavi/Code/PacketMasti/nit_research/output/output-small.csv')

# Split features and labels
feature_df = data.drop(data.columns[:2], axis=1)
X = np.asarray(feature_df)
y = np.asarray(data['pii_exist'])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y))
y_train_one_hot = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_one_hot = tf.one_hot(y_test, depth=num_classes).numpy()

# Define function to create model
def create_model(num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Create a list of models representing each client
num_clients = 10
client_models = [create_model(num_classes) for _ in range(num_clients)]

# Define a function for training on each client's data
def train_on_client(X, y, model):
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=32, verbose=0)
    return model

# Train each client's model
for i, model in enumerate(client_models):
    X_client_train, _, y_client_train, _ = train_test_split(X_train, y_train_one_hot, test_size=0.8, random_state=i)  # Use a portion of the training data for each client
    client_models[i] = train_on_client(X_client_train, y_client_train, model)

# Aggregate weights of all client models to update the global model
global_model = create_model(num_classes)
for model in client_models:
    global_model.set_weights([(w1 + w2) / 2 for w1, w2 in zip(global_model.get_weights(), model.get_weights())])
    global_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
    y_pred = np.argmax(global_model.predict(X_test), axis=1)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("F1 Score:", f1)

# Compile the global model
global_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

# Predict classes using the global model
y_pred = np.argmax(global_model.predict(X_test), axis=1)

# Calculate confusion matrix and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)


2024-04-27 20:00:49.259388: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-27 20:00:49.296722: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-27 20:00:49.806350: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FileNotFoundError: [Errno 2] No such file or directory: '/home/kavi/Code/PacketMasti/nit_research/output/output-small.csv'

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data from output.csv
data = pd.read_csv('../output/output.csv')

# Split features and labels
feature_df = data.drop(data.columns[:2], axis=1)
X = np.asarray(feature_df)
y = np.asarray(data['pii_exist'])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y))
y_train_one_hot = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_one_hot = tf.one_hot(y_test, depth=num_classes).numpy()

# Define SVM model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_hinge',
              metrics=['accuracy'])

# Define privacy parameters
epsilon = 20 # Privacy budget
sensitivity = 1  # Sensitivity of the gradients

# Define a function for training on each client's data with differential privacy
def train_on_client_dp(X, y, model, epsilon):
    model.compile(optimizer='adam',
                  loss='categorical_hinge',
                  metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=32, verbose=0)

    # Add noise to the gradients
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.Dense):
            for weight in layer.trainable_variables:
                noise = tf.random.normal(shape=weight.shape, stddev=sensitivity / epsilon)
                weight.assign_add(noise)

    return model

# Federated learning loop with differential privacy
client_models = []  # List to store models of individual clients
for i in range(10):  # 10 clients
    # Divide the training data into 10 parts
    start_index = int(i * len(X_train) / 10)
    end_index = int((i + 1) * len(X_train) / 10)
    X_client_train = X_train[start_index:end_index]
    y_client_train = y_train_one_hot[start_index:end_index]

    # Train client model on its data with differential privacy
    client_model = train_on_client_dp(X_client_train, y_client_train, model, epsilon)
    client_models.append(client_model)  # Store the trained model for aggregation

    # Evaluate client model on the test set
    y_pred = np.argmax(client_model.predict(X_test), axis=1)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Client", i+1, "F1 Score:", f1)

# Aggregate weights of all client models to update the global model
global_model = tf.keras.models.clone_model(model)  # Create a copy of the original model
for client_model in client_models:
    for global_layer, client_layer in zip(global_model.layers, client_model.layers):
        global_layer_weights = global_layer.get_weights()
        client_layer_weights = client_layer.get_weights()
        aggregated_weights = [(w1 + w2) / 2 for w1, w2 in zip(global_layer_weights, client_layer_weights)]
        global_layer.set_weights(aggregated_weights)

# Predict classes using the global model
y_pred = np.argmax(global_model.predict(X_test), axis=1)

# Calculate confusion matrix and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Client 1 F1 Score: 0.8411420563362862
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Client 2 F1 Score: 0.8811417943116095
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Client 3 F1 Score: 0.905981662024451
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Client 4 F1 Score: 0.912776140881875
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Client 5 F1 Score: 0.9161811416845413
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
Client 6 F1 Score: 0.9162984029251067
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Client 7 F1 Score: 0.9158287528870898
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Client 8 F1 Score: 0.91881712217981
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Client 9 F1 Score: 0.9176490232183685
[1m5

In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data from output.csv

data = pd.read_csv('output/output_binary.csv')

# Split features and labels
feature_df = data.drop(data.columns[:2], axis=1)
X = np.asarray(feature_df)
y = np.asarray(data['pii_exist'])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y))
y_train_one_hot = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_one_hot = tf.one_hot(y_test, depth=num_classes).numpy()

# Define privacy parameters
epsilon = 0.1 # Privacy budget
delta = 1e-5  # Desired overall privacy failure probability

# Define SVM model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_hinge',
              metrics=['accuracy'])

# Define a function for training on each client's data with differential privacy
def train_on_client_dp(X, y, model, epsilon, delta):
    # Compute the scale parameter for Gaussian noise
    delta_prime = delta / (2 * len(X) / 32)  # Assuming batch size of 32
    c = np.sqrt(2 * np.log(1.25 / delta_prime))
    sensitivity = 2 *c
    sigma =  c * sensitivity / epsilon

    model.compile(optimizer='adam',
                  loss='categorical_hinge',
                  metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=32, verbose=0)

    # Add noise to the gradients
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.Dense):
            for weight in layer.trainable_variables:
                noise = tf.random.normal(shape=weight.shape, stddev=sigma)
                weight.assign_add(noise)

    return model

# Federated learning loop with differential privacy
client_models = []  # List to store models of individual clients
for i in range(10):  # 10 clients
    # Divide the training data into 10 parts
    start_index = int(i * len(X_train) / 10)
    end_index = int((i + 1) * len(X_train) / 10)
    X_client_train = X_train[start_index:end_index]
    y_client_train = y_train_one_hot[start_index:end_index]

    # Train client model on its data with differential privacy
    client_model = train_on_client_dp(X_client_train, y_client_train, model, epsilon, delta)
    client_models.append(client_model)  # Store the trained model for aggregation

    # Evaluate client model on the test set
    y_pred = np.argmax(client_model.predict(X_test), axis=1)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Client", i+1, "F1 Score:", f1)

# Aggregate weights of all client models to update the global model
global_model = tf.keras.models.clone_model(model)  # Create a copy of the original model
for client_model in client_models:
    for global_layer, client_layer in zip(global_model.layers, client_model.layers):
        global_layer_weights = global_layer.get_weights()
        client_layer_weights = client_layer.get_weights()
        aggregated_weights = [(w1 + w2) / 2 for w1, w2 in zip(global_layer_weights, client_layer_weights)]
        global_layer.set_weights(aggregated_weights)

# Predict classes using the global model
y_pred = np.argmax(global_model.predict(X_test), axis=1)

# Calculate confusion matrix and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Client 1 F1 Score: 0.5206869933937553
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Client 2 F1 Score: 0.5236772362907769
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Client 3 F1 Score: 0.4977901295674495
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Client 4 F1 Score: 0.46233568298723565
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Client 5 F1 Score: 0.46718949213769656
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Client 6 F1 Score: 0.4729114059991899
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Client 7 F1 Score: 0.482847953027354
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Client 8 F1 Score: 0.6247292059627126
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Client 9 F1 Score: 0.51254131

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data from output.csv
data = pd.read_csv('/media/jay/Windows/Users/jay/Downloads/nit_research/output/output_1.csv')

X = np.asarray(data.drop(data.columns[:2], axis=1))
y = np.asarray(data['pii_exist'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
Xg_train, Xg_test, yg_train, yg_test = train_test_split(X_scaled, y, test_size=0.2, random_state=101)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y))
y_train_one_hot = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_one_hot = tf.one_hot(y_test, depth=num_classes).numpy()

# Define privacy parameters
epsilon = 100  # Privacy budget
delta = 1e-2   # Desired overall privacy failure probability

# Create a simple MLP model for classification
model = tf.keras.Sequential([
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Define a function for training on each client's data with differential privacy
def train_on_client_dp(X, y, model, epsilon, delta):
    # Compute the scale parameter for Gaussian noise
    delta_prime = delta / (2 * len(X) / 32)  # Assuming batch size of 32
    c = np.sqrt(2 * np.log(1.25 / delta_prime))
    sensitivity = 2 * c
    sigma = c * sensitivity / epsilon

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=32, verbose=0)

    # Add noise to the gradients
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.Dense):
            for weight in layer.trainable_variables:
                noise = tf.random.normal(shape=weight.shape, stddev=sigma)
                weight.assign_add(noise)

    return model

# Define the global aggregator function
def global_aggregator(global_model, client_model):
    """
    Aggregate weights of the client model onto the global model.

    Parameters:
        global_model (tf.keras.Model): Global model to be updated.
        client_model (tf.keras.Model): Client model whose weights are to be aggregated onto the global model.

    Returns:
        tf.keras.Model: Updated global model with aggregated weights.
    """
    for global_layer, client_layer in zip(global_model.layers, client_model.layers):
        global_layer_weights = global_layer.get_weights()
        client_layer_weights = client_layer.get_weights()
        aggregated_weights = [(w1 + w2) / 2 for w1, w2 in zip(global_layer_weights, client_layer_weights)]
        global_layer.set_weights(aggregated_weights)

    return global_model

# Federated learning loop with differential privacy
global_model = tf.keras.models.clone_model(model)  # Create a copy of the original model
noclient = 10
for i in range(noclient):  # 10 clients
    # Divide the training data into parts for each client
    start_index = int(i * len(X_train) / noclient)
    end_index = int((i + 1) * len(X_train) / noclient)
    X_client_train = X_train[start_index:end_index]
    y_client_train = y_train_one_hot[start_index:end_index]

    # Train client model on its data with differential privacy
    client_model = train_on_client_dp(X_client_train, y_client_train, global_model, epsilon, delta)

    # Aggregate weights of the client model onto the global model
    global_model = global_aggregator(global_model, client_model)

    # Evaluate client model on the test set
    y_pred = np.argmax(client_model.predict(X_test), axis=1)
    yg_pred = np.argmax(global_model.predict(Xg_test), axis=1)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1g = f1_score(yg_test, yg_pred, average='weighted')
    print("Client", i+1, "F1 Score:", f1, "F1 Global", f1g)

# Predict classes using the global model
y_pred = np.argmax(global_model.predict(X_test), axis=1)

# Calculate confusion matrix and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)


2024-05-01 19:07:42.229672: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-01 19:07:42.394952: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 19:07:42.395075: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 19:07:42.395174: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-01 19:07:42.420425: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-01 19:07:42.424038: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

Client 1 F1 Score: 0.6377482454830521 F1 Global 0.670362505502314
Client 2 F1 Score: 0.7310972871617593 F1 Global 0.7287413102474406
Client 3 F1 Score: 0.809917591150839 F1 Global 0.7846549082759466
Client 4 F1 Score: 0.8529281767955801 F1 Global 0.8380146014206787
Client 5 F1 Score: 0.8624697695476832 F1 Global 0.8380146014206787
Client 6 F1 Score: 0.8996868036198794 F1 Global 0.8561945059166903
Client 7 F1 Score: 0.9022354200095724 F1 Global 0.8687301873188021
Client 8 F1 Score: 0.9151983817226998 F1 Global 0.8743993713067694
Client 9 F1 Score: 0.9022354200095724 F1 Global 0.8792317723332979
Client 10 F1 Score: 0.9069565852522431 F1 Global 0.8844825648405459
Confusion Matrix:
[[239  31]
 [  4  88]]
F1 Score: 0.9069565852522431
