# In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the binary-labelled dataset from ../output/output_binary.csv
data = pd.read_csv('../output/output_binary.csv')

# Features are every column except the first two; the label is 'pii_exist'.
# NOTE(review): if 'pii_exist' is not one of the first two columns it is still
# part of the feature matrix and leaks the label -- confirm the CSV layout.
feature_df = data.drop(data.columns[:2], axis=1)
X = np.asarray(feature_df)
y = np.asarray(data['pii_exist'])

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. The row assignment depends only on the number of samples and
# random_state, so it is the same partition as splitting the scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later code referring to the fully scaled matrix still works;
# it now uses the training-set statistics.
X_scaled = scaler.transform(X)

# Convert labels to one-hot encoding.
# Assumes the labels are integers 0..num_classes-1 -- TODO confirm.
num_classes = len(np.unique(y))
y_train_one_hot = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_one_hot = tf.one_hot(y_test, depth=num_classes).numpy()

# Define function to create model
def create_model(num_classes):
    """Return a fresh single-layer softmax classifier.

    The network is one ``Dense`` layer with ``num_classes`` units and a
    softmax activation. No input shape is declared here, so Keras defers
    creating the layer's weights until the model is built or first called
    on data.

    Args:
        num_classes: Number of output units (one per class).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    classifier = tf.keras.Sequential()
    classifier.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
    return classifier

# Simulated federation: one separately constructed model per client.
num_clients = 10
client_models = []
for _ in range(num_clients):
    client_models.append(create_model(num_classes))

# Define a function for training on each client's data
def train_on_client(X, y, model, *, epochs=10, batch_size=32, verbose=0):
    """Compile ``model`` and fit it on one client's local data.

    Args:
        X: Feature matrix for this client.
        y: Labels aligned with ``X``; the loss is categorical cross-entropy,
            so they are expected to be one-hot encoded.
        model: Object exposing Keras-style ``compile`` and ``fit`` methods.
            It is trained in place.
        epochs: Number of local passes over the client's data.
        batch_size: Mini-batch size forwarded to ``fit``.
        verbose: Verbosity forwarded to ``fit`` (0 keeps training silent).

    Returns:
        The same ``model`` instance that was passed in, after fitting.
    """
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

# Fit every client on its own random sample of the training set. With
# test_size=0.8 each client keeps the remaining ~20% of the rows. Samples are
# drawn independently per client (seeded by the client index), so the subsets
# of different clients may overlap.
for client_id, client_model in enumerate(client_models):
    client_split = train_test_split(
        X_train, y_train_one_hot, test_size=0.8, random_state=client_id
    )
    # train_test_split returns (X_train, X_test, y_train, y_test); only the
    # "train" halves are used as this client's local data.
    X_client_train, y_client_train = client_split[0], client_split[2]
    client_models[client_id] = train_on_client(X_client_train, y_client_train, client_model)

# Aggregate the trained client models into the global model by averaging
# their weights.
global_model = create_model(num_classes)

# create_model declares no input shape, so the new model has no weight tensors
# yet. Build it with the feature dimension so set_weights() has variables to
# assign; otherwise nothing from the clients would reach the global model.
global_model.build(input_shape=(None, X_train.shape[1]))

# Element-wise mean over clients, one entry per weight tensor (kernel, bias).
# Every client contributes equally and the global model's random initial
# weights are not mixed into the result.
client_weight_sets = [client.get_weights() for client in client_models]
averaged_weights = [
    np.mean(per_client_tensors, axis=0)
    for per_client_tensors in zip(*client_weight_sets)
]
global_model.set_weights(averaged_weights)

# Attach optimizer, loss and metric to the global model. Nothing below trains
# it; it is only used for prediction.
global_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Per-class scores for the held-out rows -> index of the highest-scoring class.
class_probabilities = global_model.predict(X_test)
y_pred = np.argmax(class_probabilities, axis=1)

# Compare predictions with the integer test labels.
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:", conf_matrix, sep="\n")
print("F1 Score:", f1)
