In [None]:
import numpy as np
from sklearn.datasets import load_svmlight_file

# Read the phishing dataset (svmlight / libsvm text format) and densify the
# feature matrix so it can be indexed and serialised like a plain ndarray.
data_phising_X, data_phising_Y = load_svmlight_file(r'./data/phishing')
data_phising_X = data_phising_X.toarray()

# Last expression of the cell: display the label vector.
data_phising_Y

array([0., 0., 0., ..., 0., 0., 0.], shape=(11055,))

In [None]:
import numpy as np
from sklearn.datasets import load_svmlight_file

# Read the w8a dataset (svmlight / libsvm text format) and densify the
# feature matrix so it can be indexed and serialised like a plain ndarray.
data_w8a_X, data_w8a_Y = load_svmlight_file(r'./data/w8a')
data_w8a_X = data_w8a_X.toarray()

# Last expression of the cell: display the label vector.
data_w8a_Y


array([-1., -1., -1., ..., -1., -1., -1.], shape=(49749,))

In [15]:
def partition_data(dataset_X, dataset_Y, num_clients, alpha, rng=None):
    """Split sample indices across clients using per-class Dirichlet proportions.

    For every distinct label value found in ``dataset_Y`` a proportion vector
    over the clients is drawn from a symmetric Dirichlet distribution with
    concentration ``alpha``. The (shuffled) indices of that label are then cut
    into ``num_clients`` consecutive chunks whose sizes follow the drawn
    proportions, and chunk ``k`` is given to client ``k``.

    Args:
        dataset_X: Sequence of samples; only ``len(dataset_X)`` is used.
        dataset_Y: One label per sample. Arbitrary label values are supported
            (for example ``{0, 1}`` as well as ``{-1, +1}``).
        num_clients: Number of clients to partition the indices over.
        alpha: Dirichlet concentration parameter, repeated for every client.
        rng: Optional random source exposing ``dirichlet`` and ``shuffle``
            (e.g. ``np.random.default_rng(seed)``). When ``None`` the global
            ``np.random`` state is used, as before.

    Returns:
        A list with ``num_clients`` entries; entry ``k`` is the list of sample
        indices assigned to client ``k``. A client may receive no indices.
    """
    rng = np.random if rng is None else rng

    data_indices = np.arange(len(dataset_X))
    targets = np.asarray(dataset_Y)
    # Use the real label values: labels are not guaranteed to be 0..K-1
    # (comparing against the enumerate index would drop e.g. every -1 sample).
    class_labels = np.unique(targets)

    # One row of client proportions per class.
    class_distribution = rng.dirichlet(alpha=[alpha] * num_clients, size=len(class_labels))

    client_data_indices = [[] for _ in range(num_clients)]
    for class_label, class_dist in zip(class_labels, class_distribution):
        class_indices = data_indices[targets == class_label]
        rng.shuffle(class_indices)
        # Cumulative proportions (last one omitted) become the cut positions.
        cut_points = [int(np.round(val)) for val in np.cumsum(class_dist[:-1]) * len(class_indices)]
        split_indices = np.array_split(class_indices, cut_points)
        for client_idx, client_indices in enumerate(split_indices):
            client_data_indices[client_idx].extend(client_indices)

    return client_data_indices

def assign_data_to_clients_niid(train_dataset_X, train_dataset_Y, no_of_clients, alpha):
    """Build one ``(X, Y)`` pair per client from a non-IID index partition.

    The index lists come from ``partition_data``; each list is used to select
    the matching rows of ``train_dataset_X`` and entries of ``train_dataset_Y``.

    Args:
        train_dataset_X: Indexable feature container (indexed with a list of
            sample indices).
        train_dataset_Y: Indexable label container, aligned with the features.
        no_of_clients: Number of clients, forwarded to ``partition_data``.
        alpha: Concentration parameter, forwarded to ``partition_data``.

    Returns:
        A list of ``(X_subset, Y_subset)`` tuples, one per client.
    """
    per_client_indices = partition_data(train_dataset_X, train_dataset_Y, no_of_clients, alpha)

    client_datasets = []
    for sample_ids in per_client_indices:
        client_datasets.append((train_dataset_X[sample_ids], train_dataset_Y[sample_ids]))
    return client_datasets

In [19]:
def make_json(client_dataset):
    """Turn a sequence of ``(X, y)`` client pairs into a ``user<k>`` dictionary.

    Clients whose ``X`` has zero rows (``X.shape[0] == 0``) are skipped. The
    remaining clients are numbered consecutively from 0, in input order.

    Args:
        client_dataset: Iterable of ``(X, y)`` pairs where ``X`` has a
            ``shape`` attribute.

    Returns:
        A dict mapping ``'user0'``, ``'user1'``, ... to ``{'X': X, 'Y': y}``.
    """
    non_empty_clients = [
        (features, labels)
        for features, labels in client_dataset
        if features.shape[0] != 0
    ]
    return {
        'user' + str(position): {'X': features, 'Y': labels}
        for position, (features, labels) in enumerate(non_empty_clients)
    }

In [20]:
import json
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert NumPy objects to plain Python equivalents.

        Args:
            obj: The object the standard encoder could not serialise.

        Returns:
            A nested list for ``np.ndarray`` or a Python scalar for NumPy
            scalar types (``np.int64``, ``np.float32``, ``np.bool_``, ...).

        Raises:
            TypeError: For any other unsupported type (raised by the base
                class implementation).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert ndarray to (nested) list
        if isinstance(obj, np.generic):
            # NumPy scalars are not JSON serialisable on their own.
            return obj.item()
        return super().default(obj)

## Create non-iid federated data

### phishing data

In [24]:
# Partition the phishing data over 100 clients (alpha = 0.5); the first 70
# clients form the training pool and the remaining ones the test pool.
phishing_client_datas = assign_data_to_clients_niid(
    data_phising_X, data_phising_Y, 100, 0.5
)
phishing_client_data_train, phishing_client_data_test = (
    phishing_client_datas[:70],
    phishing_client_datas[70:],
)

In [25]:
# Convert both client pools into ``user<k>`` dictionaries (empty clients are
# skipped by make_json).
client_train_dict_phishing, client_test_dict_phishing = (
    make_json(phishing_client_data_train),
    make_json(phishing_client_data_test),
)

In [26]:
import json
# Write the non-IID phishing train/test dictionaries to JSON files.
for out_path, payload in (
    ('fed_phishing_train_niid.json', client_train_dict_phishing),
    ('fed_phishing_test_niid.json', client_test_dict_phishing),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, cls=NumpyEncoder)

### w8a data

In [28]:
import json

# Partition the w8a data over 100 clients (alpha = 0.5); the first 70 clients
# form the training pool and the remaining ones the test pool.
w8a_client_datas = assign_data_to_clients_niid(data_w8a_X, data_w8a_Y, 100, 0.5)
w8a_client_data_train, w8a_client_data_test = (
    w8a_client_datas[:70],
    w8a_client_datas[70:],
)

# Convert both pools into ``user<k>`` dictionaries.
client_train_dict_w8a = make_json(w8a_client_data_train)
client_test_dict_w8a = make_json(w8a_client_data_test)

# Write the non-IID w8a train/test dictionaries to JSON files.
for out_path, payload in (
    ('fed_w8a_train_niid.json', client_train_dict_w8a),
    ('fed_w8a_test_niid.json', client_test_dict_w8a),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, cls=NumpyEncoder)

## Create iid federated data

In [19]:
def assign_data_to_clients_iid(train_dataset_X, train_dataset_Y, no_of_clients=40):
    """Split a dataset into equally sized, contiguous per-client chunks.

    Client ``k`` receives samples ``k * n`` to ``(k + 1) * n - 1`` where
    ``n = len(train_dataset_Y) // no_of_clients``. Samples are taken in their
    stored order (no shuffling is done here), and the trailing
    ``len(train_dataset_Y) % no_of_clients`` samples are not assigned to any
    client so that all clients have the same size.

    Args:
        train_dataset_X: Feature array, indexable with an integer index array.
        train_dataset_Y: Label array aligned with ``train_dataset_X``.
        no_of_clients: Number of clients to create. Defaults to 40, the value
            that used to be hard-coded.

    Returns:
        A list of ``no_of_clients`` tuples ``(X_subset, Y_subset)``.

    Raises:
        ValueError: If ``no_of_clients`` is not positive.
    """
    if no_of_clients <= 0:
        raise ValueError("no_of_clients must be a positive integer")

    num_samples = len(train_dataset_Y) // no_of_clients

    client_datasets = []
    for client_idx in range(no_of_clients):
        first = client_idx * num_samples
        # Index-array selection (rather than a slice) keeps each client's data
        # an independent copy of the source arrays.
        indices = np.arange(first, first + num_samples)
        client_datasets.append((train_dataset_X[indices], train_dataset_Y[indices]))
    return client_datasets

In [20]:
# Split the phishing data into equally sized per-client chunks.
client_datas_iid = assign_data_to_clients_iid(
    data_phising_X,
    data_phising_Y,
)

In [21]:
# Disjoint client split: the first 26 clients are used for training and the
# remaining clients for testing. Taking every client for training would put
# each test client's data in the training set as well.
train_client_dataset = client_datas_iid[:26]
test_client_dataset = client_datas_iid[26:]

In [24]:
# Convert both client pools into ``user<k>`` dictionaries (empty clients are
# skipped by make_json).
client_train_dict, client_test_dict = (
    make_json(train_client_dataset),
    make_json(test_client_dataset),
)

In [27]:
import json
# Write the IID phishing train/test dictionaries to JSON files.
for out_path, payload in (
    ('fed_phishing_train_iid.json', client_train_dict),
    ('fed_phishing_test_iid.json', client_test_dict),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, cls=NumpyEncoder)