In [1]:
import json,os

import pandas as pd
import torch
import copy
import numpy as np

from fedavg.server import Server
from fedavg.client import Client
from fedavg.models import CNN_Model,weights_init_normal, ReTrainModel,MLP
from utils import get_data

from collections import Counter
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def min_max_norm(train_datasets, test_dataset, cat_columns, label):
    """Min-max scale continuous columns and one-hot encode discrete ones.

    Every client partition and the test set are stacked into one frame so that
    they share a single scaler and a single set of dummy columns. The stacked
    frame is then cut back into its pieces by row position, so no helper
    column is added and client keys may be of any hashable type and order.

    Args:
        train_datasets: dict mapping a client key to that client's DataFrame.
            The dict entries are replaced by the transformed frames; the
            DataFrames passed in are not modified.
        test_dataset: DataFrame with the test split (not modified).
        cat_columns: list of discrete column names handed to
            ``pd.get_dummies``.
        label: name of the label column; it is neither scaled nor encoded.

    Returns:
        Tuple ``(train_datasets, test_dataset)``: the same dict object, now
        holding the transformed frames, and the transformed test frame.

    Note:
        The scaler is fitted on train and test rows together, so the test
        rows influence the scaling applied to the training features.
    """
    client_keys = list(train_datasets.keys())
    frames = [train_datasets[key] for key in client_keys]
    frames.append(test_dataset)

    # One concat instead of growing a frame inside the loop.
    data = pd.concat(frames)

    # select continuous columns: everything that is neither discrete nor the label
    con = [c for c in data.columns if c not in cat_columns and c != label]

    # Skip scaling when there is nothing to scale (e.g. all columns discrete).
    if con:
        data[con] = MinMaxScaler().fit_transform(data[con])

    # one-hot encode discrete columns
    data = pd.get_dummies(data, columns=cat_columns)

    # Rows keep the order in which the frames were stacked, so each piece is
    # recovered from consecutive positional slices.
    start = 0
    for key, frame in zip(client_keys, frames):
        stop = start + len(frame)
        train_datasets[key] = data.iloc[start:stop].copy()
        start = stop

    # Whatever follows the last client partition is the test split.
    test_dataset = data.iloc[start:].copy()

    return train_datasets, test_dataset

In [3]:
def model_init(conf, train_datasets, test_dataset, device):
    """Build the global model, the server and one client per training partition.

    Args:
        conf: configuration dict. Keys read here: ``is_init_avg``,
            ``model_name``, ``num_classes``, ``which_dataset`` and
            ``model_dir``.
        train_datasets: dict mapping a client key to that client's data.
        test_dataset: test data handed to the server; its column count
            determines the MLP input width.
        device: device identifier forwarded to ``Server``, ``Client`` and
            ``model.cuda``.

    Returns:
        Tuple ``(clients, server, client_weight)`` where ``clients`` maps each
        client key to a ``Client`` and ``client_weight`` maps each client key
        to its aggregation weight (empty when ``conf["is_init_avg"]`` is
        falsy).

    Raises:
        ValueError: if ``conf["model_name"]`` is neither ``"mlp"`` nor
            ``"cnn"``.
    """
    # Aggregation weight of every client node: an even share when requested.
    client_weight = {}
    if conf["is_init_avg"]:
        client_weight = {key: 1 / len(train_datasets) for key in train_datasets.keys()}
    print("各节点的聚合权值为：", client_weight)

    # Build the model to be trained.
    model_name = conf['model_name']
    if model_name == "mlp":
        # One column is excluded from the inputs (presumably the label).
        n_input = test_dataset.shape[1] - 1
        model = MLP(n_input, 512, conf["num_classes"][conf['which_dataset']])
    elif model_name == 'cnn':
        model = CNN_Model()
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "unsupported model_name {!r}; expected 'mlp' or 'cnn'".format(model_name))
    model.apply(weights_init_normal)

    if torch.cuda.is_available():
        model.cuda(device=device)

    server = Server(conf, model, test_dataset, device)
    print("Server init finish!")

    # Every client starts from its own copy of the global model.
    clients = {}
    for key in train_datasets.keys():
        clients[key] = Client(conf, copy.deepcopy(server.global_model), train_datasets[key], device)
    print("Clients init finish！")

    # Make sure the checkpoint directory exists, including missing parents
    # (a nested path such as "./save_model/clinical" breaks plain os.mkdir).
    os.makedirs(conf["model_dir"], exist_ok=True)

    return clients, server, client_weight

In [4]:
def _save_best_models(model_dir, server, clients, global_name, local_names):
    """Write the global model's and every client's state dict into ``model_dir``."""
    torch.save(server.global_model.state_dict(), os.path.join(model_dir, global_name))
    for key in clients.keys():
        torch.save(clients[key].local_model.state_dict(),
                   os.path.join(model_dir, local_names[key]))


def train_and_eval(clients, server, client_weight, conf=None):
    """Run federated-averaging rounds and checkpoint the best global model.

    Each round trains every client from the current global model, aggregates
    the resulting models on the server and evaluates the aggregated model.
    The tracked score is ``auc_roc`` when the active dataset has two classes
    and ``acc`` otherwise; models are saved whenever that score beats the best
    one seen so far.

    Args:
        clients: dict mapping a client key to an object exposing
            ``local_train(global_model)`` and ``local_model``.
        server: object exposing ``global_model``, ``model_aggregate`` and
            ``model_eval`` (which returns ``(acc, loss, auc_roc)``).
        client_weight: aggregation weights forwarded to
            ``server.model_aggregate``.
        conf: configuration dict (keys read: ``global_epochs``,
            ``num_classes``, ``which_dataset``, ``model_dir``). When omitted,
            the module-level ``conf`` is used, which is what this function
            relied on before the parameter existed.

    Returns:
        Tuple ``(max_score, loss_list, score_list)``: best score, per-round
        losses and per-round scores.
    """
    if conf is None:
        # Backward compatibility for callers that pass three arguments.
        conf = globals()["conf"]

    is_binary = conf['num_classes'][conf['which_dataset']] == 2

    max_score = 0
    # Stays None when no round ever beats the initial score of 0 (or when
    # there are no rounds), so the summary print below cannot fail.
    maxe = None
    score_list = []
    loss_list = []

    for e in range(conf["global_epochs"]):

        # Local training on every client, starting from the global model.
        clients_models = {}
        for key in clients.keys():
            model_k = clients[key].local_train(server.global_model)
            clients_models[key] = copy.deepcopy(model_k)

        # fed_avg aggregation
        server.model_aggregate(clients_models, client_weight)

        # evaluate global model
        acc, loss, auc_roc = server.model_eval()
        loss_list.append(loss)

        if is_binary:
            score = auc_roc
            print("Epoch %d, global_loss: %f, auc_roc: %f" % (e, loss, auc_roc))
            # Binary runs keep a single "best" checkpoint that is overwritten.
            global_name = "global-model.pth"
            local_names = {key: "local-model{}.pth".format(key) for key in clients.keys()}
        else:
            score = acc
            print("Epoch %d, global_loss: %f, acc: %f" % (e, loss, acc))
            # Multi-class runs keep one checkpoint per improving epoch.
            global_name = "globalmodel-epoch{}.pth".format(e)
            local_names = {key: "local-model{}-epoch{}.pth".format(key, e)
                           for key in clients.keys()}
        score_list.append(score)

        # save best model
        if score > max_score:
            _save_best_models(conf["model_dir"], server, clients, global_name, local_names)
            max_score = score
            maxe = e

    print('max score = {0}, epoch = {1}'.format(max_score, maxe))

    return max_score, loss_list, score_list

In [5]:
def base_train(conf, dataset_name, b, clients_num, path, label_name, train_gpu):
    """Train the federated baseline on the partitioned data, without augmentation.

    Reads one CSV per client from ``<path>b=<b>/<label_name>_<i>.csv`` and the
    test split from ``<path><dataset_name>_test.csv``, normalises them with
    ``min_max_norm``, builds server and clients with ``model_init`` and runs
    ``train_and_eval``.

    Returns:
        Tuple ``(max_score, loss_list, score_list)`` as produced by
        ``train_and_eval``.
    """
    partition_dir = path + "b={}/".format(b)
    train_files_path_list = []
    for client_idx in range(clients_num):
        train_files_path_list.append(partition_dir + label_name + "_{}.csv".format(client_idx))
    print("path of partition data:\n" + str(train_files_path_list))

    # Load every client partition and report its label distribution.
    train_datasets = {}
    for client_idx, csv_path in enumerate(train_files_path_list):
        client_frame = pd.read_csv(csv_path)
        train_datasets[client_idx] = client_frame
        print(client_frame[label_name].value_counts())

    test_dataset = pd.read_csv(path + '{}_test.csv'.format(dataset_name))
    print("shape of test dataset: " + str(test_dataset.shape))

    # Scale continuous columns and one-hot encode the discrete ones.
    discrete_cols = conf['discrete_columns'][dataset_name]
    label_col = conf['label_column']
    train_datasets, test_dataset = min_max_norm(
        train_datasets, test_dataset, discrete_cols, label_col)

    clients, server, client_weight = model_init(conf, train_datasets, test_dataset, train_gpu)

    best_score, losses, scores = train_and_eval(clients, server, client_weight)
    return best_score, losses, scores

In [6]:
def get_random_data(syn_data, aug_numbers,label, ratio, random_state=None):
    """Select augmentation rows from synthetic data, label by label.

    For each label ``i`` in ``range(len(aug_numbers))`` the synthetic rows of
    that label are used only when there are at least ``aug_numbers[i]`` of
    them. ``aug_numbers[i]`` is a threshold only: the number of rows drawn is
    ``int(ratio * <rows available for the label>)``.

    Args:
        syn_data: DataFrame of synthetic rows containing the ``label`` column.
        aug_numbers: per-label minimum number of synthetic rows required.
        label: name of the label column.
        ratio: fraction of the available rows of a label to sample.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps unseeded sampling.

    Returns:
        DataFrame with the sampled rows of all accepted labels, or ``None``
        when no label had enough synthetic data.
    """
    picked = []
    for i in range(len(aug_numbers)):

        aug_i = syn_data[syn_data[label] == i]
        if aug_i.shape[0] >= aug_numbers[i]:
            n_rows = int(ratio * len(aug_i))
            picked.append(aug_i.sample(n_rows, random_state=random_state))
        else:
            print('label {} has no enough synthetic data'.format(i))

    if not picked:
        return None

    # Concatenate once rather than re-copying the accumulated frame per label.
    return pd.concat(picked)
        
def random_aug(train_datasets, path, dataset_name, label, label_num, ratio, aug_type='same_number'):
    """Augment every client partition with randomly selected synthetic rows.

    A per-client label histogram is built first. For each client, the
    threshold passed to ``get_random_data`` for a label is the number of rows
    of that label held by all *other* clients (total count minus own count).
    Synthetic rows are read from ``<path>/<dataset_name>_syn_<key>.csv``.

    Args:
        train_datasets: dict mapping a client key to its DataFrame; entries
            are replaced in place by the augmented frames.
        path: directory holding the synthetic CSV files.
        dataset_name: dataset prefix used in the synthetic file names.
        label: name of the label column.
        label_num: number of labels; labels are the integers
            ``0 .. label_num - 1``.
        ratio: sampling ratio forwarded to ``get_random_data``.
        aug_type: augmentation strategy; only ``'same_number'`` does
            anything, any other value leaves the data unchanged.

    Returns:
        The ``train_datasets`` dict.
    """
    client_keys = list(train_datasets.keys())

    # Row r of the histogram belongs to client_keys[r].
    labels_dis = np.array([
        [len(train_datasets[key][train_datasets[key][label] == i]) for i in range(label_num)]
        for key in client_keys
    ])
    print(labels_dis)
    total_dis = np.sum(labels_dis, axis=0)
    print(total_dis)
    aug_numbers = total_dis - labels_dis

    if aug_type == 'same_number':

        # Index the histogram by position, not by dict key, so that client
        # keys need not be the integers 0..n-1 in iteration order.
        for row, key in enumerate(client_keys):
            syn_data = pd.read_csv('{0}/{1}_syn_{2}.csv'.format(path, dataset_name, key))
            aug_data = get_random_data(syn_data, aug_numbers[row],label, ratio)
            train_datasets[key] = pd.concat([train_datasets[key], aug_data])
            print(train_datasets[key].shape)

    return train_datasets

In [8]:
def augment_train(conf, dataset_name, b, clients_num, path, label_name, label_num, augment_path, ratio, train_gpu):
    """Train the federated model after augmenting each client with synthetic data.

    Reads one CSV per client from ``<path>b=<b>/<label_name>_<i>.csv`` and the
    test split from ``<path><dataset_name>_test.csv``, augments the client
    partitions with ``random_aug``, normalises everything with
    ``min_max_norm``, builds server and clients with ``model_init`` and runs
    ``train_and_eval``.

    Returns:
        Tuple ``(max_score, loss_list, score_list)`` as produced by
        ``train_and_eval``.
    """
    partition_dir = path + "b={}/".format(b)
    train_files_path_list = []
    for client_idx in range(clients_num):
        train_files_path_list.append(partition_dir + label_name + "_{}.csv".format(client_idx))
    print("path of partition data:\n" + str(train_files_path_list))

    # Load every client partition and report its label distribution.
    train_datasets = {}
    for client_idx, csv_path in enumerate(train_files_path_list):
        client_frame = pd.read_csv(csv_path)
        train_datasets[client_idx] = client_frame
        print(client_frame[label_name].value_counts())

    test_dataset = pd.read_csv(path + '{}_test.csv'.format(dataset_name))
    print("shape of test dataset: " + str(test_dataset.shape))

    # Add synthetic rows to each client before any normalisation happens.
    train_datasets = random_aug(
        train_datasets, augment_path, dataset_name, label_name, label_num, ratio)

    # Scale continuous columns and one-hot encode the discrete ones.
    discrete_cols = conf['discrete_columns'][dataset_name]
    label_col = conf['label_column']
    train_datasets, test_dataset = min_max_norm(
        train_datasets, test_dataset, discrete_cols, label_col)

    clients, server, client_weight = model_init(conf, train_datasets, test_dataset, train_gpu)

    best_score, losses, scores = train_and_eval(clients, server, client_weight)
    return best_score, losses, scores

# Clinical

In [10]:
# Experiment configuration shared by the training helpers in this notebook.
conf = {
    # Kind of data being trained on: "tabular" or "image".
    "data_type": "tabular",

    # Model to train (original note: choose from mlp, simple-cnn, vgg).
    "model_name": "mlp",

    # Left empty here (original note: fed_ccvr).
    "no-iid": "",

    # Number of global rounds.
    "global_epochs": 100,

    "local_epochs": 1,

    # Dirichlet parameter.
    "beta": 0.05,

    "batch_size": 65,

    "weight_decay": 1e-5,

    # Learning rate.
    "lr": 0.001,

    "momentum": 0.9,

    "num_parties": 5,

    # Whether every client gets the same aggregation weight.
    "is_init_avg": True,

    # Share of the training data held out for evaluation.
    "split_ratio": 0.2,

    # Name of the column used as the label.
    "label_column": "label",

    # Directory where models are saved.
    "model_dir": "./save_model/clinical",

    # File name of the saved model.
    "model_file": "model.pth",

    # Dataset used by the current run; selects the entry of "num_classes".
    "which_dataset": "clinical",

    # Number of classes per dataset.
    "num_classes": {
        "clinical": 2,
        "credit": 2,
        "tb": 2,
        "covtype": 7,
        "intrusion": 10,
    },

    # Discrete (one-hot encoded) columns per dataset.
    "discrete_columns": {
        "adult": [
            "workclass",
            "education",
            "marital_status",
            "occupation",
            "relationship",
            "race",
            "gender",
            "native_country",
        ],
        "intrusion": ["protocol_type", "service", "flag"],
        "credit": [],
        # Wilderness_Area 4, 1, 2, 3 followed by Soil_Type 40, 1, 2, ..., 39.
        "covtype": (
            ["Wilderness_Area{}".format(i) for i in (4, 1, 2, 3)]
            + ["Soil_Type{}".format(i) for i in (40, *range(1, 40))]
        ),
        "clinical": ["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking"],
    },
}

In [12]:
# Run the augmented federated training on the clinical dataset; the returned
# (max_score, loss_list, score_list) tuple is displayed as the cell output.
augment_train(
    conf,
    dataset_name="clinical",
    b=0.05,
    clients_num=5,
    path="./data/clinical/",
    label_name="label",
    label_num=2,
    augment_path="./data/clinical/syn",
    ratio=1,
    train_gpu=6,
)

划分数据目录如下:
['./data/clinical/b=0.05/label_0.csv', './data/clinical/b=0.05/label_1.csv', './data/clinical/b=0.05/label_2.csv', './data/clinical/b=0.05/label_3.csv', './data/clinical/b=0.05/label_4.csv']
1    13
Name: label, dtype: int64
1    25
Name: label, dtype: int64
1    29
Name: label, dtype: int64
0    48
Name: label, dtype: int64
0    94
Name: label, dtype: int64
测试数据格式如下: (90, 13)
[[ 0 13]
 [ 0 25]
 [ 0 29]
 [48  0]
 [94  0]]
[142  67]
(1013, 13)
(1025, 13)
(1029, 13)
(1048, 13)
(1094, 13)
各节点的聚合权值为： {0: 0.2, 1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2}
Server初始化完成!
参与方初始化完成！


  torch.nn.init.normal(m.weight, 0.0, 0.01)
  torch.nn.init.constant(m.bias, 0.0)
  torch.nn.init.normal(m.weight, 1.0, 0.02)
  torch.nn.init.constant(m.bias, 0.0)


Epoch 0, global_loss: 0.015296, auc_roc: 0.746750
Epoch 1, global_loss: 0.014833, auc_roc: 0.965517
Epoch 2, global_loss: 0.013319, auc_roc: 0.960430
Epoch 3, global_loss: 0.012910, auc_roc: 0.946297
Epoch 4, global_loss: 0.012412, auc_roc: 0.934426
Epoch 5, global_loss: 0.012582, auc_roc: 0.962125
Epoch 6, global_loss: 0.012475, auc_roc: 0.967778
Epoch 7, global_loss: 0.012505, auc_roc: 0.986433
Epoch 8, global_loss: 0.012906, auc_roc: 0.972301
Epoch 9, global_loss: 0.012810, auc_roc: 0.929904
Epoch 10, global_loss: 0.012737, auc_roc: 0.949689
Epoch 11, global_loss: 0.012829, auc_roc: 0.952516
Epoch 12, global_loss: 0.012813, auc_roc: 0.928773
Epoch 13, global_loss: 0.012957, auc_roc: 0.932730
Epoch 14, global_loss: 0.012923, auc_roc: 0.964952
Epoch 15, global_loss: 0.012777, auc_roc: 0.944036
Epoch 16, global_loss: 0.013044, auc_roc: 0.945732
Epoch 17, global_loss: 0.013077, auc_roc: 0.955907
Epoch 18, global_loss: 0.013546, auc_roc: 0.955907
Epoch 19, global_loss: 0.013424, auc_roc:

(0.9864330130016958,
 [0.015296440654330784,
  0.014832589361402724,
  0.013318781057993572,
  0.012910016377766927,
  0.012411559952629938,
  0.012582037183973525,
  0.012474881278143989,
  0.01250477499432034,
  0.01290593147277832,
  0.012810476620992025,
  0.012736855612860785,
  0.012828654713100858,
  0.01281255880991618,
  0.012957139809926351,
  0.012922766473558214,
  0.012777119212680392,
  0.013044053978390164,
  0.013076702753702799,
  0.013545862833658854,
  0.01342389186223348,
  0.013364262051052518,
  0.013767980204688179,
  0.01379294925265842,
  0.013533686267005072,
  0.013549958335028754,
  0.013767984178331163,
  0.013870965109931098,
  0.013610484864976671,
  0.013854961925082737,
  0.013704878754085964,
  0.013958328300052218,
  0.013830855157640246,
  0.013847292794121637,
  0.013670367664761013,
  0.013830751842922635,
  0.014008408122592502,
  0.013876684506734212,
  0.013780172665913899,
  0.01384432315826416,
  0.013822139634026422,
  0.01409913698832194,
  