In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import shutil
from typing import Union
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

sys.path.append("../../../")

from examples.logistic_regression.adult.dataset import adult_csv_to_dataframe, adult_df_to_dataset
from examples.logistic_regression.model import LogisticRegressionTrainingModel, LogisticRegressionTrainingModelParams, LogisticRegressionEvaluationModelParams

In [None]:
class Fake:
    """Wrapper marking a client's share as data to be faked.

    Wrapping a fraction in ``Fake`` denotes that the data generated for it
    should be representative but not necessarily correct.

    Attributes are stored verbatim, without validation:
      fraction    -- the wrapped share (``slice_df`` multiplies it by the row count).
      attack_type -- keyword-only tag; ``slice_df`` handles "random" and "inverted".
    """

    def __init__(self, fraction, *, attack_type: str):
        self.fraction, self.attack_type = fraction, attack_type

def generate_rel_split(num_special_agents: int, num_normal_agents: int, special_agent_factor: float):
    """Build relative shares for a mix of "special" and "normal" agents.

    Every normal agent carries weight 1 and every special agent carries weight
    ``special_agent_factor``. Each weight is divided by the sum of all weights.

    Returns:
        A list holding the special agents' shares first, followed by the
        normal agents' shares (empty when both counts are zero).
    """
    total_weight = num_special_agents * special_agent_factor + num_normal_agents
    # Comprehensions evaluate the division lazily, so no division happens for a zero count.
    special_shares = [special_agent_factor / total_weight for _ in range(num_special_agents)]
    normal_shares = [1 / total_weight for _ in range(num_normal_agents)]
    return special_shares + normal_shares
    

# Maps a split name to the sequence of per-client shares of the shuffled dataset.
# Each entry is either a plain float (the client receives that fraction of the real records)
# or a Fake(...) wrapper (the client's slice of that size is replaced with attack data).
# The fractions within one split are expected to sum to 1.0.
SPLITS = {
    # symmetric and equal splits
    "sym_eq_1": (1.0,),  # one client with everything
    "sym_eq_3": [1/3] * 3,  # 3 clients with equal splits
    "sym_eq_5": [1/5] * 5,  # 5 clients with equal splits (was a duplicate "sym_eq_3" key that overwrote the entry above)
    "sym_eq_25": [1/25] * 25,  # 25 clients with equal splits
    "sym_eq_50": [1/50] * 50,  # 50 clients with equal splits
    "sym_eq_100": [1/100] * 100,  # 100 clients with equal splits

    # symmetric but unequal splits: 50 clients, 10 of which hold <factor> times the share of the other 40
    "sym_neq_50x0.125": generate_rel_split(10, 40, .125),
    "sym_neq_50x0.25": generate_rel_split(10, 40, .25),
    "sym_neq_50x0.5": generate_rel_split(10, 40, .5),
    "sym_neq_50x0.75": generate_rel_split(10, 40, .75),
    "sym_neq_50x1.5": generate_rel_split(10, 40, 1.5),
    "sym_neq_50x2": generate_rel_split(10, 40, 2),
    "sym_neq_50x4": generate_rel_split(10, 40, 4),


    # attack splits: 50 equally sized clients, the last 1 / 12 / 24 of which hold fake data
    "attack_random_50_one":       [1/50] * 49 + [Fake(1/50, attack_type='random')],
    "attack_random_50_quarter":   [1/50] * 38 + [Fake(1/50, attack_type='random')] * 12,
    "attack_random_50_half":      [1/50] * 26 + [Fake(1/50, attack_type='random')] * 24,
    "attack_inverted_50_one":     [1/50] * 49 + [Fake(1/50, attack_type='inverted')],
    "attack_inverted_50_quarter": [1/50] * 38 + [Fake(1/50, attack_type='inverted')] * 12,
    "attack_inverted_50_half":    [1/50] * 26 + [Fake(1/50, attack_type='inverted')] * 24,
}
VALIDATION_FRACTIONS = [0.2]  # put aside 20% of each client's records for validation; not swept over
# Root folder that receives every generated dataset file.
# NOTE(review): this looks like a placeholder path -- point it at a real location before running.
DATA_FOLDER = "/path/to/data/root/folder/adult"
os.makedirs(DATA_FOLDER, exist_ok=True)

In [None]:
# Read the two Adult files through the project helper, then persist each converted
# dataset under DATA_FOLDER (training/validation pool first, held-out test set second).
df = adult_csv_to_dataframe("adult.data")
df_test = adult_csv_to_dataframe("adult.test")
for source_df, dataset_filename in ((df, "train_validate.dat"), (df_test, "test.dat")):
    adult_df_to_dataset(source_df).save(os.path.join(DATA_FOLDER, dataset_filename))

In [None]:
# Seed the randomness so the generated splits are identical on every Restart & Run All:
# `random_state` pins the row shuffle, and the global NumPy seed pins the np.random
# draws that slice_df uses when it fabricates "random" attack data.
SHUFFLE_SEED = 0
np.random.seed(SHUFFLE_SEED)
df_shuffle = df.sample(frac=1, random_state=SHUFFLE_SEED)
# Preview only the first rows instead of rendering the whole frame.
df_shuffle.head()

In [None]:
# Sanity check: print the shuffled frame's summary (DataFrame.info) before it is sliced per client.
df_shuffle.info()

In [None]:
def _random_attack_df(real_df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with ``real_df``'s columns whose values are resampled per column.

    Numeric (non-boolean) columns are drawn from a normal distribution with the
    column's mean and standard deviation, so they come back as floats. Every
    other column is drawn, with replacement, from its observed values weighted
    by their frequencies. The result has a fresh default index.
    """
    size = len(real_df)
    if size == 0:
        # Nothing to fit a distribution on; np.random.choice rejects an empty population.
        return real_df.copy(deep=True)

    # Object dtype so float draws are never truncated into an integer array.
    fake_data = real_df.to_numpy(dtype=object, copy=True)
    for col_i in range(real_df.shape[1]):
        values = real_df.iloc[:, col_i]
        # Portable dtype test: comparing a dtype with the builtin `int` depends on the
        # platform's default integer width.
        is_numeric = pd.api.types.is_numeric_dtype(values.dtype) and not pd.api.types.is_bool_dtype(values.dtype)
        if is_numeric:
            mean = values.mean()
            std = values.std()
            if pd.isna(std):
                # A single (or single non-missing) record has no sample std; fall back to no spread.
                std = 0.0
            fake_data[:, col_i] = np.random.normal(mean, std, size=size)
        else:
            frequencies = values.value_counts(normalize=True)
            if frequencies.empty:
                # Column holds only missing values; keep it as is.
                continue
            # Draw positions rather than values so the original value objects are kept
            # (passing the values themselves to np.random.choice would coerce them to one array dtype).
            positions = np.random.choice(
                len(frequencies), size=size, replace=True, p=frequencies.to_numpy(dtype=float)
            )
            fake_data[:, col_i] = frequencies.index.to_numpy()[positions]
    return pd.DataFrame(data=fake_data, columns=real_df.columns).infer_objects()


def _inverted_attack_df(real_df: pd.DataFrame) -> pd.DataFrame:
    """Return a deep copy of ``real_df`` with the two 'income' labels swapped."""
    fake_df = real_df.copy(deep=True)
    # Compute both masks before writing so the swap needs no temporary sentinel label.
    is_low = fake_df['income'] == '<=50K'
    is_high = fake_df['income'] == '>50K'
    fake_df.loc[is_low, 'income'] = '>50K'
    fake_df.loc[is_high, 'income'] = '<=50K'
    return fake_df


def slice_df(df: pd.DataFrame, start_i: int, fraction: Union[float, Fake]):
    """Take the next contiguous slice of ``df`` for one client.

    Args:
        df: Frame to slice; rows are taken by position.
        start_i: Position of the first row of the slice.
        fraction: Share of ``len(df)`` the client receives. A plain number yields
            the real rows; a ``Fake`` yields attack data derived from the real rows
            of that share ("random": per-column resampling, "inverted": swapped
            'income' labels).

    Returns:
        ``(new_start_i, client_df)`` where ``new_start_i`` is the position right
        after the consumed rows, to be passed as ``start_i`` for the next client.

    Raises:
        ValueError: if ``fraction`` is a ``Fake`` with an unknown ``attack_type``.
    """
    is_fake = isinstance(fraction, Fake)
    share = fraction.fraction if is_fake else fraction
    num_records = int(share * len(df))  # truncates, so a few trailing rows may stay unassigned
    new_start_i = start_i + num_records
    real_df = df.iloc[start_i:new_start_i]
    if not is_fake:
        return (new_start_i, real_df)

    if fraction.attack_type == "random":
        fake_df = _random_attack_df(real_df)
    elif fraction.attack_type == "inverted":
        fake_df = _inverted_attack_df(real_df)
    else:
        raise ValueError(f"unknown attack_type: {fraction.attack_type!r}")
    return (new_start_i, fake_df)

In [None]:
# Write every configured split to disk: one folder per (split, validation fraction),
# holding one sub-folder per client with its train / validation / score datasets.
for split_name, split in SPLITS.items():
    # Resolve the output folder of each validation fraction once, and clear stale output.
    split_folders = [
        (validation_size, os.path.join(DATA_FOLDER, f"split_{split_name}_validation_fraction_{validation_size}"))
        for validation_size in VALIDATION_FRACTIONS
    ]
    for _, stale_folder in split_folders:
        shutil.rmtree(stale_folder, ignore_errors=True)

    i = 0  # cursor into df_shuffle; each client consumes the next contiguous slice
    for client_i, fraction in enumerate(split):
        assert i < len(df_shuffle)
        i, new_df = slice_df(df_shuffle, i, fraction)
        for validation_size, split_folder in split_folders:
            # The leading (1 - validation_size) share of the client's rows is used for training.
            train_size = int((1 - validation_size) * len(new_df))
            client_folder = os.path.join(split_folder, f"client_{client_i}")
            os.makedirs(client_folder)
            print("Saving to ", client_folder)
            client_parts = (
                ("train.dat", new_df[:train_size]),
                ("validation.dat", new_df[train_size:]),
                ("score.dat", new_df),  # all of the client's rows (train + validation)
            )
            for filename, part_df in client_parts:
                adult_df_to_dataset(part_df).save(os.path.join(client_folder, filename))