In [1]:
"""
Federated Learning is about a model going into a secure environment and learning how
to solve a problem w/o needing the data to move anywhere. 

This is huge for security. 
"""

import hashlib
import numpy as np
from collections import Counter
import random
import sys
import codecs

from toydl import layer
from toydl.layer.layer import Embedding, RNNCell, CrossEntropyLoss, MSELoss
from toydl.sgd.sgd import SGD
from toydl.tensor.tensor import Tensor

np.random.seed(12345)

ham_ = hashlib.md5("ham.txt".encode("utf-8")).hexdigest()
spam_ = hashlib.md5("spam.txt".encode("utf-8")).hexdigest()

with codecs.open(ham_, 'r', encoding='utf-8', errors='ignore') as f: 
    raw_h = f.readlines()
with codecs.open(spam_, 'r', encoding="utf-8", errors="ignore") as f: 
    raw_s = f.readlines()

vocab, spam, ham = (set(["<unk>"]), list(), list())
for row in raw_s:
    spam.append(set(row[:-2].split(" ")))
    for word in spam[-1]: 
        vocab.add(word)

for row in raw_h: 
    ham.append(set(row[:-2].split(" ")))
    for word in ham[-1]: 
        vocab.add(word)

vocab, w2i = (list(vocab), {})
for i, w in enumerate(vocab):
    w2i[w] = i

# here we make a function to pad the emails into 500 word long chunks
# we also append and prepend <unk> as a seperator token.
def to_indices(input, l=500):
    indices = list()
    for line in input:
        if(len(line) < l):
            line = list(line) + ["<unk>"] * (l - len(line))
            idxs = list()
            for word in line: 
                idxs.append(w2i[word])
            indices.append(idxs)
    return indices
    
    
spam_idx = to_indices(spam)
ham_idx = to_indices(ham)



train_spam_idx = spam_idx[0:-1000]
train_ham_idx = ham_idx[0:-1000]

test_spam_idx = spam_idx[-1000:]
test_ham_idx = ham_idx[-1000:]

train_data = list()
train_target = list()


test_data = list()
test_target = list()

for i in range(max(len(train_spam_idx), len(train_ham_idx))): 
    train_data.append(train_spam_idx[i%len(train_spam_idx)])
    train_target.append([1])
    
    train_data.append(train_ham_idx[i%len(train_ham_idx)])
    train_target.append([0])
    
for i in range(max(len(test_spam_idx), len(test_ham_idx))): 
    test_data.append(test_spam_idx[i%len(test_spam_idx)])
    test_target.append([1])
    
    test_data.append(test_ham_idx[i%len(test_ham_idx)])
    test_target.append([0])
    
    
def train(model, input_data, target_data, batch_size=500, iterations=5):
    n_batches = int(len(input_data) / batch_size)
    for iter in range(iterations):
        iter_loss = 0
        for b_i in range(n_batches):
            bs = n_batches
            # padding token should stay at 0
            model.weight.data[w2i['<unk>']] *= 0
            input = Tensor(input_data[b_i*bs:(b_i+1)*bs], autograd=True)
            target = Tensor(target_data[b_i*bs:(b_i+1)*bs], autograd=True)
            
            pred = model.forward(input).sum(1).sigmoid()
            loss = criterion.forward(pred, target)
            
            loss.backward(grad=None)
            optim.step()
            
            iter_loss += loss.data[0] / bs
            
            sys.stdout.write("\r\tLoss:" + str(iter_loss / (b_i+1)))
        print()
    return model


def test(model, test_input, test_output): 
    model.weight.data[w2i['<unk>']] *= 0
    input = Tensor(test_input, autograd=True)
    target = Tensor(test_output, autograd=True)
    pred = model.forward(input).sum(1).sigmoid()
    return ((pred.data > 0.5) == target.data).mean()

In [2]:
# Create our model here
model = Embedding(vocab_size=len(vocab), dim=1)
model.weight.data *= 0
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.01)

for i in range(3):
    model = train(model, train_data, train_target, iterations=1)
    print("% Correct on Test Set: " + str(test(model, test_data, test_target)*100))

	Loss:0.06458066488952852
% Correct on Test Set: 97.75
	Loss:0.029321291479249295
% Correct on Test Set: 98.1
	Loss:0.021904861617501638
% Correct on Test Set: 98.3


In [3]:
"""
Federating the Data Sets

Each person's dataset changes the model slightly.
"""
bob = (train_data[0:1000], train_target[0:1000])
alice = (train_data[1000:2000], train_target[1000:2000])
sue = (train_data[2000:], train_target[2000:])
import copy
for i in range(3):
    print("Starting Training Round")
    print("\t Step 1: send the model to Bob")
    bob_model = train(copy.deepcopy(model), bob[0], bob[1], iterations=1)
    
    print("\n\t Step 2: send the model to Alica")
    alice_model = train(copy.deepcopy(model), alice[0], alice[1], iterations=1)
    
    print("\n\t Step 3: send the model to Sue")
    sue_model = train(copy.deepcopy(model), sue[0], sue[1], iterations=1)
    
    print("\n\tAverage Everyone's New Models")
    model.weight.data = (bob_model.weight.data + \
                         alice_model.weight.data + \
                         sue_model.weight.data)/3
    
    print("\t% Correct on Test Set: " + \
           str(test(model, test_data, test_target)*100))
    
    print("\nRepeat the process..\n")


Starting Training Round
	 Step 1: send the model to Bob
	Loss:0.11674711108092445

	 Step 2: send the model to Alica
	Loss:0.0154820568028543057

	 Step 3: send the model to Sue
	Loss:0.020925986728137825

	Average Everyone's New Models
	% Correct on Test Set: 98.3

Repeat the process..

Starting Training Round
	 Step 1: send the model to Bob
	Loss:0.11674711108092445

	 Step 2: send the model to Alica
	Loss:0.015482056802854308

	 Step 3: send the model to Sue
	Loss:0.020925986728137825

	Average Everyone's New Models
	% Correct on Test Set: 98.3

Repeat the process..

Starting Training Round
	 Step 1: send the model to Bob
	Loss:0.11674711108092445

	 Step 2: send the model to Alica
	Loss:0.015482056802854308

	 Step 3: send the model to Sue
	Loss:0.020925986728137825

	Average Everyone's New Models
	% Correct on Test Set: 98.3

Repeat the process..



In [5]:
"""
The problem stated above  - we can still backtrack and calculate a single
person's gradient. The more noise we add the more the data becomes obfuscated.

The only problem w/ this is that it hurts training. 

We use homomorphic encryption to solve the problem ~ we sum all the gradients
from all the participants in sucha way taht no one can see anyone's gradient but 
their own.
"""
!pip3 install phe
import phe
pub, pri = phe.generate_paillier_keypair(n_length=1024)
x = pub.encrypt(5)
y = pub.encrypt(4)
z = x+y
z_ = pri.decrypt(z)
print(z_)

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Collecting phe
  Downloading phe-1.5.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.9 MB/s eta 0:00:01
[?25hInstalling collected packages: phe
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Successfully installed phe-1.5.0
You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m
9
