<a href="https://colab.research.google.com/github/sayakpaul/Federated-Learning-Shenanigans/blob/master/Toy_example_of_federated_learning_with_syft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install syft

In [0]:
import syft
from torch import nn

# Ignore unnecessary warnings and set the random seed
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
np.random.seed(7)

In [3]:
# Data loading preview
import pandas as pd

columns = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                    header=None,
                    names=columns)

data.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [0]:
# Splitting w.r.t fixed indexes rather than random splitting
train = data[:30561]
test = data[30561:]

In [0]:
# Extracting the categorical and non-categorical features
cat_feats = [column for column in data.columns if data[column].dtypes=='object']
non_cat_feats = [column for column in data.columns if data[column].dtypes!='object']

In [0]:
# Label encode categories
from sklearn.preprocessing import LabelEncoder

train[cat_feats] = train[cat_feats].apply(LabelEncoder().fit_transform)
test[cat_feats] = test[cat_feats].apply(LabelEncoder().fit_transform)

In [7]:
# Split the data w.r.t fixed set of indices
X_train = train.iloc[:,0:-1]
y_train = train['Income']

X_test = test.iloc[:,0:-1]
y_test = test['Income']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30561, 14), (30561,), (2000, 14), (2000,))

In [0]:
# Standard scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = StandardScaler().fit_transform(X_train[non_cat_feats].values)
scaled_test = StandardScaler().fit_transform(X_test[non_cat_feats].values)

X_train[non_cat_feats] = scaled_train
X_test[non_cat_feats] = scaled_test

In [0]:
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
  def __init__(self, data, cat_cols=None, output_col=None):
    """
    Courtesy: https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/
    """

    self.n = data.shape[0]

    if output_col:
      self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    else:
      self.y =  np.zeros((self.n, 1))

    self.cat_cols = cat_cols if cat_cols else []
    self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]

    if self.cont_cols:
      self.cont_X = data[self.cont_cols].astype(np.float32).values
    else:
      self.cont_X = np.zeros((self.n, 1))

    if self.cat_cols:
      self.cat_X = data[cat_cols].astype(np.int64).values
    else:
      self.cat_X =  np.zeros((self.n, 1))

  def __len__(self):
    """
    Denotes the total number of samples.
    """
    return self.n

  def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
    return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [0]:
train = X_train
train['Income'] = y_train

test = X_test
test['Income'] = y_test

In [0]:
train_data = TabularDataset(data=train, cat_cols=cat_feats[:-1],
                             output_col='Income')

test_data = TabularDataset(data=test, cat_cols=cat_feats[:-1],
                             output_col='Income')


In [0]:
train_loader = DataLoader(train_data, 32, shuffle=True)
test_loader = DataLoader(test_data, 32, shuffle=True)

In [13]:
y, cont_x, cat_x = next(iter(train_loader))
y.shape, cont_x.shape, cat_x.shape

(torch.Size([32, 1]), torch.Size([32, 6]), torch.Size([32, 8]))

In [30]:
from torch import nn

# Model definition
model = nn.Sequential(nn.Linear(14, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 1),
                      nn.Sigmoid())
model

Sequential(
  (0): Linear(in_features=14, out_features=64, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2)
  (3): Linear(in_features=64, out_features=64, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2)
  (6): Linear(in_features=64, out_features=1, bias=True)
  (7): Sigmoid()
)

In [0]:
import torch 

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
hook = syft.TorchHook(torch)
bob = syft.VirtualWorker(hook, id='bob1')



In [33]:
train_losses, test_losses = [], []
for e in range(40):
    running_loss = 0
    for y, cat_x, cont_x in train_loader:
      
        concat = torch.cat((cat_x.type(torch.float), cont_x.type(torch.float)), 1)
        concat = concat.view(concat.shape[0], -1).send(bob)
        concat = concat.get()
        
        y = y.type(torch.float).send(bob)
        y = y.get()
        
        optimizer.zero_grad()
        
        preds = model(concat)

        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    else:
        test_loss = 0
        accuracy = 0
        with torch.no_grad():
            model.eval()
            for y, cat_x, cont_x in test_loader:
              concat = torch.cat((cat_x.type(torch.float), cont_x.type(torch.float)), 1)
              concat = concat.view(concat.shape[0], -1).send(bob)
              concat = concat.get()
        
              y = y.type(torch.float).send(bob)
              y = y.get()
              preds = model(concat)
              test_loss += criterion(preds, y)
              # ps = torch.exp(logps)
              _, top_class = preds.topk(1, dim=1)
              # print(top_class.size(), preds.size())
              # print(top_class)
              # print(type(top_class), type(y))
              equals = top_class.type(torch.float) == y.type(torch.float)
              accuracy += torch.mean(equals.type(torch.FloatTensor))
        model.train()     
        
        train_losses.append(running_loss/len(train_loader))
        test_losses.append(test_loss/len(test_loader))
        
        print("Epoch: {}/{}.. ".format(e+1, 40),
              "Training Loss: {:.3f}.. ".format(running_loss/len(train_loader)),
              "Test Loss: {:.3f}.. ".format(test_loss/len(test_loader)),
              "Test Accuracy: {:.3f}".format(accuracy/len(test_loader)))


Epoch: 1/40..  Training Loss: 0.430..  Test Loss: 0.386..  Test Accuracy: 0.742
Epoch: 2/40..  Training Loss: 0.368..  Test Loss: 0.359..  Test Accuracy: 0.741
Epoch: 3/40..  Training Loss: 0.348..  Test Loss: 0.350..  Test Accuracy: 0.743
Epoch: 4/40..  Training Loss: 0.343..  Test Loss: 0.349..  Test Accuracy: 0.744
Epoch: 5/40..  Training Loss: 0.340..  Test Loss: 0.351..  Test Accuracy: 0.743
Epoch: 6/40..  Training Loss: 0.335..  Test Loss: 0.344..  Test Accuracy: 0.743
Epoch: 7/40..  Training Loss: 0.335..  Test Loss: 0.341..  Test Accuracy: 0.744
Epoch: 8/40..  Training Loss: 0.332..  Test Loss: 0.342..  Test Accuracy: 0.743
Epoch: 9/40..  Training Loss: 0.331..  Test Loss: 0.343..  Test Accuracy: 0.743
Epoch: 10/40..  Training Loss: 0.331..  Test Loss: 0.338..  Test Accuracy: 0.743
Epoch: 11/40..  Training Loss: 0.328..  Test Loss: 0.345..  Test Accuracy: 0.741
Epoch: 12/40..  Training Loss: 0.329..  Test Loss: 0.335..  Test Accuracy: 0.743
Epoch: 13/40..  Training Loss: 0.328.