Currently, I am taking the course [Secure and Private AI](https://www.udacity.com/course/secure-and-private-ai--ud185) on Udacity. The course is very well curated and is amazingly taught by none other than [Andrew Trask](https://iamtrask.github.io/). This notebook demonstrates a toy example where I am using Federated Learning using the syft framework for training a shallow neural network.

In [0]:
# Install syft
!pip install syft

In [0]:
import syft, torch

# Create syft hook to modify the torch functionalities
hook = syft.TorchHook(torch)

# Create a remote worker
bob = syft.VirtualWorker(hook, id='bob')



In [0]:
# Ignore unnecessary warnings and set the random seed
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
np.random.seed(7)

In [0]:
# Data loading preview
import pandas as pd

columns = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                    header=None,
                    names=columns)

data.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [0]:
# Splitting w.r.t fixed indexes rather than random splitting
train = data[:30561]
test = data[30561:]

In [0]:
# Extracting the categorical and non-categorical features
cat_feats = [column for column in data.columns if data[column].dtypes=='object']
non_cat_feats = [column for column in data.columns if data[column].dtypes!='object']


In [0]:
# Label encode categories
from sklearn.preprocessing import LabelEncoder

train[cat_feats] = train[cat_feats].apply(LabelEncoder().fit_transform)
test[cat_feats] = test[cat_feats].apply(LabelEncoder().fit_transform)

In [0]:
# Split the data w.r.t fixed set of indices
X_train = train.iloc[:,0:-1]
y_train = train['Income']

X_test = test.iloc[:,0:-1]
y_test = test['Income']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30561, 14), (30561,), (2000, 14), (2000,))

In [0]:
# Standard scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = StandardScaler().fit_transform(X_train[non_cat_feats].values)
scaled_test = StandardScaler().fit_transform(X_test[non_cat_feats].values)

X_train[non_cat_feats] = scaled_train
X_test[non_cat_feats] = scaled_test

In [0]:
# Setup the dataset for compatibility with PyTorch
X_train['Income'] = y_train
X_test['Income'] = y_test

train_t = torch.from_numpy(X_train.values)
test_t = torch.from_numpy(X_test.values) 

In [0]:
# Check the shapes
train_t.size(), test_t.size()

(torch.Size([30561, 15]), torch.Size([2000, 15]))

In [0]:
# Create a custom Dataset
from torch.utils.data import Dataset, DataLoader
class TabularDataset(Dataset):
  def __init__(self, data):
    """
    Courtesy: https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/
    """

    self.n = data.shape[0]
    self.X = data[:,0:-1].type(torch.float)
    self.y = data[:,-1].type(torch.float).view(-1, 1)

  def __len__(self):
    """
    Denotes the total number of samples.
    """
    return self.n

  def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
    return [self.X[idx], self.y[idx]]

In [0]:
# Create the custom train and test datasets
full_train = TabularDataset(data=train_t)
full_test = TabularDataset(data=test_t)

In [0]:
# Prepare the dataloaders to stream data to the model
# in batches of 64

train_loader = DataLoader(full_train, 64)
test_loader = DataLoader(full_test, 64)

In [0]:
# Verify on one sample
features, labels = next(iter(train_loader))
features.size(), labels.size()

(torch.Size([64, 14]), torch.Size([64, 1]))

In [0]:
from torch import nn

# Model definition
model = nn.Sequential(nn.Linear(14, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 1),
                      nn.Sigmoid())
model

Sequential(
  (0): Linear(in_features=14, out_features=64, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2)
  (3): Linear(in_features=64, out_features=64, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2)
  (6): Linear(in_features=64, out_features=1, bias=True)
  (7): Sigmoid()
)

In [0]:
# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [0]:
# Train and validate
train_losses, test_losses = [], []
for e in range(20):
  running_loss = 0
  for features, labels in train_loader:  
    features = features.send(bob)
    labels = labels.send(bob)
    model = model.send(features.location)
    optimizer.zero_grad()
    preds = model(features)
    loss = criterion(preds, labels)
    loss.backward()
    optimizer.step()
    model = model.get()
    running_loss += loss.get().item()
  
  else:
    test_loss = 0
    accuracy = 0
    with torch.no_grad():
      model.eval()
      for features, labels in test_loader:
        features = features.send(bob)
        labels = labels.send(bob)
        model = model.send(features.location)
        preds = model(features)
        test_loss += criterion(preds, labels).get()
        _, top_class = preds.topk(1, dim=1)
        equals = top_class.long() == labels.long()
        accuracy += torch.mean(equals.float()) 
        model = model.get()
    
    model.train()
    train_losses.append(running_loss/len(train_loader))
    test_losses.append(test_loss/len(test_loader))

    print("Epoch: {}/{}.. ".format(e+1, 40),
          "Training Loss: {:.3f}.. ".format(running_loss/len(train_loader)),
          "Test Loss: {:.3f}.. ".format(test_loss/len(test_loader)),
          "Test Accuracy: {:.3f}".format(accuracy/len(test_loader)))



Epoch: 1/40..  Training Loss: 0.191..  Test Loss: 0.184..  Test Accuracy: 0.743
Epoch: 2/40..  Training Loss: 0.180..  Test Loss: 0.179..  Test Accuracy: 0.743
Epoch: 3/40..  Training Loss: 0.176..  Test Loss: 0.174..  Test Accuracy: 0.743
Epoch: 4/40..  Training Loss: 0.172..  Test Loss: 0.167..  Test Accuracy: 0.743
Epoch: 5/40..  Training Loss: 0.166..  Test Loss: 0.160..  Test Accuracy: 0.743
Epoch: 6/40..  Training Loss: 0.160..  Test Loss: 0.152..  Test Accuracy: 0.743
Epoch: 7/40..  Training Loss: 0.154..  Test Loss: 0.146..  Test Accuracy: 0.743
Epoch: 8/40..  Training Loss: 0.150..  Test Loss: 0.142..  Test Accuracy: 0.743
Epoch: 9/40..  Training Loss: 0.145..  Test Loss: 0.139..  Test Accuracy: 0.743
Epoch: 10/40..  Training Loss: 0.142..  Test Loss: 0.136..  Test Accuracy: 0.743
Epoch: 11/40..  Training Loss: 0.140..  Test Loss: 0.135..  Test Accuracy: 0.743
Epoch: 12/40..  Training Loss: 0.138..  Test Loss: 0.134..  Test Accuracy: 0.743
Epoch: 13/40..  Training Loss: 0.135.