Currently, I am taking the course [Secure and Private AI](https://www.udacity.com/course/secure-and-private-ai--ud185) on Udacity. The course is very well curated and is amazingly taught by none other than Andrew Trask. This notebook demonstrates a toy example where I am using Federated Learning using the `syft` framework for training a shallow neural network. 

In [0]:
# Install syft
!pip install syft

In [0]:
# Import syft and torch
import syft
import torch

# Create syft hook to modify the torch functionalities
hook = syft.TorchHook(torch)
# Create a remote worker
bob = syft.VirtualWorker(hook, id='bob')

In [0]:
# Ignore unnecessary warnings and set the random seed
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
np.random.seed(7)

In [6]:
# Data loading preview
import pandas as pd

columns = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                    header=None,
                    names=columns)

data.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [0]:
# Splitting w.r.t fixed indexes rather than random splitting
train = data[:30561]
test = data[30561:]

In [0]:
# Extracting the categorical and non-categorical features
cat_feats = [column for column in data.columns if data[column].dtypes=='object']
non_cat_feats = [column for column in data.columns if data[column].dtypes!='object']

In [0]:
# Label encode categories
from sklearn.preprocessing import LabelEncoder

train[cat_feats] = train[cat_feats].apply(LabelEncoder().fit_transform)
test[cat_feats] = test[cat_feats].apply(LabelEncoder().fit_transform)

In [10]:
# Split the data w.r.t fixed set of indices
X_train = train.iloc[:,0:-1]
y_train = train['Income']

X_test = test.iloc[:,0:-1]
y_test = test['Income']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30561, 14), (30561,), (2000, 14), (2000,))

In [0]:
# Standard scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = StandardScaler().fit_transform(X_train[non_cat_feats].values)
scaled_test = StandardScaler().fit_transform(X_test[non_cat_feats].values)

X_train[non_cat_feats] = scaled_train
X_test[non_cat_feats] = scaled_test

In [0]:
# Define a custom dataset to load the data
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
  def __init__(self, data, cat_cols=None, output_col=None):
    """
    Courtesy: https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/
    """

    self.n = data.shape[0]

    if output_col:
      self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    else:
      self.y =  np.zeros((self.n, 1))

    self.cat_cols = cat_cols if cat_cols else []
    self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]

    if self.cont_cols:
      self.cont_X = data[self.cont_cols].astype(np.float32).values
    else:
      self.cont_X = np.zeros((self.n, 1))

    if self.cat_cols:
      self.cat_X = data[cat_cols].astype(np.int64).values
    else:
      self.cat_X =  np.zeros((self.n, 1))

  def __len__(self):
    """
    Denotes the total number of samples.
    """
    return self.n

  def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
    return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [0]:
# Append the labels
train = X_train
train['Income'] = y_train

test = X_test
test['Income'] = y_test

In [0]:
# Create the datasets
train_data = TabularDataset(data=train, cat_cols=cat_feats[:-1],
                             output_col='Income')

test_data = TabularDataset(data=test, cat_cols=cat_feats[:-1],
                             output_col='Income')


In [0]:
# Prepare the dataloaders
train_loader = DataLoader(train_data, 64)
test_loader = DataLoader(test_data, 64)

In [16]:
# Check one sample
y, cont_x, cat_x = next(iter(train_loader))
y.shape, cont_x.shape, cat_x.shape

(torch.Size([64, 1]), torch.Size([64, 6]), torch.Size([64, 8]))

In [20]:
from torch import nn

# Model definition
model = nn.Sequential(nn.Linear(14, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 64),
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(64, 1),
                      nn.Sigmoid())
model

Sequential(
  (0): Linear(in_features=14, out_features=64, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2)
  (3): Linear(in_features=64, out_features=64, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2)
  (6): Linear(in_features=64, out_features=1, bias=True)
  (7): Sigmoid()
)

In [0]:
# Send the model to a remote worker Bob
model = model.send(bob)

In [0]:
# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [22]:
# Train and validation
train_losses, test_losses = [], []
for e in range(40):
    running_loss = 0
    for y, cat_x, cont_x in train_loader:
      
        concat = torch.cat((cat_x.type(torch.float), cont_x.type(torch.float)), 1)
        # Send the features to Bob
        concat = concat.view(concat.shape[0], -1).send(bob)
        # Send the labels to Bob
        y = y.type(torch.float).send(bob)
        
        optimizer.zero_grad()
        # Computation is happening on Bob's worker
        preds = model(concat)
        # Retrieve the tesnors
        preds = preds.get()
        y = y.get()
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    else:
        test_loss = 0
        accuracy = 0
        with torch.no_grad():
            model.eval()
            for y, cat_x, cont_x in test_loader:
              concat = torch.cat((cat_x.type(torch.float), cont_x.type(torch.float)), 1)
              concat = concat.view(concat.shape[0], -1).send(bob)
              
              y = y.type(torch.float).send(bob)
              
              preds = model(concat)
              preds = preds.get()
              y = y.get()
              test_loss += criterion(preds, y)
              _, top_class = preds.topk(1, dim=1)
              equals = top_class.type(torch.float) == y.type(torch.float)
              accuracy += torch.mean(equals.type(torch.FloatTensor))
        model.train()     
        
        train_losses.append(running_loss/len(train_loader))
        test_losses.append(test_loss/len(test_loader))
        
        print("Epoch: {}/{}.. ".format(e+1, 40),
              "Training Loss: {:.3f}.. ".format(running_loss/len(train_loader)),
              "Test Loss: {:.3f}.. ".format(test_loss/len(test_loader)),
              "Test Accuracy: {:.3f}".format(accuracy/len(test_loader)))


Epoch: 1/40..  Training Loss: 6.639..  Test Loss: 7.127..  Test Accuracy: 0.742
Epoch: 2/40..  Training Loss: 6.645..  Test Loss: 7.086..  Test Accuracy: 0.744
Epoch: 3/40..  Training Loss: 6.617..  Test Loss: 7.141..  Test Accuracy: 0.742
Epoch: 4/40..  Training Loss: 6.617..  Test Loss: 7.141..  Test Accuracy: 0.742
Epoch: 5/40..  Training Loss: 6.617..  Test Loss: 7.072..  Test Accuracy: 0.744
Epoch: 6/40..  Training Loss: 6.645..  Test Loss: 7.141..  Test Accuracy: 0.742
Epoch: 7/40..  Training Loss: 6.617..  Test Loss: 7.127..  Test Accuracy: 0.742
Epoch: 8/40..  Training Loss: 6.617..  Test Loss: 7.086..  Test Accuracy: 0.744
Epoch: 9/40..  Training Loss: 6.617..  Test Loss: 7.086..  Test Accuracy: 0.744
Epoch: 10/40..  Training Loss: 6.617..  Test Loss: 7.100..  Test Accuracy: 0.743
Epoch: 11/40..  Training Loss: 6.617..  Test Loss: 7.072..  Test Accuracy: 0.744
Epoch: 12/40..  Training Loss: 6.617..  Test Loss: 7.141..  Test Accuracy: 0.742
Epoch: 13/40..  Training Loss: 6.617.