In [1]:
# This Proof of Concept is largely influenced by 

# https://github.com/OpenMined/TenSEAL/blob/main/tutorials/Tutorial%201%20-%20Training%20and%20Evaluation%20of%20Logistic%20Regression%20on%20Encrypted%20Data.ipynb#:~:text=Tutorial,-1%20%2D%20Training%20and

[157.00002108341832, -90.00001200395398, 153.00002047548332]

In [1]:
# Arbitrarily chosen dataset 
# https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset/data

! kaggle datasets download uom190346a/sleep-health-and-lifestyle-dataset
! mkdir .data
! unzip sleep-health-and-lifestyle-dataset -d .data
! mv .data/Sleep_health_and_lifestyle_dataset.csv .data/data.csv

Dataset URL: https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset
License(s): CC0-1.0
sleep-health-and-lifestyle-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
mkdir: .data: File exists
Archive:  sleep-health-and-lifestyle-dataset.zip
  inflating: .data/Sleep_health_and_lifestyle_dataset.csv  


In [136]:
import torch
import torch.nn as nn
import tenseal as ts
import pandas as pd
import random
from time import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt

In [181]:
def load_and_cleanse_data(filepath: str) -> pd.DataFrame:
    ''' 
    Perform basic data cleansing and load the dataset
    '''
    df = pd.read_csv(filepath)

    df = df.drop(['Person ID'], axis=1)
    df['Gender'] = df['Gender'].replace({"Male": 0, "Female":1 })
    df['BMI Category'] = df['BMI Category'].replace({"Normal Weight": 0, "Normal":0, "Obese": 2, "Overweight": 1 }) 


    for col in ['Occupation']:
        categoricals = pd.get_dummies(df[col])
        df[categoricals.columns.values] = categoricals 
        df = df.drop(col, axis=1)
        
    
    df['Systolic'] = df['Blood Pressure'].str[:3].astype(int)
    df['Diastolic'] = df['Blood Pressure'].str[4:].astype(int)
    df = df.drop('Blood Pressure', axis=1)
    
    # Assuming the target is sleep disorder and make it "easier" by converting it to a binary problem

    df['Sleep Disorder'] = df['Sleep Disorder'].replace({"Sleep Apnea": 1, "Insomnia": 1, np.nan: 0}) 
    # x = df.drop('Sleep Disorder', axis=1).to_numpy().reshape((374, 1, 23))
    # y = df['Sleep Disorder'].to_numpy().reshape(374, 1, 1)
    x = df.drop('Sleep Disorder', axis=1)
    y = df['Sleep Disorder']
    
    
    return x, y

In [187]:
X, y = load_and_cleanse_data('.data/data.csv')

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [191]:
def process_as_tensors(X1, X2, y1, y2) -> pd.DataFrame:
    scaler = StandardScaler()
    
    # X1 = scaler.fit_transform(X1).reshape((X1.shape[0], 8, X1.shape[1]))
    # X2 = scaler.transform(X2).reshape((X2.shape[0], 8, X2.shape[1]))
    X1 = scaler.fit_transform(X1)
    X2 = scaler.transform(X2)
    
    X1 = torch.tensor(X1).type(torch.float32)
    X2 = torch.tensor(X2).type(torch.float32)
    y1 = torch.tensor(y1.values).type(torch.float32).reshape(-1, 1)
    y2 = torch.tensor(y2.values).type(torch.float32).reshape(-1, 1)

    return X1, X2, y1, y2, scaler

In [192]:
X_train, X_test, y_train, y_test, scaler = process_as_tensors(X_train, X_test, y_train, y_test)

In [193]:
y_train.shape

torch.Size([336, 1])

# Simple Modelling

In [224]:
torch.random.manual_seed(73)
random.seed(73)

In [None]:
class LogisticRegression(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, X):
        X = self.fc1(X)
        X = self.sigmoid(X)
        return X

In [374]:
lr = LogisticRegression(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(lr.parameters(), lr=0.00001)

In [375]:
for epoch in range(3):
    optimizer.zero_grad()
    
    output = lr(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()
    
    
    print(f"loss: {loss}")

    print(f"epoch {epoch}:")
    print(f"   acc: {accuracy_score(output.greater_equal_(0.5).detach().numpy(), y_train.detach().numpy())}")
    
    with torch.no_grad():
        output = lr(X_test)
        print(f"   acc: {accuracy_score(output.greater_equal_(0.5).detach().numpy(), y_test.detach().numpy())}")
    

loss: 0.6398484110832214
epoch 0:
   acc: 0.7380952380952381
   acc: 0.7368421052631579
loss: 0.6398444175720215
epoch 1:
   acc: 0.7380952380952381
   acc: 0.7368421052631579
loss: 0.6398404240608215
epoch 2:
   acc: 0.7380952380952381
   acc: 0.7368421052631579


In [279]:
# Well, probably should've chosen a better dataset. Oh well. 

# LR HE Modelling

In [291]:
class EncryptedLR:
    
    def __init__(self, torch_lr):
        # TenSEAL processes lists and not torch tensors,
        # so we take out the parameters from the PyTorch model
        self.weight = torch_lr.fc1.weight.data.tolist()[0]
        self.bias = torch_lr.fc1.bias.data.tolist()
        
    def forward(self, enc_x):
        # We don't need to perform sigmoid as this model
        # will only be used for evaluation, and the label
        # can be deduced without applying sigmoid
        enc_out = enc_x.dot(self.weight) + self.bias
        return enc_out
    
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
        
    ################################################
    ## You can use the functions below to perform ##
    ## the evaluation with an encrypted model     ##
    ################################################
    
    def encrypt(self, context):
        # so we are converting the weight tensors and convert them into ckks 
        # and the list was only specifically to fit in the parameters? 
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)
        
    def decrypt(self, context):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()
        

eelr = EncryptedLR(lr)

In [399]:
X_test.shape

torch.Size([38, 22])

In [400]:
lr.fc1.weight.shape

torch.Size([22, 22])

In [401]:
X_test @ lr.fc1.weight

tensor([[-6.3641e-01, -5.6711e-01,  4.6299e-01,  4.9486e-01, -2.9956e-01,
         -5.9192e-01,  5.2629e-01,  1.7862e+00, -6.7975e-01, -1.3454e-01,
         -2.1508e-01, -7.3722e-01, -8.0552e-01,  6.1498e-01,  5.4123e-01,
         -1.1277e+00,  1.7478e-01, -1.3616e+00,  1.2678e-01, -4.2391e-01,
         -1.2895e+00, -9.9172e-03],
        [-2.1464e-01, -4.3575e-01,  2.4720e-01,  4.4451e-01, -6.3491e-01,
          1.7685e-01, -4.7589e-01, -5.2195e-01,  5.5054e-02, -1.8185e-01,
          4.2141e-01,  6.6605e-02,  2.7149e-01,  9.2102e-02, -1.3754e+00,
          2.7931e-01, -5.5941e-01,  5.5787e-01,  9.5676e-02, -4.5596e-01,
          5.8295e-01, -4.6271e-01],
        [-1.3929e-01,  1.2786e-04,  6.5116e-01,  4.9737e-01,  3.2150e-01,
          2.6077e-01,  9.8190e-03, -6.4887e-01, -5.9816e-02, -3.0688e-01,
         -4.1849e-01, -6.7768e-01,  1.3161e-01, -1.4912e-01, -3.7672e-01,
          3.5482e-01, -2.0735e-01, -3.8464e-02,  4.1330e-01,  4.2140e-01,
          2.7222e-01, -1.2028e-01],
    

In [376]:
# parameters
poly_mod_degree = 4096
coeff_mod_bit_sizes = [40, 20, 40]
# create TenSEALContext
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# scale of ciphertext to use
ctx_eval.global_scale = 2 ** 20
# this key is needed for doing dot-product operations
ctx_eval.generate_galois_keys()

In [392]:
t_start = time()
enc_x_test = [ts.ckks_vector(ctx_eval, x.tolist()) for x in X_test]
y_test_2 = [y.detach().numpy() for y in y_test]
t_end = time()
print(f"Encryption of the test-set took {int(t_end - t_start)} seconds")

Encryption of the test-set took 0 seconds


In [397]:
def encrypted_evaluation(model, enc_x_test, y_test):
    t_start = time()
    
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.sigmoid(out)
        if torch.abs(out - y) < 0.5:
            correct += 1
    
    t_end = time()
    print(f"Evaluated test_set of {len(X_test)} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{len(X_test)} = {correct / len(X_test)}")
    return correct / len(X_test)
    

encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test_2)
diff_accuracy =  encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
    print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")

ValueError: can't add vectors of different sizes

- Two main ideas:
    1. It just seems fun to dissect and try to understand what's happening in the black box. I used to be very "skeptical" about these black box system and thought there was no way to open it up - but it seems we are making small steps. 
    2. Technical inventions would push governance (managing short term / actual risks)

- Output: really, just do something practical and small scale research. 


- revised commitments
- discuss, figure out a research question
- form groups
- spring break -> do projects 
- 