In [54]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append("../")

import gurobipy
from json import dumps, loads
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as skLogisticRegression
from sklearn.metrics import (classification_report, f1_score, precision_score, recall_score)
from tqdm import tnrange, trange
import tensorflow as tf

from mlsql.influence import InfluenceRanker
from mlsql.fixer import AutoFixer
from mlsql.manager import ModelManagerLM

from models.simple_cnn import SimpleCNN
from models.logreg import LogReg
from models.linear_comb import LinearComb
from processors.adultNoCorr import AdultNoCorrProcessor

from itertools import groupby
from functools import partial

import logging
logging.getLogger("tensorflow").setLevel(logging.CRITICAL)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
import time
import altair as alt
alt.data_transformers.disable_max_rows()

@tf.function
def rank_fix(ranker, fixer, n):
    rank = ranker.predict()
    fixer.fix(rank, n)
    return rank

@tf.function
def rankit(ranker):
    rank = ranker.predict()
    return rank


@tf.function
def fixit(fixer, rank, n):
    fixer.fix(rank, n)


# @tf.function
def train(manager):
    manager.fit()

In [58]:
# Init
proc = AdultNoCorrProcessor()
model = LogReg(1)
manager0 = ModelManagerLM(proc.x_train, proc.y_train, model, 1024)
start = time.time()
manager0.fit(print_value=True)
print(time.time() - start)
manager0.report(proc.x_train, proc.y_train, proc.x_test, proc.y_test)

SGD loss: tf.Tensor(0.39970687, shape=(), dtype=float32)
SGD steps: 499
3.61708927154541
Model name: LogReg
On Training
               precision    recall  f1-score   support

         0.0       0.84      0.93      0.88     19751
         1.0       0.68      0.43      0.53      6297

    accuracy                           0.81     26048
   macro avg       0.76      0.68      0.70     26048
weighted avg       0.80      0.81      0.80     26048

On Testing
               precision    recall  f1-score   support

         0.0       0.84      0.94      0.88      2462
         1.0       0.69      0.44      0.54       794

    accuracy                           0.82      3256
   macro avg       0.76      0.69      0.71      3256
weighted avg       0.80      0.82      0.80      3256



In [19]:
from mlsql.lc_protocol import fit as lc_fit
model_a = LinearComb(1)
manager_a = ModelManagerLM(proc.x_a_train, proc.y_train, model_a)
model_b = LinearComb(1)
manager_b = ModelManagerLM(proc.x_b_train, proc.y_train, model_b)

In [8]:
start = time.time()
for iteration in range(1000):
    evaluate_a = manager_a.master_evaluate() # c1f1-y
    egrads_a = manager_a.egrads() # c1
    evaluate_b = manager_b.slave_evaluate() # c2f2
    egrads_b = manager_b.egrads() # c2
print(time.time() - start)

16.894684553146362


In [9]:
# evaluate_a = manager_a.master_evaluate() # c1f1-y
# egrads_a = manager_a.egrads() # c1
# evaluate_b = manager_b.slave_evaluate() # c2f2
# egrads_b = manager_b.egrads() # c2

In [12]:
# 2048
# *1000*26
start = time.time()
eval_a = evaluate_a.numpy()
enc_a_eval_a = manager_a.encrypt(eval_a[:1000]) # [c1f1-y]_a
eval_b = evaluate_b.numpy()
enc_b_eval_b = manager_b.encrypt(eval_b[:1000]) # [c2f2]_b
enc_a_grads_b = (enc_a_eval_a[:1000].reshape(-1, 1) + eval_b[:1000])
enc_a_grads_b = (egrads_b.numpy()[:1000] * enc_a_grads_b).mean(axis=0)
manager_a.decrypt(enc_a_grads_b)
print(time.time() - start)

245.4406714439392


In [10]:
# 1024
# *1000*26
start = time.time()
eval_a = evaluate_a.numpy()
enc_a_eval_a = manager_a.encrypt(eval_a[:1000]) # [c1f1-y]_a
eval_b = evaluate_b.numpy()
enc_b_eval_b = manager_b.encrypt(eval_b[:1000]) # [c2f2]_b
enc_a_grads_b = (enc_a_eval_a[:1000].reshape(-1, 1) + eval_b[:1000])
enc_a_grads_b = (egrads_b.numpy()[:1000] * enc_a_grads_b).mean(axis=0)
manager_a.decrypt(enc_a_grads_b)
print(time.time() - start)

36.96947455406189


In [16]:
# 512
# *100*26
start = time.time()
eval_a = evaluate_a.numpy()
enc_a_eval_a = manager_a.encrypt(eval_a[:10000]) # [c1f1-y]_a
eval_b = evaluate_b.numpy()
enc_b_eval_b = manager_b.encrypt(eval_b[:10000]) # [c2f2]_b
enc_a_grads_b = (enc_a_eval_a[:10000].reshape(-1, 1) + eval_b[:10000])
enc_a_grads_b = (egrads_b.numpy()[:10000] * enc_a_grads_b).mean(axis=0)
manager_a.decrypt(enc_a_grads_b)
print(time.time() - start)

63.94075036048889


In [18]:
# 256
# *100*26
start = time.time()
eval_a = evaluate_a.numpy()
enc_a_eval_a = manager_a.encrypt(eval_a[:10000]) # [c1f1-y]_a
eval_b = evaluate_b.numpy()
enc_b_eval_b = manager_b.encrypt(eval_b[:10000]) # [c2f2]_b
enc_a_grads_b = (enc_a_eval_a[:10000].reshape(-1, 1) + eval_b[:10000])
enc_a_grads_b = (egrads_b.numpy()[:10000] * enc_a_grads_b).mean(axis=0)
manager_a.decrypt(enc_a_grads_b)
print(time.time() - start)

17.684959411621094


In [50]:
df_enc = pd.DataFrame({
    "n_bit": [2048, 1025, 512, 256],
    "Time Cost (h)": [245.44*1000*26/60/60, 36.96*1000*26/60/60, 63.94*100*26/60/60, 17.68*100*26/60/60],
    "Method": np.repeat(" Training Encryption", 4),
})
alt.Chart(pd.concat([df_fuck])).mark_line().encode(
    x = "n_bit",
    y = "Time Cost (h)",
    color = "Method"
)
# n_bit=np.log2([2048, 1025, 512, 256])

# total_time_cost=[245.44*1000*26/60/60, 36.96*1000*26/60/60, 63.94*100*26/60/60, 17.68*100*26/60/60]
# plt.xlabel('log2(n_bit)')
# plt.ylabel('time cost (hrs)')
# plt.plot(n_bit, total_time_cost, 'bo--')

In [51]:
df_enc

Unnamed: 0,n_bit,Time Cost (h),Method
0,2048,1772.622222,Training Encryption
1,1025,266.933333,Training Encryption
2,512,46.178889,Training Encryption
3,256,12.768889,Training Encryption


In [None]:
from mlsql.lc_protocol import fit as lc_fit
model_a = LinearComb(1)
manager_a = ModelManagerLM(proc.x_a_train, proc.y_train, model_a)
model_b = LinearComb(1)
manager_b = ModelManagerLM(proc.x_b_train, proc.y_train, model_b)
lc_fit(manager_a, manager_b, max_iter=1, print_value=True)

In [None]:
K = 2600
corrsel = tf.cast(tf.ones(proc.y_train.shape[0]), dtype='bool')

In [None]:
from tqdm.notebook import tnrange, trange
manager = ModelManagerLM(proc.x_train, proc.y_train, LogReg(1))
manager.model.set_weights(manager0.model.get_weights())
manager.delta = tf.Variable(manager0.delta.value(), name="delta")
ranker = InfluenceRanker(manager=manager, on=proc.complain)
fixer = AutoFixer(manager, corrsel, K)

AQs = []
weighted_f1 = []
rank_list = []
rank_time_rain = 0
model_time_rain = 0
_, AQ, _, _ = proc.complain(manager)
f1 = f1_score(proc.y_test.numpy(), manager.model.predict(proc.x_test).numpy(), average='weighted')
AQs.append(float(AQ))
weighted_f1.append(f1)

step_size = 10
rain_k = int(np.ceil(K / step_size))
for k in trange(0, rain_k):
    nfix = min(step_size, K - step_size * k)
    assert nfix > 0

    start = time.time()
    rank = rank_fix(ranker, fixer, nfix)
    middle = time.time()
    train(manager)
    end = time.time()
    
    rank_list.append(rank.numpy())
    rank_time_rain += middle - start
    model_time_rain += end - middle

    _, AQ, _, _ = proc.complain(manager)
    f1 = f1_score(proc.y_test.numpy(), manager.model.predict(proc.x_test).numpy(), average='weighted')
    AQs.append(float(AQ))
    weighted_f1.append(f1)

print("Rank_time:", rank_time_rain)
print("Model_time:", model_time_rain)

df_rain = pd.DataFrame({
    "Complain": np.array(AQs),
    "F1": np.array(weighted_f1),
    "K": [1] + list(range(step_size, K + step_size, step_size)),
    "Method": np.repeat("Rain", len(AQs)),
})
alt.Chart(pd.concat([df_rain])).mark_line().encode(
    x = "K",
    y = "Complain",
    color = "Method"
)

In [None]:
#LinearComb
from mlsql.managerlm_new import ModelManagerLM
from mlsql.models.linear_comb_new import LinearComb
manager1 = ModelManagerLM(proc.X_Atrain, proc.ytrain, LinearComb(proc, 1))
manager1.fit(print_value=True, max_iter=2000, tol=1e-6)

In [None]:
print("LinearComb")
print("On Training\n", classification_report(proc.ytrain.numpy(), manager1.predict(proc.X_Atrain).numpy()))
print("On Testing\n", classification_report(proc.ytest.numpy(), manager1.predict(proc.X_Atest).numpy()))

In [None]:
K = 2600

In [None]:
manager = ModelManagerLM(proc.X_Atrain, proc.X_Btrain, proc.ytrain, LinearComb(proc, 1))
manager.model.set_weights(manager1.model.get_weights())
manager.delta = tf.Variable(manager1.delta.value(), name="delta")
ranker = InfluenceRanker(manager=manager, on=proc.complain)
fixer = AutoFixer(manager, corrsel, K)

AQs = []
weighted_f1 = []
rank_list = []
rank_time_lcomb = 0
model_time_lcomb = 0
_, AQ, _, _ = proc.complain(manager)
f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
AQs.append(float(AQ))
weighted_f1.append(f1)

step_size = 10
rain_k = int(np.ceil(K / step_size))
for k in trange(0, rain_k):
    nfix = min(step_size, K - step_size * k)
    assert nfix > 0

    start = time.time()
    rank = rank_fix(ranker, fixer, nfix)
    middle = time.time()
#     train(manager)
    manager.fit(tol=1e-6)
    end = time.time()
    
    rank_list.append(rank.numpy())
    rank_time_lcomb += middle - start
    model_time_lcomb += end - middle

    _, AQ, _, _ = proc.complain(manager)
    f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
    AQs.append(float(AQ))
    weighted_f1.append(f1)

print("Rank_time:", rank_time_lcomb)
print("Model_time:", model_time_lcomb)

df_lcomb = pd.DataFrame({
    "Complain": np.array(AQs),
    "F1": np.array(weighted_f1),
    "K": [1] + list(range(step_size, K + step_size, step_size)),
    "Method": np.repeat("Lcomb", len(AQs)),
})
alt.Chart(pd.concat([df_lcomb])).mark_line().encode(
    x = "K",
    y = "Complain",
    color = "Method"
)

In [None]:
alt.Chart(pd.concat([df_rain, df_lcomb]), title="Complain of query data vs. K, AC=0").mark_line().encode(
    x = "K",
    y = "Complain",
    color = "Method"
)

In [None]:
alt.Chart(pd.concat([df_rain, df_lcomb]), title="F1 score of test data vs. K",).mark_line().encode(
    alt.Y('F1',
        scale=alt.Scale(domain=(0.75, 0.85))
    ),
    x = "K",
    color = "Method"
)

In [None]:
lcomb_del = set(fixer.deletions.numpy())

In [None]:
rain_del = set(fixer.deletions.numpy())

In [None]:
len(rain_del.intersection(lcomb_del))/2600

In [None]:
len(rain_del.intersection(lcomb_del))