In [None]:
!pip install tensorflow-privacy
!pip install pyarrow
!pip install -q tensorflow-model-optimization

import tensorflow_privacy as tp
import tensorflow as tf
import tensorflow_privacy
from tensorflow_privacy.privacy.analysis.compute_noise_from_budget_lib import compute_noise

import tensorflow_model_optimization as tfmot

import pandas as pd
import numpy as np
import pyarrow
import pyarrow.parquet as pq

from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import gc
import os

# Root folder of the project on the mounted Google Drive.
# NOTE(review): the name `dir` shadows Python's builtin dir(); other cells in
# this notebook read it, so renaming must be done notebook-wide.
dir='/content/drive/MyDrive/DGA_PPML/'
# Folder where plots, model files and weight exports are written.
dir_model = dir + 'all_dp/'
# Sub-folder of `dir` that holds the train/test parquet files.
dataset='all_dataset'

In [None]:
# Load the four binary-classification splits (features and labels, train and
# test) from the dataset's 'binary' folder.
binary_dir = dir+dataset+'/binary/'
X_train, X_test, y_train, y_test = (
    pd.read_parquet(binary_dir+split_name+'.parquet', engine='pyarrow')
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
# parameters
# NOTE(review): runTraining_dpsgd assigns its own local `noise_multiplier`
# from compute_noise(), so this module-level value is not the one handed to
# the optimizer there.
noise_multiplier = 0.1
# Number of training epochs; used by model.fit and by compute_noise.
epochs = 30
# Mini-batch size; used by model.fit, model.evaluate and compute_noise.
batch_size = 64
# The next three values are passed to the DP Adam optimizer.
learning_rate = 0.001
num_microbatches = 1
l2_norm_clip = 1
# Number of training rows.
n=np.shape(X_train)[0]
# Delta passed to compute_noise, set to the inverse of the training-set size.
delta=1/n
# Passed to compute_noise as `noise_lbd`; presumably the lower bound of the
# noise-multiplier search -- TODO confirm against tensorflow-privacy.
noise_lbd=0.01

In [None]:
# training binary model
def _plot_history(hist, series, title, ylabel, out_path):
  """Plot one or more Keras history series against the epoch and save as PNG.

  Args:
    hist: History object returned by model.fit.
    series: iterable of (history_key, legend_label) pairs to draw.
    title: figure title.
    ylabel: label of the y axis.
    out_path: file the figure is saved to.
  """
  fig = plt.figure(figsize=(6, 3), dpi=150)
  for key, label in series:
    plt.plot(hist.history[key], label=label)
  plt.title(title)
  plt.xlabel('Epoch')
  plt.ylabel(ylabel)
  plt.legend()
  fig.tight_layout()
  fig.savefig(out_path, dpi=200)
  # Render the figure now and release it, so figures do not accumulate in
  # memory when training is repeated in a loop.
  plt.show()
  plt.close(fig)


def train_binary(epsilon, model, name_model):
  """Train a compiled binary classifier and write its artifacts to dir_model.

  Steps: fit on X_train/y_train (20% validation split), save accuracy/loss
  curves, print train/test loss and accuracy, save the architecture (JSON)
  and weights (H5), export weights and the first test samples as text files
  for the MPC protocols, and print classification metrics for both the Keras
  model and a float16 TFLite conversion of it.

  Only the plot file names contain `epsilon`; the model and export files are
  named after `name_model` alone, so a later call with the same name
  overwrites them.

  Args:
    epsilon: privacy budget of this run; used in plot titles and file names.
    model: compiled Keras model whose metrics are loss and accuracy.
    name_model: identifier used in titles and output file names.

  Returns:
    The same `model` object, trained in place.
  """
  # summary() prints the table itself and returns None.
  model.summary()

  hist = model.fit(X_train, y_train, batch_size=batch_size, validation_split=0.2, epochs=epochs, shuffle=True)

  acc_title = f'Epoch x Accuracy model={name_model} eps={epsilon}'
  loss_title = f'Epoch x Loss model={name_model} eps={epsilon}'
  plot_suffix = f"_{name_model}_dp_eps_{epsilon}.png"

  _plot_history(hist, [('accuracy', 'Training accuracy'), ('val_accuracy', 'Validation accuracy')],
                acc_title, 'Accuracy', dir_model+"accuracy"+plot_suffix)
  _plot_history(hist, [('accuracy', 'Training accuracy')],
                acc_title, 'Accuracy', dir_model+"accuracy1"+plot_suffix)
  _plot_history(hist, [('loss', 'Training loss'), ('val_loss', 'Validation loss')],
                loss_title, 'Loss', dir_model+"loss"+plot_suffix)
  _plot_history(hist, [('loss', 'Training loss')],
                loss_title, 'Loss', dir_model+"loss1"+plot_suffix)

  score_train, acc_train = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
  score_test, acc_test = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

  print(f"score_train: {score_train} acc_train: {acc_train}")
  print(f"score_test: {score_test} acc_test: {acc_test}")

  modelo_json = model.to_json()
  with open(dir_model+"model_"+name_model+"_dp.json", "w") as json_file:
      json_file.write(modelo_json)
  model.save_weights(dir_model+"model_"+name_model+"_dp.h5")

  # Saving models to load on MPC protocols
  # The combined file starts with a line holding a single space, followed by
  # every weight tensor flattened, one value per line.
  inputP0=dir_model+"input-P0-0_"+name_model+"_embedding_dp"
  with open(inputP0, "w") as out_p0:
    out_p0.write(" \n")
    i=0
    for w in model.get_weights():
      try:
        arq=dir_model+"weights"+str(i)+".csv"
        flat_w = w.ravel()
        np.savetxt(arq, flat_w, delimiter=" ",fmt='%f')
        np.savetxt(out_p0, flat_w, delimiter=" ",fmt='%f')
        print(i, np.array(w).shape, np.ravel(w).shape)
        i=i+1
      except Exception as exc:
        # Best effort: report the tensor that failed and keep exporting.
        print('erro',w.shape, exc)

  # Saving data to test inference in MPC protocols
  # The file starts with an empty line; each sample contributes its one-hot
  # encoded features followed by its label.
  inputP1=dir_model+"input-P1-0_"+name_model+"_onehotencoder_dp"
  with open(inputP1, "w") as out_p1:
    out_p1.write("\n")
    for i in range(min(30, len(X_test))):
      one_hot_encoder_x=np.array(np.eye(128)[ X_test.iloc[i].values ])
      np.savetxt(out_p1, np.ravel(one_hot_encoder_x), delimiter=" ",fmt='%f')
      np.savetxt(out_p1, y_test.iloc[i].values, delimiter=" ",fmt='%f')

  # Prediction
  predicao = model.predict(np.array(X_test).reshape(np.shape(X_test)[0],64))
  predicao = predicao>=0.5

  # scikit-learn expects (y_true, y_pred); the reverse order swaps precision
  # with recall and transposes the confusion matrix.
  print(classification_report(y_test, predicao))
  print(confusion_matrix(y_test, predicao))

  # metrics quantization
  converter = tf.lite.TFLiteConverter.from_keras_model(model)
  converter.optimizations = [tf.lite.Optimize.DEFAULT]
  converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
  converter._experimental_lower_tensor_list_ops = False
  converter.target_spec.supported_types = [tf.float16]
  tflite_model = converter.convert()

  interpreter = tf.lite.Interpreter(model_content=tflite_model)
  interpreter.allocate_tensors()
  input_index = interpreter.get_input_details()[0]["index"]
  output_index = interpreter.get_output_details()[0]["index"]
  predicao_quant=[]
  # The interpreter is fed one test row at a time.
  for i in range(len(X_test)):
      input_data = np.array(X_test.iloc[i], dtype=np.float32).reshape(1,64)
      interpreter.set_tensor(input_index, input_data)
      interpreter.invoke()
      prediction = interpreter.get_tensor(output_index)
      predicao_quant.append(prediction>=0.5)

  predicao_quant = np.expand_dims(np.ravel(predicao_quant),1)
  print(classification_report(y_test, predicao_quant))
  print(confusion_matrix(y_test, predicao_quant))

  return model

def quantization(model, name_model):
  """Convert `model` to a float16 TFLite model, save it and print size stats.

  The TFLite file is written to dir_model as "<name_model>_quant.tflite".
  Its size is compared with the H5 weights file
  "model_<name_model>_dp.h5", which must already exist in dir_model.

  Args:
    model: Keras model to convert.
    name_model: identifier used to build both file names.
  """
  converter = tf.lite.TFLiteConverter.from_keras_model(model)
  converter.optimizations = [tf.lite.Optimize.DEFAULT]
  converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
  converter._experimental_lower_tensor_list_ops = False
  converter.target_spec.supported_types = [tf.float16]
  tflite_model = converter.convert()

  tflite_path = dir_model+name_model+"_quant.tflite"
  h5_path = dir_model+"model_"+name_model+"_dp.h5"

  # Context manager guarantees the file is flushed and closed before its
  # size is measured.
  with open(tflite_path, "wb") as tflite_file:
    tflite_file.write(tflite_model)

  original_bytes = os.path.getsize(h5_path)
  quantized_bytes = os.path.getsize(tflite_path)

  print("Original model in Mb:", original_bytes / float(2**20))
  print("Quantized model in Mb:", quantized_bytes / float(2**20))
  print("Compression ratio:", original_bytes/quantized_bytes)


def get_accuracy_quantization(name_model):
  """Run the saved TFLite model over X_test and return thresholded outputs.

  Loads "<name_model>_quant.tflite" from dir_model, feeds every row of X_test
  as a (1, 64) float32 input and thresholds the output at 0.5.

  Args:
    name_model: identifier used to build the TFLite file name.

  Returns:
    Boolean numpy array with a trailing axis of length 1.
  """
  tflite_interp = tf.lite.Interpreter(model_path=dir_model+name_model+"_quant.tflite")
  tflite_interp.allocate_tensors()
  in_slot = tflite_interp.get_input_details()[0]["index"]
  out_slot = tflite_interp.get_output_details()[0]["index"]

  def _predict_row(row_pos):
    # Single-sample inference for the row at position `row_pos`.
    sample = np.array(X_test.iloc[row_pos], dtype=np.float32).reshape(1,64)
    tflite_interp.set_tensor(in_slot, sample)
    tflite_interp.invoke()
    return tflite_interp.get_tensor(out_slot)>=0.5

  row_flags = [_predict_row(row_pos) for row_pos in range(len(X_test))]
  return np.expand_dims(np.ravel(row_flags),1)

def save_weigths_quantized(list_index_tensors_quantization, model, name_model):
  """Export selected tensors of the saved TFLite model as text files.

  Loads "<name_model>_quant.tflite" from dir_model. Every tensor that can be
  read is flattened and written, one value per line, to its own
  "weights_quantized<k>.csv" and appended to the combined file
  "input-P0-0_model_quantized_<name_model>_embedding_dp", which starts with a
  line holding a single space. `k` counts the tensors exported so far, so
  each tensor gets a distinct CSV file.

  Args:
    list_index_tensors_quantization: tensor indices to read from the
      interpreter, in export order.
    model: unused; kept for call compatibility.
    name_model: identifier used to build the file names.
  """
  interpreter = tf.lite.Interpreter(model_path=dir_model+name_model+"_quant.tflite")
  interpreter.allocate_tensors()

  inputP0=dir_model+"input-P0-0_model_quantized_"+name_model+"_embedding_dp"
  with open(inputP0, "w") as out_p0:
    out_p0.write(" \n")
    j=0
    for i_tensor in list_index_tensors_quantization:
      try:
        w = interpreter.get_tensor(i_tensor)
        arq=dir_model+"weights_quantized"+str(j)+".csv"
        flat_w = w.ravel()
        np.savetxt(arq, flat_w, delimiter=" ",fmt='%.5f')
        np.savetxt(out_p0, flat_w, delimiter=" ",fmt='%.5f')
        print(j, np.array(w).shape, np.ravel(w).shape)
        j+=1
      except Exception as exc:
        # Best effort: unreadable tensors are skipped, but reported.
        print('skipped tensor', i_tensor, exc)

def save_weigths(model, name_model):
  """Export the weights of a Keras model as text files in dir_model.

  Every weight tensor is flattened and written, one value per line with 8
  decimals, to its own "weights<i>.csv" and appended to the combined file
  "input-P0-0_<name_model>_embedding_dp", which starts with a line holding a
  single space.

  Args:
    model: Keras model providing get_weights().
    name_model: identifier used to build the combined file name.
  """
  inputP0=dir_model+"input-P0-0_"+name_model+"_embedding_dp"
  with open(inputP0, "w") as out_p0:
    out_p0.write(" \n")
    i=0
    for w in model.get_weights():
      try:
        arq=dir_model+"weights"+str(i)+".csv"
        flat_w = w.ravel()
        np.savetxt(arq, flat_w, delimiter=" ",fmt='%.8f')
        np.savetxt(out_p0, flat_w, delimiter=" ",fmt='%.8f')
        print(i, np.array(w).shape, np.ravel(w).shape)
        i=i+1
      except Exception as exc:
        # Best effort: report the tensor that failed and keep exporting.
        print('erro',w.shape, exc)


In [None]:
# DP-SGD
def runTraining_dpsgd(epsilon, model, name_model):
  """Compile `model` with a DP Adam optimizer for `epsilon` and train it.

  The noise multiplier is obtained from compute_noise for the target epsilon
  using the notebook-level n, batch_size, epochs, delta and noise_lbd. The
  loss is binary cross-entropy with reduction NONE, and training itself is
  delegated to train_binary.

  Args:
    epsilon: target privacy budget.
    model: uncompiled (or to be recompiled) Keras model.
    name_model: identifier forwarded to train_binary.

  Returns:
    Whatever train_binary returns for the compiled model.
  """
  sigma = compute_noise(n=n, batch_size=batch_size,target_epsilon=epsilon, epochs=epochs, delta=delta, noise_lbd=noise_lbd)

  per_example_bce = tf.keras.losses.BinaryCrossentropy(axis=-1, reduction=tf.losses.Reduction.NONE, name='binary_crossentropy')
  dp_adam = tp.DPKerasAdamOptimizer(
      l2_norm_clip=l2_norm_clip,
      noise_multiplier=sigma,
      num_microbatches=num_microbatches,
      learning_rate=learning_rate,
      gradient_accumulation_steps=4,
  )

  model.compile(loss=per_example_bce,optimizer=dp_adam, metrics=['accuracy'])
  tf.keras.utils.enable_interactive_logging()

  return train_binary(epsilon, model, name_model)

In [None]:
# cnn1d_binary
def _build_cnn1d_binary():
  """Return a new, untrained Embedding -> Conv1D -> Dense binary classifier."""
  net = tf.keras.models.Sequential()
  net.add(tf.keras.layers.Embedding(input_dim=128, output_dim=128, input_length=64))
  net.add(tf.keras.layers.Conv1D(filters=32, kernel_size=2, activation = 'relu'))
  net.add(tf.keras.layers.Dropout(0.1))
  net.add(tf.keras.layers.Flatten())
  net.add(tf.keras.layers.Dense(units=100, activation='relu'))
  net.add(tf.keras.layers.Dropout(0.1))
  net.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  return net

name_model="cnn1d_binary"
epsilons = [0.1, 2, 5]

for epsilon in epsilons:
  # A fresh model per budget: reusing one instance would start each run from
  # weights already trained under the previous epsilons.
  model = _build_cnn1d_binary()
  model_trained = runTraining_dpsgd(epsilon, model, name_model)

In [None]:
# mlp_binary
def _build_mlp_binary():
  """Return a new, untrained Embedding -> Dense binary classifier."""
  net = tf.keras.models.Sequential()
  net.add(tf.keras.layers.Embedding(input_dim=128, output_dim=128, input_length=64))
  net.add(tf.keras.layers.Dropout(0.1))
  net.add(tf.keras.layers.Flatten())
  net.add(tf.keras.layers.Dense(units=100, activation='relu'))
  net.add(tf.keras.layers.Dropout(0.1))
  net.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
  return net

name_model="mlp_binary"
epsilons = [0.1, 2, 5]
for epsilon in epsilons:
  # A fresh model per budget: reusing one instance would start each run from
  # weights already trained under the previous epsilons.
  model = runTraining_dpsgd(epsilon, _build_mlp_binary(), name_model)

In [None]:
# lstm_binary
def _build_lstm_binary():
  """Return a new, untrained Embedding -> stacked LSTM -> Dense binary classifier."""
  net = tf.keras.models.Sequential()
  net.add(tf.keras.layers.Embedding(input_dim=128, output_dim=128, input_length=64))
  net.add(tf.keras.layers.LSTM(units=32, return_sequences=True))
  net.add(tf.keras.layers.LSTM(units=32))
  net.add(tf.keras.layers.Flatten())
  net.add(tf.keras.layers.Dense(100, activation='relu'))
  net.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  return net

name_model="lstm_binary"
epsilons = [0.1, 2, 5]
for epsilon in epsilons:
  # A fresh model per budget: reusing one instance would start each run from
  # weights already trained under the previous epsilons.
  model = runTraining_dpsgd(epsilon, _build_lstm_binary(), name_model)