In [59]:
import tensorflow.keras as keras
import numpy as np
from sklearn.utils import resample
import pandas as pd
import pickle
import os
import csv
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from statistics import mean
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from keras.regularizers import l2
MODEL_PATH = './model/'
DATA_PATH = './data/'

In [60]:
# target_dataset = pd.DataFrame(data.iloc[1:20001,].values)
# shadow_dataset = pd.DataFrame(data.iloc[15001:50001,].values)
# attack_test_nonmembers = pd.DataFrame(data.iloc[75001:85001,].values)
# attack_test_members = pd.DataFrame(data.iloc[5001:15001,].values)

In [61]:
def read_data(data_name):
    """Load the four arrays stored in an ``.npz`` archive under ``DATA_PATH``.

    The arrays are looked up by their positional keys ``arr_0``, ``arr_1``, ...
    (the names ``np.savez`` assigns to positional arguments), and the archive
    must contain exactly four of them.

    Args:
        data_name: File name of the archive, relative to ``DATA_PATH``.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``.
    """
    archive_path = DATA_PATH + data_name
    with np.load(archive_path) as archive:
        arrays = []
        for position in range(len(archive.files)):
            arrays.append(archive['arr_%d' % position])
    train_x, train_y, test_x, test_y = arrays
    return train_x, train_y, test_x, test_y

In [63]:
def transform_puchase_data(dataset, num_classes=100):
    """Split a purchase table into a feature matrix and one-hot labels.

    The last column of ``dataset`` is taken as the integer class label and all
    preceding columns as features.

    Note:
        Rows containing NaN are dropped from ``dataset`` *in place*. This side
        effect is kept deliberately: ``load_shadow_data`` sizes its index pool
        from the caller's frame after this call.

    Args:
        dataset: ``pandas.DataFrame`` whose last column holds the class label.
        num_classes: Width of the one-hot encoding. Passing it explicitly keeps
            the label matrix aligned with the network's output layer even when
            the highest class ids are absent from ``dataset``.

    Returns:
        Tuple ``(trainX, trainY, dim)``: the feature array, the one-hot label
        array of width ``num_classes``, and the number of feature columns.
    """
    df_tot = dataset
    df_tot.dropna(inplace=True)

    trainX = np.array(df_tot.iloc[:, 0:dataset.shape[1] - 1])
    labels = np.array(df_tot.iloc[:, -1])

    dim = trainX.shape[1]

    # Without num_classes, to_categorical infers the width from the largest
    # label present, which silently yields fewer columns than the model expects
    # whenever the top class ids are missing from this particular sample.
    trainY = to_categorical(labels, num_classes=num_classes)

    return trainX, trainY, dim

In [64]:
def load_target_data(dataset, train_size, test_ratio, is_synthetic):
    """Cut the target model's train and test splits out of ``dataset``.

    The first ``train_size`` rows become the training split; every row from
    position 14000 onward becomes the test split. ``test_ratio`` and
    ``is_synthetic`` are accepted but not used here.

    NOTE(review): a later cell in this notebook redefines this function with a
    different test offset; whichever cell ran last wins.

    Returns:
        ``((trainX, trainY), (testX, testY), dim)``.
    """
    features, one_hot, dim = transform_puchase_data(dataset)

    test_start = 14000  # fixed offset of the held-out rows
    train_split = (features[:train_size], one_hot[:train_size])
    test_split = (features[test_start:], one_hot[test_start:])
    return train_split, test_split, dim

In [65]:
def build_purchase_dnn(n_class,dim):
    """Build and compile the tanh MLP used for target and shadow models.

    Architecture: Dense(600) -> Dense(512) -> Dense(256) -> Dense(128), each
    followed by a tanh activation, then a softmax layer of ``n_class`` units.
    The three narrower hidden layers carry an L2 kernel penalty of 3e-5.

    Args:
        n_class: Number of output classes.
        dim: Number of input features.

    Returns:
        ``(model, act_layer)`` where ``act_layer`` is the constant 6.
    """
    weight_decay = 0.00003

    stack = [Dense(600, input_dim=dim), Activation("tanh")]
    for width in (512, 256, 128):
        stack.append(Dense(width, kernel_regularizer=l2(weight_decay)))
        stack.append(Activation("tanh"))
    stack.append(Dense(n_class, activation='softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    act_layer = 6
    return model, act_layer

In [66]:
def build_simple_mlp(n_class,pix,d):
    """Build and compile a small ReLU MLP classifier.

    Architecture: Dense(256) -> Dense(256, L2=0.01) -> Dense(64), each followed
    by ReLU, a Dropout(0.01) after the last hidden activation, then a softmax
    layer of ``n_class`` units. ``d`` is accepted but not used.

    Args:
        n_class: Number of output classes.
        pix: Number of input features.
        d: Unused.

    Returns:
        ``(model, act_layer)`` where ``act_layer`` is the constant 3.
    """
    stack = (
        Dense(256, input_dim=pix),
        Activation("relu"),
        Dense(256, kernel_regularizer=l2(0.01)),
        Activation("relu"),
        Dense(64),
        Activation("relu"),
        Dropout(0.01),
        Dense(n_class, activation='softmax'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    act_layer = 3
    return model, act_layer

In [67]:
def load_shadow_data(dataset, n_shadow, shadow_size, test_ratio, is_synthetic):
    """Sample and persist one train/test split per shadow model.

    For each of the ``n_shadow`` models, ``shadow_size`` distinct rows are drawn
    at random, split with ``train_test_split(test_size=test_ratio)`` and saved
    to ``DATA_PATH + 'shadow_purchase{i}_data.npz'`` in the positional order
    ``trainX, trainY, testX, testY`` (the order ``read_data`` unpacks).
    ``is_synthetic`` is accepted but not used.

    Args:
        dataset: Frame of shadow records; last column is the class label.
        n_shadow: Number of shadow splits to create.
        shadow_size: Rows drawn (without replacement) for each split.
        test_ratio: Fraction of each draw reserved as the shadow test set.
        is_synthetic: Unused.
    """
    x, y, _ = transform_puchase_data(dataset)

    # Size the pool from the transformed array itself, so valid indices never
    # depend on whether the helper also shrank the caller's frame.
    shadow_indices = np.arange(len(x))

    for i in range(n_shadow):
        shadow_i_indices = np.random.choice(shadow_indices, shadow_size, replace=False)
        shadow_i_x, shadow_i_y = x[shadow_i_indices], y[shadow_i_indices]
        trainX, testX, trainY, testY = train_test_split(shadow_i_x, shadow_i_y, test_size=test_ratio)
        print('shadow_i_trainX = ', len(trainX), 'shadow_i_trainY = ', len(trainY), 'shadow_i_testX = ', len(testX), 'shadow_i_testY = ', len(testY))

        np.savez(DATA_PATH + 'shadow_purchase{}_data.npz'.format(i), trainX, trainY, testX, testY)

In [68]:
def train_shadow_models(n_shadow, n_class, dim, channel):
    """Train the shadow models and collect their outputs as attack training data.

    For each shadow index ``j`` the split saved as ``shadow_purchase{j}_data.npz``
    is loaded, a fresh ``build_purchase_dnn`` model is fitted for ``EPS`` epochs
    (module-level global), and its prediction vectors on the shadow train split
    (members, flag 1) and shadow test split (non-members, flag 0) are recorded
    together with the true class of every record. ``channel`` is not used.

    Returns:
        A 9-tuple:
        ``shadow_train_performance`` -- (stacked train predictions, true classes)
        ``shadow_test_performance``  -- (stacked test predictions, true classes)
        ``attack_dataset``           -- ((train_pred, test_pred),
                                         (member_flags, nonmember_flags),
                                         (train_classes, test_classes))
        ``x_shadow_train, y_shadow_train, x_shadow_test, y_shadow_test``
                                     -- data of the LAST shadow model trained
        ``model``                    -- the LAST shadow model trained
        ``shadow_accuracy``          -- (train accuracies, test accuracies) in percent
    """
    train_pred_chunks, train_class_chunks, member_chunks = [], [], []
    test_pred_chunks, test_class_chunks, nonmember_chunks = [], [], []
    train_accuracy, test_accuracy = [], []

    for j in range(n_shadow):
        print("Shadow Model ", j)
        print('Training shadow model {}'.format(j))

        x_shadow_train, y_shadow_train, x_shadow_test, y_shadow_test = read_data(
            'shadow_purchase{}_data.npz'.format(j))

        model, act_layer = build_purchase_dnn(n_class, dim)

        # Fit on the shadow train split, monitoring the shadow test split.
        history = model.fit(x_shadow_train, y_shadow_train, epochs=EPS, batch_size=32,
                            validation_data=(x_shadow_test, y_shadow_test), verbose=0)

        # Accuracy on both splits, stored in percent.
        _, train_acc = model.evaluate(x_shadow_train, y_shadow_train, verbose=0)
        _, test_acc = model.evaluate(x_shadow_test, y_shadow_test, verbose=0)
        print("Shadow Train acc : ", (train_acc * 100.0),"Shadow Test acc : ", (test_acc * 100.0))
        train_accuracy.append(train_acc * 100.0)
        test_accuracy.append(test_acc * 100.0)

        # Members: records the shadow model was trained on.
        sm_train_pred = model.predict(x_shadow_train, batch_size=32)
        train_pred_chunks.append(sm_train_pred)
        train_class_chunks.append(np.argmax(y_shadow_train, axis=1))
        member_chunks.append(np.ones(len(sm_train_pred)))

        # Non-members: records held out from the shadow model.
        sm_test_pred = model.predict(x_shadow_test, batch_size=32)
        test_pred_chunks.append(sm_test_pred)
        test_class_chunks.append(np.argmax(y_shadow_test, axis=1))
        nonmember_chunks.append(np.zeros(len(sm_test_pred)))

    full_sm_train_pred = np.vstack(train_pred_chunks)
    full_sm_test_pred = np.vstack(test_pred_chunks)
    full_sm_train_class = np.concatenate(train_class_chunks)
    full_sm_test_class = np.concatenate(test_class_chunks)
    members = np.concatenate(member_chunks).astype('int32')
    nonmembers = np.concatenate(nonmember_chunks).astype('int32')

    shadow_train_performance = (full_sm_train_pred, np.array(full_sm_train_class))
    shadow_test_performance = (full_sm_test_pred, np.array(full_sm_test_class))

    # Attack data keeps members and non-members as separate halves of each pair.
    attack_dataset = (
        (full_sm_train_pred, full_sm_test_pred),
        (members, nonmembers),
        (np.array(full_sm_train_class), np.array(full_sm_test_class)),
    )
    shadow_accuracy = (train_accuracy, test_accuracy)

    return  shadow_train_performance, shadow_test_performance, attack_dataset, x_shadow_train, y_shadow_train, x_shadow_test, y_shadow_test, model, shadow_accuracy

In [69]:
def define_attack_model(n_class):
    """Build and compile a minimal attack network trained with SGD.

    Two single-unit Dense layers, each followed by ReLU, feed a softmax layer
    of ``n_class`` units. The model is compiled with categorical cross-entropy
    and SGD (learning rate 1e-4, momentum 0.9).

    Args:
        n_class: Number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    for _ in range(2):
        model.add(Dense(1))
        model.add(Activation("relu"))

    model.add(Dense(n_class, activation='softmax'))

    model.compile(
        optimizer=SGD(learning_rate=0.0001, momentum=0.9),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [70]:
def attack_mlp(pix,d):
    """Build and compile the binary member / non-member attack classifier.

    One hidden Dense(64) layer with ReLU feeds a 2-unit softmax. The model is
    compiled with sparse categorical cross-entropy, so targets are integer
    labels rather than one-hot rows. ``d`` is accepted but not used.

    Args:
        pix: Number of input features.
        d: Unused.

    Returns:
        ``(model, act_layer)`` where ``act_layer`` is the constant 1.
    """
    stack = (
        Dense(64, input_dim=pix),
        Activation("relu"),
        Dense(2, activation='softmax'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    act_layer = 1
    return model, act_layer

In [71]:
def prep_attack_train_data(n_attack_data):
    """Flatten the shadow-model attack tuple into a single training frame.

    ``n_attack_data`` is indexed as ``[kind][half]``: ``kind`` 0 holds the
    prediction vectors, 1 the membership flags and 2 the true classes; ``half``
    0 is the member part and 1 the non-member part.

    Returns:
        A frame with members stacked above non-members (index reset). Its
        columns are the prediction columns, then ``"y"`` (true class), then
        ``"membership"``.
    """
    def _stack_halves(pair, column=None):
        # Member half first, then non-member half, renumbered from zero.
        halves = [pd.DataFrame(pair[0]), pd.DataFrame(pair[1])]
        if column is not None:
            for half in halves:
                half.columns = [column]
        return pd.concat(halves, axis=0).reset_index(drop=True)

    predictions = _stack_halves(n_attack_data[0])
    membership = _stack_halves(n_attack_data[1], "membership")
    true_class = _stack_halves(n_attack_data[2], "y")

    return pd.concat([predictions, true_class, membership], axis=1)

In [72]:
def prep_validation_data(attack_test_data):
    """Build the member and non-member validation frames for the attack.

    ``attack_test_data`` is indexed as ``[kind][half]``: ``kind`` 0 holds the
    target model's prediction vectors and 2 the true classes; ``half`` 0 is the
    member part and 1 the non-member part. The membership flags (``kind`` 1)
    are not needed here because the two halves are returned separately.

    Args:
        attack_test_data: Tuple as produced by ``prepare_attack_test_data``.

    Returns:
        ``(mem_df, nmem_df)``: each frame holds the prediction columns followed
        by a ``"y"`` column with the true class.
    """
    def _with_class(predictions, true_class):
        # Append the true class as a trailing "y" column.
        class_frame = pd.DataFrame(true_class)
        class_frame.columns = ["y"]
        return pd.concat([pd.DataFrame(predictions), class_frame], axis=1)

    mem_df = _with_class(attack_test_data[0][0], attack_test_data[2][0])
    nmem_df = _with_class(attack_test_data[0][1], attack_test_data[2][1])

    return mem_df, nmem_df

In [73]:
def load_attack_test_data(members, nonmembers, is_synthetic):
    """Convert the member and non-member frames into feature/label arrays.

    Both frames go through ``transform_puchase_data``; the feature dimension it
    also returns is discarded. ``is_synthetic`` is accepted but not used.

    Returns:
        ``(memberX, memberY, nonmemberX, nonmemberY)``.
    """
    member_features, member_onehot = transform_puchase_data(members)[:2]
    nonmember_features, nonmember_onehot = transform_puchase_data(nonmembers)[:2]
    return member_features, member_onehot, nonmember_features, nonmember_onehot

In [74]:
def prety_print_result(mem, pred):
    """Print and return the attack's confusion counts and derived metrics.

    Every ratio whose denominator is zero is reported as 0.0, so the returned
    values agree with the "PPV: 0 / Advantage: 0" message printed when the
    attack predicts no members at all.

    Args:
        mem: True membership labels (1 = member, 0 = non-member).
        pred: Predicted membership labels, same length as ``mem``.

    Returns:
        ``(tp, fp, fn, tn, precision, advantage, accuracy, recall)`` where
        ``advantage`` is the true-positive rate minus the false-positive rate.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an empty denominator means "no evidence", report 0.
        return numerator / denominator if denominator else 0.0

    # labels=[0, 1] pins the matrix to 2x2 even if only one label occurs in
    # mem/pred; otherwise ravel() would not yield four values to unpack.
    tn, fp, fn, tp = confusion_matrix(mem, pred, labels=[0, 1]).ravel()
    print('TP: %d     FP: %d     FN: %d     TN: %d' % (tp, fp, fn, tn))

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    advantage = recall - _ratio(fp, tn + fp)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)

    if tp == fp == 0:
        print('PPV: 0\nAdvantage: 0')
    else:
        print('PPV: %.4f\nAdvantage: %.4f' % (precision, advantage))

    return tp, fp, fn, tn, precision, advantage, accuracy, recall

In [75]:
def train_attack_model(attack_data, check_membership, n_hidden=50, learning_rate=0.01, batch_size=200, epochs=50, model='nn', l2_ratio=1e-7):
    """Train one attack classifier per class and score it on the target records.

    For every class seen among the shadow members, an ``attack_mlp`` is fitted
    on the shadow prediction vectors of that class (members and non-members
    together) and then applied to the target-model records of the same class.

    NOTE(review): ``n_hidden``, ``learning_rate``, ``batch_size``, ``epochs``,
    ``model`` and ``l2_ratio`` are not read by the body; training uses the
    module-level ``EPS`` and a fixed batch size of 32. Left as is so results
    do not change for existing callers -- confirm whether that is intended.

    Args:
        attack_data: ``(x, y, classes)``; each is a pair whose first element is
            the member half and second the non-member half.
        check_membership: ``(prediction_vals, membership_status, class_status)``
            for the records to attack, each a sequence of array chunks.

    Returns:
        ``(tp, fp, fn, tn, precision, advantage, accuracy, recall)`` as returned
        by ``prety_print_result``.
    """
    x, y, classes = attack_data
    train_x, test_x = x[0], x[1]
    train_y, test_y = y[0], y[1]
    train_classes, test_classes = classes[0], classes[1]

    checkmem_prediction_vals, checkmem_membership_status, checkmem_class_status = check_membership
    checkmem_prediction_vals = np.vstack(checkmem_prediction_vals)
    checkmem_membership_status = np.concatenate(checkmem_membership_status)
    checkmem_class_status = np.concatenate(checkmem_class_status)

    unique_classes = np.unique(train_classes)

    predicted_membership, target_membership = [], []
    for c in unique_classes:
        print("Class : ", c)

        # Skip classes with no records to attack: there is nothing to score,
        # and predicting on an empty batch is what shokri_attack also avoids.
        c_indices = np.where(checkmem_class_status == c)
        if len(c_indices[0]) == 0:
            continue

        c_train_mask = train_classes == c
        c_test_mask = test_classes == c

        # Shadow members and non-members of this class form one training set.
        full_cx_data = np.vstack((train_x[c_train_mask], test_x[c_test_mask]))
        full_cy_data = np.concatenate((train_y[c_train_mask], test_y[c_test_mask]))

        classifier, _ = attack_mlp(full_cx_data.shape[1], 1)
        classifier.fit(full_cx_data, full_cy_data, epochs=EPS, batch_size=32, verbose=0)

        # Score the target-model records that belong to this class.
        pred_y = classifier.predict(checkmem_prediction_vals[c_indices])
        predicted_membership.append(np.argmax(pred_y, axis=1))
        target_membership.append(checkmem_membership_status[c_indices])

    # np.concatenate rejects an empty list, so fall back to an empty array.
    target_membership = np.concatenate(target_membership) if target_membership else np.array([])
    predicted_membership = np.concatenate(predicted_membership) if predicted_membership else np.array([])

    tp, fp, fn, tn, precision, advj, acc, recall = prety_print_result(target_membership, predicted_membership)
    return tp, fp, fn, tn, precision, advj, acc, recall

In [76]:
def shokri_attack(attack_df, mem_validation, nmem_validation):
    """Run the per-class membership attack against the validation records.

    For each class in ``attack_df['y']`` an ``attack_mlp`` is trained on the
    shadow rows of that class and applied to the member and non-member
    validation rows of the same class. The attack features are every column of
    ``attack_df`` except the last (``membership``), so they include ``y``; the
    validation frames carry the same columns. A class is skipped entirely
    unless BOTH validation frames contain at least one row for it.

    Training uses the module-level ``EPS`` epochs and a batch size of 32.

    Args:
        attack_df: Frame from ``prep_attack_train_data``.
        mem_validation: Member frame from ``prep_validation_data``.
        nmem_validation: Non-member frame from ``prep_validation_data``.

    Returns:
        ``(pred_membership, ori_membership, TP_idx_list, TN_idx_list)``:
        predicted labels (members first, then non-members), the matching true
        labels, and the validation-frame index values of correctly identified
        members and non-members.
    """
    def _flatten(chunks):
        # np.concatenate rejects an empty list; mirror np.array([]) instead.
        return np.concatenate(chunks) if chunks else np.array([])

    predicted_membership, predicted_nmembership, TP_idx, TN_idx = [], [], [], []

    class_val = np.unique(attack_df['y'])
    ncval = attack_df.shape[1] - 1

    for c_val in class_val:

        print(c_val)

        check_mem_feat = mem_validation[mem_validation['y'] == c_val]
        check_nmem_feat = nmem_validation[nmem_validation['y'] == c_val]

        # Decide before training: a class that cannot be scored on both sides
        # is dropped, so fitting a model for it would be wasted work.
        if len(check_mem_feat) == 0 or len(check_nmem_feat) == 0:
            continue

        filter_rec_all = attack_df[attack_df['y'] == c_val]
        attack_feat = filter_rec_all.iloc[:, 0:ncval]
        attack_class = filter_rec_all['membership']

        attack_model, _ = attack_mlp(attack_feat.shape[1], 1)
        attack_model.fit(attack_feat, attack_class, epochs=EPS, batch_size=32, verbose=0)

        mem_labels = np.argmax(attack_model.predict(np.array(check_mem_feat)), axis=1)
        predicted_membership.append(mem_labels)

        nmem_labels = np.argmax(attack_model.predict(np.array(check_nmem_feat)), axis=1)
        predicted_nmembership.append(nmem_labels)

        # Members predicted 1 are true positives; non-members predicted 0 are
        # true negatives. Both are reported by their validation-frame index.
        TP_idx.append(np.array(check_mem_feat.index)[np.where(mem_labels == 1)[0]])
        TN_idx.append(np.array(check_nmem_feat.index)[np.where(nmem_labels == 0)[0]])

    members = _flatten(predicted_membership)
    nonmembers = _flatten(predicted_nmembership)

    TP_idx_list = _flatten(TP_idx)
    TN_idx_list = _flatten(TN_idx)

    pred_membership = np.concatenate([members, nonmembers])
    ori_membership = np.concatenate([np.ones(len(members)), np.zeros(len(nonmembers))])

    return pred_membership, ori_membership, TP_idx_list, TN_idx_list

In [77]:
def train_target_model(target_dataset, per_class_sample, epoch, act_layer, n_class, is_synthetic, train_size, channel=0, verbose=0, test_ratio=0.3):
    """Train the target model and report its train/test accuracy.

    The dataset is split by ``load_target_data`` and a fresh
    ``build_purchase_dnn`` network is fitted on the training split.
    ``per_class_sample``, ``act_layer`` and ``channel`` are accepted but not
    used by this function.

    Args:
        target_dataset: Frame of target records; last column is the class label.
        epoch: Number of training epochs.
        n_class: Number of output classes.
        is_synthetic: Forwarded to ``load_target_data``.
        train_size: Number of leading rows used for training.
        verbose: Keras verbosity for ``fit`` and ``evaluate``.
        test_ratio: Forwarded to ``load_target_data``.

    Returns:
        ``(target_model, dim)``: the trained model and the feature dimension.
    """
    (target_trainX, target_trainY), (target_testX, target_testY), dim = load_target_data(
        target_dataset, train_size, test_ratio, is_synthetic)
    target_model, _ = build_purchase_dnn(n_class, dim)

    # Honour the epoch/verbose arguments rather than notebook-level globals, so
    # the function does not depend on which configuration cell ran last.
    target_model.fit(target_trainX, target_trainY, epochs=epoch, batch_size=32, verbose=verbose)

    _, train_acc = target_model.evaluate(target_trainX, target_trainY, verbose=verbose)
    _, test_acc = target_model.evaluate(target_testX, target_testY, verbose=verbose)
    print('\n', "Target Train acc : ", (train_acc * 100.0),"Target Test acc : ", (test_acc * 100.0))
    return target_model, dim

In [78]:
def prepare_attack_test_data(attack_test_members, attack_test_nonmembers, target_model, is_synthetic):
    """Query the target model on known members and non-members.

    Both frames are converted with ``load_attack_test_data`` and passed through
    ``target_model.predict``. Each record yields its prediction vector, its true
    class (argmax of the one-hot label) and a membership flag (1 for members,
    0 for non-members). The three result tuples are also printed.

    Returns:
        ``(pred_vals, mem_status, class_status)``; each is a pair whose first
        element covers the members and second the non-members.
    """
    memberX, memberY, nonmemberX, nonmemberY = load_attack_test_data(
        attack_test_members, attack_test_nonmembers, is_synthetic)

    def _query(features, one_hot, make_flags):
        # Prediction vectors, true classes and int32 membership flags.
        preds = np.vstack(target_model.predict(features, batch_size=32))
        true_class = np.array(np.argmax(one_hot, axis=1))
        flags = make_flags(len(preds)).astype('int32')
        return preds, true_class, flags

    member_pred, member_class, member_flags = _query(memberX, memberY, np.ones)
    nonmember_pred, nonmember_class, nonmember_flags = _query(nonmemberX, nonmemberY, np.zeros)

    full_attack_test_pred_val = (member_pred, nonmember_pred)
    full_attack_test_mem_status = (member_flags, nonmember_flags)
    full_attack_test_class_status = (member_class, nonmember_class)

    print('\n pred', full_attack_test_pred_val)
    print('\n class', full_attack_test_class_status)
    print('\n mem status', full_attack_test_mem_status)

    return (full_attack_test_pred_val, full_attack_test_mem_status, full_attack_test_class_status)

In [79]:
# --------------------------------------------Original Data--------------------------------------------------------------#

In [33]:
# Rows used to train the target model, and rows in each attack evaluation set.
train_size = 14000
attack_test_size = 10000

# Rebuild the Purchase-100 table: feature columns plus the integer class label
# (argmax of the stored one-hot labels) in a trailing column named '600'.
pur_data = np.load(DATA_PATH+'purchase100.npz')
features = pur_data['features']
labels = np.argmax(pur_data['labels'], axis=1)
data = pd.DataFrame(features[:,:])
data['600'] = labels

# Carve out disjoint pools: target records first, then shadow records from what
# is left, then attack non-members from what remains after both.
target_dataset = data.sample(n = 20000, replace = False)
outside_target = ~data.index.isin(target_dataset.index)
df_rest = data.loc[outside_target]

shadow_dataset = df_rest.sample(n = 35000, replace = False)
outside_shadow = ~df_rest.index.isin(shadow_dataset.index)
df_rest = df_rest.loc[outside_shadow]

attack_test_nonmembers = df_rest.sample(n = attack_test_size, replace = False)
# Attack members are drawn from the leading rows the target model trains on.
attack_test_members =  target_dataset.iloc[:train_size,:].sample(n = attack_test_size, replace = False)

In [34]:
# Display the sampled target frame (feature columns plus the trailing class-label column).
target_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,591,592,593,594,595,596,597,598,599,600
191433,0,0,1,1,0,0,1,0,1,1,...,1,0,0,0,0,1,1,1,0,66
77411,0,1,0,1,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,48
290,0,0,0,1,0,0,1,0,1,1,...,0,1,0,1,0,0,0,0,0,18
106275,0,0,0,1,0,0,1,1,0,1,...,0,0,0,1,0,1,0,0,0,8
26951,0,1,0,1,1,1,1,0,0,1,...,0,1,0,1,0,0,0,1,0,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83286,0,0,0,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,3
164214,0,1,1,1,1,1,1,1,0,1,...,0,1,0,0,1,0,0,0,0,49
47867,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,95
38589,0,1,0,1,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,58


In [25]:
# Train the target model on the original data.
# EPS and VERBOSE are read as globals inside the training functions, so this
# cell must run before any of them is called.
per_class_sample=150  # passed to train_target_model, which does not use it
channel=0   
EPS=50  # training epochs for target, shadow and attack models
act_layer=6  # passed to train_target_model, which does not use it
n_class = 100
is_synthetic = False
VERBOSE = 0
test_ratio = 0.3  # shadow test fraction used by load_shadow_data

target_model, dim = train_target_model(target_dataset, per_class_sample, EPS, act_layer, n_class, is_synthetic, train_size)


 Target Train acc :  98.64285588264465 Target Test acc :  73.65000247955322


In [85]:
# Train the shadow models.
n_shadow_models = 20
shadow_data_size = 750  # rows drawn per shadow model, before the train/test split

# Writes one shadow_purchase{i}_data.npz per shadow model under DATA_PATH ...
load_shadow_data(shadow_dataset, n_shadow_models, shadow_data_size, test_ratio, is_synthetic)
# ... which train_shadow_models then reads back. The x/y arrays and the model
# returned here belong to the last shadow model trained.
n_shadow_train_performance, n_shadow_test_performance, n_attack_data, x_shadow_train, y_shadow_train, x_shadow_test, y_shadow_test, shadow_model_init, shadow_accuracy = train_shadow_models(n_shadow_models, n_class, dim, channel)

shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_trainX =  525 shadow_i_trainY =  525 shadow_i_testX =  225 shadow_i_testY =  225
shadow_i_t

In [86]:
# Train the per-class attack models and evaluate them against the target model.
# Target-model outputs on known members / non-members (also prints the raw arrays).
attack_test_data = prepare_attack_test_data(attack_test_members, attack_test_nonmembers, target_model, is_synthetic)
mem_validation, nmem_validation = prep_validation_data(attack_test_data)
# Shadow-model outputs, flattened into one training frame.
attack_train_df = prep_attack_train_data(n_attack_data)
pred_membership, ori_membership, TP_idx_list, TN_idx_list = shokri_attack(attack_train_df, mem_validation, nmem_validation)
# Argument order is (true labels, predicted labels).
tp, fp, fn, tn, precision, advj, acc, recall = prety_print_result (ori_membership,pred_membership)
print('Accuracy: ', acc, 'Precision: ', precision)


 pred (array([[4.3964255e-06, 1.6211403e-05, 9.4731413e-06, ..., 3.5881261e-05,
        3.5171019e-05, 1.3419745e-05],
       [6.2163912e-05, 5.4614480e-05, 3.7005495e-05, ..., 4.7637277e-05,
        5.6933015e-05, 3.7106263e-06],
       [5.3402640e-05, 2.9141104e-04, 5.2738133e-06, ..., 4.3856632e-04,
        1.6920932e-04, 7.9547543e-05],
       ...,
       [6.1987585e-04, 2.7877140e-05, 1.6566580e-04, ..., 5.0968497e-06,
        1.0208837e-06, 5.3073887e-05],
       [1.5488193e-05, 8.9001533e-06, 1.0927839e-05, ..., 7.0551664e-06,
        1.3671079e-04, 7.0258146e-05],
       [1.0156856e-04, 1.6025044e-05, 2.2576368e-04, ..., 2.3320847e-06,
        1.9406822e-05, 5.6473091e-06]], dtype=float32), array([[2.3608627e-04, 1.0805344e-04, 9.7918615e-04, ..., 1.8779080e-02,
        1.9779841e-03, 1.7132605e-03],
       [2.9557839e-04, 6.4444978e-04, 6.8452448e-04, ..., 6.2287464e-03,
        2.2831818e-03, 6.2352797e-04],
       [1.5649254e-03, 1.0181955e-02, 2.5876463e-04, ..., 3.8854103

In [None]:
# --------------------------------------------Original Data--------------------------------------------------------------#

In [None]:
# --------------------------------------------Synthetic Data------------------------------------------------------------#

In [80]:
# Synthetic dataset: the target model is trained on CTGAN-generated records,
# while shadow and attack-evaluation records come from the original table.
train_size = 14000
attack_test_size = 5000
target_dataset = pd.read_csv('data/purchase100_sds_sdv_ctgan.csv', na_values=["?"], header=None)


# Rebuild the original Purchase-100 table with the integer class label
# (argmax of the stored one-hot labels) in a trailing column named '600'.
pur_data = np.load(DATA_PATH+'purchase100.npz')
features = pur_data['features']
labels = pur_data['labels']
data = pd.DataFrame(features[:,:])
labels = np.argmax(labels, axis=1)
data['600'] = labels

shadow_dataset = data.iloc[25000:79000,].sample(n = 35000, replace = False)
# Both evaluation sets are sized by attack_test_size, so changing the constant
# above actually takes effect (the counts were previously repeated literals).
attack_test_nonmembers = data.iloc[80000:,].sample(n=attack_test_size, replace=False)
attack_test_members = data.iloc[0:train_size,].sample(n=attack_test_size, replace=False)

In [106]:
# Replace the target dataset with 20000 rows sampled from the SDV-generated CSV.
# Only train_size, attack_test_size and target_dataset are reassigned here; any
# shadow / attack-evaluation frames keep whatever an earlier cell assigned.
train_size = 14000
attack_test_size = 10000
target_dataset = pd.read_csv('data/purchase100_sds_sdv.csv', header=None)
target_dataset = target_dataset.sample(n=20000, replace=False)

# Disabled: the original-data split, kept for reference.
#target_dataset = data.sample(n = 20000, replace = False)
# df_rest = data.loc[~data.index.isin(target_dataset.index)]
# shadow_dataset = df_rest.sample(n = 35000, replace = False)
# df_rest = df_rest.loc[~df_rest.index.isin(shadow_dataset.index)]
# # attack_test_nonmembers = pd.DataFrame(data.iloc[75001:85001,].values)
# # attack_test_members = pd.DataFrame(data.iloc[5001:15001,].values)
# attack_test_nonmembers = df_rest.sample(n = attack_test_size, replace = False)
# attack_test_members =  target_dataset.iloc[:train_size,:].sample(n = attack_test_size, replace = False)

In [107]:
# Train the target model on the synthetic target_dataset loaded above.
# EPS and VERBOSE are read as globals inside the training functions.
per_class_sample=150  # passed to train_target_model, which does not use it
channel=0   
EPS=50
act_layer=6  # passed to train_target_model, which does not use it
n_class = 100
# NOTE(review): still False although this section uses synthetic data; none of
# the functions visible in this notebook read the flag -- confirm intent.
is_synthetic = False
VERBOSE = 0
test_ratio = 0.3

target_model, dim = train_target_model(target_dataset, per_class_sample, EPS, act_layer, n_class, is_synthetic, train_size)


 Target Train acc :  14.478571712970734 Target Test acc :  1.9999999552965164


In [None]:
# --------------------------------------------Synthetic Data------------------------------------------------------------#

In [None]:
# --------------------------------------------Overfitting Experiment-----------------------------------------------------#

In [74]:
def load_target_data(dataset, training_data_size, test_ratio, is_synthetic):
    """Cut train and test splits for the overfitting experiment.

    The first ``training_data_size`` rows become the training split; every row
    from position 15000 onward becomes the test split. ``test_ratio`` and
    ``is_synthetic`` are accepted but not used here.

    NOTE(review): this redefines ``load_target_data`` from earlier in the
    notebook (which starts its test split at row 14000). Once this cell has
    run, every caller -- including ``train_target_model`` -- uses this version.

    Returns:
        ``((trainX, trainY), (testX, testY), dim)``.
    """
    features, one_hot, dim = transform_puchase_data(dataset)

    test_start = 15000  # fixed offset of the held-out rows
    train_split = (features[:training_data_size], one_hot[:training_data_size])
    test_split = (features[test_start:], one_hot[test_start:])
    return train_split, test_split, dim

In [80]:
# Overfitting experiment: train the target model on increasingly large training
# sets (10 random repetitions per size) and record the train/test accuracy gap.
training_data_size = [2,5,10,15,20,25,50,100,200,500,1000,2000,5000]
per_class_sample=150
channel=0   
EPS=200
act_layer=6
n_class = 100
is_synthetic = False
VERBOSE = 0
test_ratio = 0.3

# Results are collected as plain records and turned into a frame afterwards:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame one row
# at a time copies it on every iteration.
accuracy_columns = ['training data size', 'training accuracy', 'test accuracy', 'error']
accuracy_records = []
accuracy_df = pd.DataFrame(columns=accuracy_columns)

pur_data = np.load(DATA_PATH+'purchase100.npz')
features = pur_data['features']
labels = pur_data['labels']
data = pd.DataFrame(features[:,:])
labels = np.argmax(labels, axis=1)
data['600'] = labels

for j in training_data_size:
    print('\n Training Data Size: ', j, '\n')
    for i in range(10):
        # Fresh random 20000-row sample and a fresh model for every repetition.
        target_dataset = data.sample(n = 20000, replace = False)
        (target_trainX, target_trainY), (target_testX, target_testY), dim = load_target_data(target_dataset, j, test_ratio, is_synthetic)
        target_model,_ = build_purchase_dnn(n_class,dim)
        history = target_model.fit(target_trainX, target_trainY, epochs=EPS, batch_size=32, verbose=VERBOSE)
        # Evaluate each split once; score is [loss, accuracy] on the test rows.
        score = target_model.evaluate(target_testX, target_testY, verbose=VERBOSE)
        _, train_acc = target_model.evaluate(target_trainX, target_trainY, verbose=VERBOSE)
        test_acc = score[1]
        print('Iteration ', i, "Target Train acc : ", (train_acc * 100.0),"Target Test acc : ", (test_acc * 100.0))
        accuracy_records.append({'training data size':j, 'training accuracy' : (train_acc * 100.0), 'test accuracy': (test_acc * 100.0), 'error': ((train_acc * 100.0)-(test_acc * 100.0))})
    # Refresh the frame after each size so partial results survive an interrupt.
    accuracy_df = pd.DataFrame(accuracy_records, columns=accuracy_columns)


 Training Data Size:  2 

Iteration  0 Target Train acc :  100.0 Target Test acc :  1.679999940097332
Iteration  1 Target Train acc :  100.0 Target Test acc :  1.4800000004470348
Iteration  2 Target Train acc :  100.0 Target Test acc :  2.4800000712275505
Iteration  3 Target Train acc :  100.0 Target Test acc :  3.4800000488758087
Iteration  4 Target Train acc :  100.0 Target Test acc :  1.4600000344216824
Iteration  5 Target Train acc :  100.0 Target Test acc :  1.7999999225139618
Iteration  6 Target Train acc :  100.0 Target Test acc :  1.6200000420212746
Iteration  7 Target Train acc :  100.0 Target Test acc :  0.7799999788403511
Iteration  8 Target Train acc :  100.0 Target Test acc :  2.759999968111515
Iteration  9 Target Train acc :  100.0 Target Test acc :  1.3199999928474426

 Training Data Size:  5 

Iteration  0 Target Train acc :  100.0 Target Test acc :  3.9400000125169754
Iteration  1 Target Train acc :  100.0 Target Test acc :  3.9400000125169754
Iteration  2 Target Trai

Iteration  4 Target Train acc :  100.0 Target Test acc :  19.220000505447388
Iteration  5 Target Train acc :  100.0 Target Test acc :  18.960000574588776
Iteration  6 Target Train acc :  100.0 Target Test acc :  19.97999995946884
Iteration  7 Target Train acc :  100.0 Target Test acc :  18.60000044107437
Iteration  8 Target Train acc :  100.0 Target Test acc :  18.719999492168427
Iteration  9 Target Train acc :  100.0 Target Test acc :  19.259999692440033

 Training Data Size:  2000 

Iteration  0 Target Train acc :  100.0 Target Test acc :  47.45999872684479
Iteration  1 Target Train acc :  100.0 Target Test acc :  46.639999747276306
Iteration  2 Target Train acc :  100.0 Target Test acc :  45.100000500679016
Iteration  3 Target Train acc :  100.0 Target Test acc :  45.80000042915344
Iteration  4 Target Train acc :  100.0 Target Test acc :  45.260000228881836
Iteration  5 Target Train acc :  100.0 Target Test acc :  43.59999895095825
Iteration  6 Target Train acc :  100.0 Target Test 

In [81]:
# Summarise the overfitting experiment: per training size, the mean accuracies,
# the mean train-test gap ("error") and a 95% Student-t confidence interval
# for that gap.
#accuracy_df =pd.read_csv('data/adult_overfitting_test_trainsize')
training_data_size = [2,5,10,15,20,25,50,100,200,500,1000,2000,5000]

summary_columns = ['training data size', 'average training accuracy', 'average test accuracy',
                   'average error', 'ci95 low', 'ci95 high']
summary_records = []

for i in training_data_size:
    size_rows = accuracy_df.loc[accuracy_df['training data size'] == i]
    error = np.array(size_rows['error'])
    training_accuracy = np.array(size_rows['training accuracy'])
    test_accuracy = np.array(size_rows['test accuracy'])
    # The confidence level is passed positionally: its keyword was renamed from
    # `alpha` to `confidence` in SciPy, so the positional form is the only
    # spelling that works on both old and new releases.
    ci95 = stats.t.interval(0.95, df=len(error)-1, loc=np.mean(error), scale=stats.sem(error))
    summary_records.append({'training data size': i,
                            'average training accuracy': np.mean(training_accuracy),
                            'average test accuracy': np.mean(test_accuracy),
                            'average error': np.mean(error),
                            'ci95 low': ci95[0],
                            'ci95 high': ci95[1]})

# Build the frame once instead of concatenating a one-row frame per iteration.
avg_error_ci95_df = pd.DataFrame(summary_records, columns=summary_columns)

In [82]:
# Display the per-training-size summary (mean accuracies, mean error, 95% CI bounds).
avg_error_ci95_df

Unnamed: 0,training data size,average training accuracy,average test accuracy,average error,ci95 low,ci95 high
0,2.0,100.0,1.886,98.114,97.546301,98.681699
1,5.0,100.0,3.01,96.99,96.372809,97.607191
2,10.0,100.0,4.296,95.704,95.066273,96.341727
3,15.0,100.0,4.714,95.286,94.996299,95.575701
4,20.0,100.0,5.072,94.928,94.304213,95.551787
5,25.0,100.0,5.378,94.622,94.145122,95.098878
6,50.0,100.0,6.424,93.576,93.100538,94.051462
7,100.0,100.0,7.476,92.524,92.112213,92.935787
8,200.0,100.0,8.264,91.736,91.190583,92.281417
9,500.0,100.0,11.154,88.846,88.600216,89.091784
