In [1]:
import os
os.chdir("C:\\Users\\tsarcevic\\PycharmProjects\\fingerprinting-toolbox")
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pickle
import numpy as np
import pandas as pd
import collections

from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif as MIC

from scipy.stats import chi2
from imblearn.under_sampling import *

from datasets import GermanCredit
from attacks import *
from parameter_guidelines.guidelines import *
from scheme import *

The experiment:
- find most important feature using some other strategy than the attacker (attacker's strategy: impurity-based feature importances); verify that the features are different
- embed the fingerprint in those
- feature selection by the attacker (done in other notebook)
- find the strenght that removes the fingerprint i.e.
    - attack with 1 removed
    - detection: successful? increase atrength by 1 and repeat
    - until detection unsuccessful
- this is robustness
- record utility of attacked data


- also record utility of fingerprinted from the beginning of the experiment

In [3]:
# experiment parameters
fpaatr = [4,8,12,16,20]
gammae = gammae = [1, 2, 3, 4, 5, 10, 18]  
    #gammae = [1.11, 1.25, 1.43, 1.67, 2.5]

In [4]:
# original data
data = GermanCredit()
X = data.preprocessed().drop('target', axis=1)
y = data.preprocessed()['target']
X.columns

Index(['checking_account', 'duration', 'credit_hist', 'purpose',
       'credit_amount', 'savings', 'employment_since', 'installment_rate',
       'sex_status', 'debtors', 'residence_since', 'property', 'age',
       'installment_other', 'housing', 'existing_credits', 'job',
       'liable_people', 'tel', 'foreign'],
      dtype='object')

In [5]:
# defender features
mi_scores = MIC(X,y)
print(mi_scores)

[0.08624355 0.03582748 0.01656036 0.06367377 0.01089036 0.02129347
 0.01769453 0.         0.00086295 0.0127753  0.         0.00941495
 0.01725588 0.001739   0.01632517 0.00301842 0.0029824  0.
 0.         0.        ]


In [6]:
feature_importances_defense = dict(zip(X.columns, mi_scores))
dict(sorted(feature_importances_defense.items(), key=lambda item: -item[1]))

{'checking_account': 0.08624354677608981,
 'purpose': 0.06367377054305368,
 'duration': 0.03582748215457143,
 'savings': 0.021293474251628197,
 'employment_since': 0.017694527959987383,
 'age': 0.0172558815187851,
 'credit_hist': 0.01656035858835092,
 'housing': 0.016325165005846687,
 'debtors': 0.012775303392170922,
 'credit_amount': 0.010890361621698652,
 'property': 0.009414947173296495,
 'existing_credits': 0.003018421663078419,
 'job': 0.00298240258789062,
 'installment_other': 0.001739003260140759,
 'sex_status': 0.0008629546447782577,
 'installment_rate': 0.0,
 'residence_since': 0.0,
 'liable_people': 0.0,
 'tel': 0.0,
 'foreign': 0.0}

In [7]:
# attacker's features
feature_importances_attack = {'checking_account': 0.21514958469895673,
 'duration': 0.13044392484665898,
 'credit_hist': 0.08577361202438204,
 'purpose': 0.037542360031586446,
 'credit_amount': 0.17261276703874145,
 'savings': 0.04680897088703411,
 'employment_since': 0.03161875938692153,
 'installment_rate': 0.022951027357519246,
 'sex_status': 0.006977414324585844,
 'debtors': 0.01549920743767321,
 'residence_since': 0.006314167794645683,
 'property': 0.04161068704700113,
 'age': 0.09356728372727559,
 'installment_other': 0.04487824182105095,
 'housing': 0.010520190596630376,
 'existing_credits': 0.013612419726994892,
 'job': 0.008691845710110693,
 'liable_people': 0.010437271782373785,
 'tel': 0.003110805448656495,
 'foreign': 0.0018794583112007593}
#dict(sorted(feature_importances_attack.items(), key=lambda item: -item[1]))

In [8]:
# embedding the fingerpting - test case
fplen = 8
numbuyers = 100
column_subset = 20
gamma = 18
xi=1
uid=1
SK=0

scheme = Universal(gamma=gamma, xi=xi, fingerprint_bit_length=fplen, number_of_recipients=100)
exclude=['debtors','residence_since','age','tel']
fingerprinted_data = scheme.insertion(data, recipient_id=uid, secret_key=SK, exclude=exclude, primary_key_attribute='Id', 
                                      target_attribute='target')

Start insertion algorithm...
	gamma: 18
	fingerprint length: 8
	xi: 1
	# recipients: 100

	(secret key -- for evaluation purposes): 0

Generated fingerprint for recipient 1: 01011111
	Inserting a fingerprint into columns: Index(['checking_account', 'duration', 'credit_hist', 'purpose',
       'credit_amount', 'savings', 'employment_since', 'installment_rate',
       'sex_status', 'property', 'installment_other', 'housing',
       'existing_credits', 'job', 'liable_people', 'foreign'],
      dtype='object')
Fingerprint inserted.
	marked tuples: ~6.7%
	single fingerprint bit embedded 8 times
Time: <1 sec.


In [9]:
fingerprinted_data.dataframe

Unnamed: 0,Id,checking_account,duration,credit_hist,purpose,credit_amount,savings,employment_since,installment_rate,sex_status,...,property,age,installment_other,housing,existing_credits,job,liable_people,tel,foreign,target
0,0,A11,6,A34,A43,1169,A65,A75,4,A93,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,1,A12,48,A32,A43,5951,A61,A73,2,A92,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,2,A14,12,A34,A46,2096,A61,A74,2,A93,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,3,A11,42,A32,A42,7882,A61,A74,2,A93,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,4,A11,24,A33,A40,4870,A61,A73,3,A93,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,A14,12,A32,A42,1736,A61,A74,3,A92,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,996,A11,30,A32,A41,3857,A61,A73,4,A91,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,997,A14,12,A32,A43,804,A61,A75,4,A93,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,998,A11,45,A32,A43,1845,A61,A73,4,A93,...,A124,23,A143,A153,1,A173,1,A192,A201,2


In [10]:
# test detection
suspect = scheme.detection(fingerprinted_data, secret_key=SK, target_attribute='target', primary_key_attribute='Id',
                          exclude=exclude, original_attributes=pd.Series(data=X.columns.to_list()))

Start detection algorithm...
	gamma: 18
	fingerprint length: 8


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
def drop_least_important(n, features):
    remaining = features
    for i in range(n):
        min_val = min(remaining.values())
        remaining = {k: v for k, v in remaining.items() if v != min_val}    
    return remaining

In [None]:
# find strength that removes the fingerprint - test run
attack_strength = 12
selected_f = drop_least_important(attack_strength, feature_importances_attack)
removed = list(feature_importances_attack.keys() - selected_f.keys())
attacked_data = fingerprinted_data.dataframe.drop(removed, axis=1)
print(attacked_data.size/len(attacked_data)-2)
suspect = scheme.detection(attacked_data, secret_key=SK, target_attribute='target', primary_key_attribute='Id',
                          exclude=exclude, original_attributes=pd.Series(data=X.columns.to_list()))

## The experiment

In [None]:
fplen = 8
numbuyers = 100
xi=1
uid=1

In [None]:
exclude=exclude[4]
fpattr=8

In [None]:
# find strength that removes the fingerprint - test run
gammae = gammae = [1,1.11, 1.25, 1.43, 1.67, 2,2.5, 3, 4, 5, 10, 18]  
    #gammae = [1.11, 1.25, 1.43, 1.67, 2.5]

robustness[fpattr] = dict()
for gamma in gammae:
    scheme = Universal(gamma=gamma, xi=xi, fingerprint_bit_length=fplen, number_of_recipients=100)
    robustness[fpattr][gamma] = []
    for SK in range(10):
        fingerprinted_data = scheme.insertion(data, recipient_id=uid, secret_key=SK, exclude=exclude, primary_key_attribute='Id', 
                                      target_attribute='target')
        for attack_strength in range(1,20):
            # drop least important features
            selected_f = drop_least_important(attack_strength, feature_importances_attack)
            removed = list(feature_importances_attack.keys() - selected_f.keys())
            attacked_data = fingerprinted_data.dataframe.drop(removed, axis=1)

            # try to detect
            suspect = scheme.detection(attacked_data, secret_key=SK, target_attribute='target', primary_key_attribute='Id',
                                  exclude=exclude, original_attributes=pd.Series(data=X.columns.to_list()))
            if suspect != 1:
                robustness[fpattr][gamma].append(attack_strength-1)
                break


In [None]:
robustness

In [None]:
with open('robustness.temp.pkl', 'wb') as outfile:
    pickle.dump(robustness, outfile)

In [None]:
robustness_mean = {gamma: round(np.mean(robustness[gamma]))}

In [None]:
# utility

In [None]:
exclude={8:['debtors','residence_since','age','tel','foreign','employment_since','installment_rate','sex_status',
        'installment_other','housing','job','existing_credits'],
        12:['debtors','residence_since','age','tel','foreign','employment_since','installment_rate','sex_status'],
        16:['debtors','residence_since','age','tel'],
        4:['debtors','residence_since','age','tel','foreign','employment_since','installment_rate','sex_status',
        'installment_other','housing','job','existing_credits','credit_hist','liable_people','savings','credit_amount']}

In [None]:
results = dict()  #
original = dict()  # original fingerprinted data
fpattr=16
for gamma in robustness_mean:
    scheme = Universal(gamma=gamma, xi=xi, fingerprint_bit_length=fplen, number_of_recipients=100)
    results[gamma] = []
    original[gamma] = []
    for SK in range(5):
        fingerprinted_data = scheme.insertion(data, recipient_id=uid, secret_key=SK, exclude=exclude[fpattr], primary_key_attribute='Id', 
                                      target_attribute='target')
        data = GermanCredit().preprocessed(fp_data=fingerprinted_data.dataframe)
        
        #split
        X_fp = data.drop('target', axis=1)
        y_fp = data['target']
        # original accuracy
        model = GradientBoostingClassifier(random_state=9)
        original[gamma].append(cross_val_score(model, X_fp, y_fp))
        
        #
        to_remove = int(20*robustness_mean[gamma])
        selected_f = drop_least_important(to_remove, feature_importances)
        removed = list(feature_importances.keys() - selected_f.keys())
        X_fp = data.drop('target', axis=1)
        X_fp = X_fp.drop(removed, axis=1)
        print(len(X_fp.columns))
        y_fp = data['target']
        model = GradientBoostingClassifier(random_state=9)
        acc = cross_val_score(model, X_fp, y_fp)
        results[gamma].append(acc)

In [None]:
fplen = 8
numbuyers = 100
xi=1
uid=1

In [None]:
data = GermanCredit()
X = data.preprocessed().drop('target', axis=1)
y = data.preprocessed()['target']

In [None]:
# adaptive fp vs random attack
# find strength that removes the fingerprint - test run
gammae = gammae = [1,1.11, 1.25, 1.43, 1.67, 2,2.5, 3, 4, 5, 10, 18]  
    #gammae = [1.11, 1.25, 1.43, 1.67, 2.5]
fpattr=20
robustness[fpattr] = dict()
for gamma in gammae:
    scheme = Universal(gamma=gamma, xi=xi, fingerprint_bit_length=fplen, number_of_recipients=100)
    robustness[fpattr][gamma] = []
    for SK in range(10):
        fingerprinted_data = scheme.insertion(data, recipient_id=uid, secret_key=SK, exclude=exclude[fpattr], primary_key_attribute='Id', 
                                      target_attribute='target')
        for attack_strength in range(1,20):
            # random attack here
            attack=VerticalSubsetAttack()
            attacked_data = attack.run_random(fingerprinted_data.dataframe, attack_strength,
                                                  keep_columns=[data.get_target_attribute(),
                                                                data.get_primary_key_attribute()], seed=SK)

            # try to detect
            suspect = scheme.detection(attacked_data, secret_key=SK, target_attribute='target', primary_key_attribute='Id',
                                  exclude=exclude[fpattr], original_attributes=pd.Series(data=X.columns.to_list()))
            if suspect != 1:
                robustness[fpattr][gamma].append(attack_strength-1)
                break
            elif attack_strength==19:
                robustness[fpattr][gamma].append(20)


In [None]:
robustness

In [None]:
robustness_real={16: {1: 14,
  1.11: 14,
  1.25: 14,
  1.43: 14,
  1.67: 14,
  2: 12,
  2.5: 12,
  3: 11,
  4: 10,
  5: 11,
  10: 3,
  18: 3},
 12: {1: 14,
  1.11: 15,
  1.25: 15,
  1.43: 15,
  1.67: 14,
  2: 13,
  2.5: 13,
  3: 13,
  4: 13,
  5: 11,
  10: 5,
  18: 4},
 8: {1: 15,
  1.11: 15,
  1.25: 14,
  1.43: 13,
  1.67: 12,
  2: 14,
  2.5: 12,
  3: 11,
  4: 9,
  5: 9,
  10: 8,
  18: 2},
 4: {1: 12,
  1.11: 12,
  1.25: 12,
  1.43: 12,
  1.67: 11,
  2: 9,
  2.5: 9,
  3: 9,
  4: 9,
  5: 8,
  10: 4,
  18: 2}}

In [None]:
x = 1.0 / np.array(list(robustness_real[16].keys())) # percentage of marked rows
y_16 = np.array(list(robustness_real[16].values())) / 20 
y_12 = np.array(list(robustness_real[12].values())) / 20 
y_8 = np.array(list(robustness_real[8].values())) / 20 
y_4 = np.array(list(robustness_real[4].values())) / 20 

In [None]:
# read from pickle file

In [None]:
utility_loss_4 = dict()
for gamma in original_4:
    utility_loss_4[gamma] = np.mean([original_4[gamma][i] - results_4[gamma][i] -0.025 for i in range(5)])

In [None]:
utility_loss

In [None]:
results_16[1][0] - original_16[1][0]

In [None]:
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

In [None]:
#plt.plot(horizontal_x, utility_line_2, color='#ffe3e3', label='attacker loses >1% of acc')
#plt.fill_between(horizontal_x, [0.81 for i in horizontal_x], utility_line_2, color='#ffe3e3')

#plt.plot(horizontal_x, utility_line, color='#ffabab', label='attacker loses >2% of acc')#
#plt.fill_between(horizontal_x, [0.81 for i in horizontal_x], utility_line, color='#ffabab')
fig, axs = plt.subplots(1,2, figsize=(14,4))

# ROBUSTNESS PLOT
axs[0].plot(x, y_16, label='16 attributes marked', color=colors[1])
axs[0].plot(x, y_12, label='12 attributes marked',color=colors[2])
axs[0].plot(x, y_8, label='8 attributes marked',color=colors[3])
axs[0].plot(x, y_4, label='4 attributes marked',color=colors[4])

axs[0].set_xlabel('% marks')
axs[0].set_ylabel('robustness')
axs[0].legend()
axs[0].grid()
#axs[0].set_title('Vertical subset attack against adaptive defense')

axs[1].plot(x, [i*100 for i in utility_loss.values()], label='16 attributes marked',color=colors[1])
axs[1].plot(x, [i*100 for i in utility_loss_12.values()], label='12 attributes marked',color=colors[2])
axs[1].plot(x, [i*100 for i in utility_loss_8.values()], label='8 attributes marked',color=colors[3])
axs[1].plot(x, [i*100 for i in utility_loss_4.values()], label='4 attributes marked',color=colors[4])

axs[1].set_xlabel('% marks')
axs[1].set_ylabel('accuracy loss rel to initial accuracy')
#axs[1].set_title('Utility loss after applying the weakest successul naive attack')
axs[1].legend()
axs[1].grid()

In [None]:
# utility
fpattr=4
results = dict()  #
original = dict()  # original fingerprinted data
for gamma in robustness_real[16]:
    scheme = Universal(gamma=gamma, xi=xi, fingerprint_bit_length=fplen, number_of_recipients=100)
    results[gamma] = []
    original[gamma] = []
    for SK in range(5):
        fingerprinted_data = scheme.insertion(data, recipient_id=uid, secret_key=SK, exclude=exclude[fpattr], 
                                      target_attribute='target')
        data = GermanCredit().preprocessed(fp_data=fingerprinted_data.dataframe)
        
        #split
        X_fp = data.drop('target', axis=1)
        y_fp = data['target']
        # original accuracy
        model = GradientBoostingClassifier(random_state=9)
        original[gamma].append(cross_val_score(model, X_fp, y_fp))
        print(original[gamma])
        
        #
        attack=VerticalSubsetAttack()
        attack_strength = robustness_real[fpattr][gamma]
        attacked_data = attack.run_random(fingerprinted_data.dataframe, attack_strength,
                                                  keep_columns=['target'], seed=SK)
        X_fp = attacked_data.drop('target', axis=1)
        y_fp=attacked_data['target']
        model = GradientBoostingClassifier(random_state=9)
        acc = cross_val_score(model, X_fp, y_fp)
        results[gamma].append(acc)
        print(results[gamma])
        
    

In [None]:
results_4 = results
results_4

In [None]:
original_4 = original
original_4

In [None]:
#results_16,12,...
#original_16,...

In [None]:
with open('utility_attacked_adaptiveDef_vs_naiveAtt_16.plk', 'wb') as outfile:
    pickle.dump(results_16, outfile)