In [54]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def input_DP(data, epsilon, delta, rng=None):
    """Standardize ``data`` and perturb it with zero-mean Gaussian noise.

    The noise scale is
    ``(3 * ln n)**2 * (8 * ln(2 / delta) + 4 * epsilon) / (n * epsilon**2)``,
    where ``n`` is the number of rows of ``data``.

    Parameters
    ----------
    data : array-like
        Converted with ``np.array`` and standardized with ``StandardScaler``;
        instances are counted along axis 0.
    epsilon : float
        Privacy parameter; must be strictly positive.
    delta : float
        Privacy parameter; must satisfy ``0 < delta < 1``. Per the original
        author's note it should be much smaller than ``1 / n``.
    rng : optional
        Any object with a ``normal(loc, scale, size)`` method (for example
        ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``).
        When omitted, the global ``np.random`` state is used, as before.

    Returns
    -------
    tuple
        ``(noised_data, noise)``: the standardized data plus noise, and the
        noise array itself (same shape as the standardized data).

    Raises
    ------
    ValueError
        If ``epsilon`` or ``delta`` is outside the ranges above.
    """
    # Fail early: epsilon == 0 would give an infinite scale (non-finite noise),
    # a negative epsilon would be silently accepted, and delta == 0 would raise
    # a bare ZeroDivisionError from `2 / delta`.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if not 0 < delta < 1:
        raise ValueError(f"delta must satisfy 0 < delta < 1, got {delta!r}")

    X = np.array(data)
    n = X.shape[0]
    X = StandardScaler().fit_transform(X)

    # Scalar factor used by the scale formula (it is not a loss function).
    loss_bound = 3 * np.log(n)
    # NOTE(review): this value is passed to `normal` as a standard deviation.
    # If the formula it comes from defines a variance, a square root is
    # missing here -- confirm against the reference derivation.
    sigma = (loss_bound ** 2) * (8 * np.log(2 / delta) + 4 * epsilon) / (n * epsilon ** 2)

    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0.0, scale=sigma, size=X.shape)
    noised_data = X + noise

    return noised_data, noise

def input_DP_new_paradigm(data, epsilon, delta, sensitivity, iterations, rng=None):
    """Standardize ``data`` and add Gaussian noise scaled by model training parameters.

    The noise scale is
    ``(3 * ln n)**2 * iterations * ln(1 / delta)
    / (n * (n - 1) * epsilon**2 * sqrt(sensitivity))``,
    where ``n`` is the number of rows of ``data``.

    Parameters
    ----------
    data : array-like
        Converted with ``np.array`` and standardized with ``StandardScaler``;
        instances are counted along axis 0. At least two rows are required.
    epsilon : float
        Privacy parameter; must be strictly positive.
    delta : float
        Privacy parameter; must satisfy ``0 < delta < 1``. Per the original
        author's note it should be much smaller than ``1 / n``.
    sensitivity : float
        Sensitivity of the ML model (author's description); must be strictly
        positive because it appears under a square root in the denominator.
    iterations : int
        Number of iterations used when training the ML model (author's
        description).
    rng : optional
        Any object with a ``normal(loc, scale, size)`` method (for example
        ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``).
        When omitted, the global ``np.random`` state is used, as before.

    Returns
    -------
    tuple
        ``(noised_data, noise)``: the standardized data plus noise, and the
        noise array itself (same shape as the standardized data).

    Raises
    ------
    ValueError
        If ``epsilon``, ``delta`` or ``sensitivity`` is out of range, or if
        ``data`` has fewer than two rows.
    """
    # Reject parameters that would otherwise produce an inf/NaN scale or a
    # bare ZeroDivisionError instead of a clear message.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if not 0 < delta < 1:
        raise ValueError(f"delta must satisfy 0 < delta < 1, got {delta!r}")
    if not sensitivity > 0:
        raise ValueError(f"sensitivity must be > 0, got {sensitivity!r}")

    X = np.array(data)
    n = X.shape[0]
    # n * (n - 1) is zero for a single row, which would make the scale NaN.
    if n < 2:
        raise ValueError(f"data must contain at least 2 instances, got {n}")
    X = StandardScaler().fit_transform(X)

    # Scalar factor used by the scale formula (it is not a loss function).
    loss_bound = 3 * np.log(n)
    # NOTE(review): this value is passed to `normal` as a standard deviation.
    # If the formula it comes from defines a variance, a square root is
    # missing here -- confirm against the reference derivation.
    sigma = (loss_bound ** 2) * iterations * np.log(1 / delta) / (
        n * (n - 1) * (epsilon ** 2) * np.sqrt(sensitivity)
    )

    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0.0, scale=sigma, size=X.shape)
    noised_data = X + noise

    return noised_data, noise


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
# Load the dataset from a path relative to the notebook's working directory.
dp_util_dataset = pd.read_csv(r'data/dp_util_dataset.csv')


In [5]:
# Separate the target column ('grade') from the remaining columns, which are
# used as features. `drop` returns a new frame; dp_util_dataset is unchanged.
y = dp_util_dataset['grade']
X = dp_util_dataset.drop('grade', axis=1)

# Standardize the features. X is rebound to the output of fit_transform, so the
# unscaled feature frame is no longer reachable under this name.
# NOTE(review): the scaler is fit on every row before the train/test split
# performed in a later cell, so test rows influence the scaling statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [42]:
# Write the first 100 rows of the raw dataset to 'data100.csv' in the current
# working directory. `index=False` is not passed, so pandas' default of also
# writing the row index applies.
dp_util_dataset[:100].to_csv('data100.csv')

In [8]:
# Use only the first 100000 rows and split them 50/50 into train and test
# parts; random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X[0:100000], y[0:100000], train_size=0.5, random_state=0)

In [9]:
# Convert the target splits to NumPy arrays. The names are rebound, so
# re-running this cell without re-running the split cell will fail.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [50]:
# Number of training targets after the split.
y_train.shape[0]

50000

In [11]:
# Keep only the first n rows of each split as a small working subset.
# `n` is a notebook-level global that later cells also read.
n=100
xtrain=X_train[0:n]
ytrain=y_train[0:n]
xtest=X_test[0:n]
ytest=y_test[0:n]

In [52]:
# Privacy parameters for the input perturbation; delta is set to 1 / (10 * n).
epsilon_input = 100
delta_input = 1/(n*10)


# NOTE(review): `input_DP_util` is not defined in any cell visible here, and its
# argument list (data, epsilon, n, delta) differs from `input_DP(data, epsilon,
# delta)`. On a fresh kernel this line raises NameError unless the function is
# defined elsewhere -- confirm which function is intended.
# NOTE(review): the full scaled matrix `X` is passed, although the result name
# suggests the training subset -- confirm.
noise_x_train, noise = input_DP_util(X, epsilon_input, n, delta_input)


In [57]:
# Demo call on the first 10 rows of the raw frame; the returned
# (noised_data, noise) tuple is displayed in full as the cell output.
# NOTE(review): dp_util_dataset still contains the 'grade' target column, so the
# target is standardized and perturbed together with the features -- confirm
# this is intended.
input_DP(dp_util_dataset[:10], epsilon=10, delta=0.01)

(array([[-3.41478036, -2.3680554 ,  4.06335648, ..., -0.58394577,
          1.20260374,  3.28353181],
        [ 1.39217724, -1.46406725, -3.88660379, ..., -0.83706135,
          4.43755671,  6.2636107 ],
        [ 1.93726911,  1.83202916,  5.79176161, ..., -1.40313601,
          1.69653976,  2.99180764],
        ...,
        [ 5.08042383,  4.91533919,  2.31143621, ...,  9.69275977,
         -5.69281161,  5.9624449 ],
        [ 0.69527244, -3.56266441, -2.42672648, ..., -4.14956523,
          1.89281892, -3.51366626],
        [ 5.37024762,  0.36276907,  0.24177463, ...,  1.27081379,
          3.21715188,  2.61181412]]),
 array([[-4.75161681, -3.70489186,  2.72652002, ..., -0.58394577,
          1.20260374,  3.28353181],
        [ 2.8019753 , -0.05426918, -2.47680572, ..., -0.83706135,
          4.43755671,  6.2636107 ],
        [ 1.3406229 ,  1.23538295,  5.1951154 , ..., -1.40313601,
          1.69653976,  2.99180764],
        ...,
        [ 6.06989957,  5.90481494,  3.30091196, ...,  