In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler
import csv


from sklearn.impute import SimpleImputer

def laplace_mech_row_wise(matrix, sensitivity=1, epsilon=20):
    # 计算Laplace分布的比例参数
    beta = sensitivity / epsilon
    # 生成与数据集形状相同的Laplace噪声
    noise = np.random.laplace(0, beta, size=matrix.shape)
    # 为每一行随机选择一个元素添加噪声
    print(noise)
    noisy_matrix = matrix.copy()
    print(matrix.shape)
    for i in range(matrix.shape[0]):
        j = np.random.randint(matrix.shape[1])
        print(noise[i,j])
        noisy_matrix[i, j] = noisy_matrix[i, j]+noise[i, j]
        print(noisy_matrix[i,j])
    return noisy_matrix

def load_data(train_files, test_file):
    # train_data = []
    # for file in train_files:
    #     train_data.append(pd.read_csv(file, sep='\t'))
    # train_data = pd.concat(train_data)

    train_data = pd.read_csv(train_files)
    train_data = train_data.head(40000)
    test_data = pd.read_csv(test_file)
    test_data = test_data.head(40000)

    categorical_features = train_data.columns[2:33]
    binary_features = train_data.columns[33:42]
    numerical_features = train_data.columns[42:80]
    
    numerical_data = train_data[numerical_features]
    noisy_X = add_laplace_noise_to_dataset(numerical_data, feature_name=numerical_features, sensitivities=[1]*len(numerical_features))
    train_data[numerical_features]=noisy_X

    numerical_data_test = test_data[numerical_features]
    noisy_X_test = add_laplace_noise_to_dataset(numerical_data_test, feature_name=numerical_features, sensitivities=[1]*len(numerical_features))
    test_data[numerical_features]=noisy_X_test


    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='mean')

    X_categorical = encoder.fit_transform(train_data[categorical_features])
    X_binary = train_data[binary_features].values

    # # ##
    # print("Numerical features shape:", train_data[numerical_features].shape)
    # print("Sample numerical features:\n", train_data[numerical_features].head())

    # # ##
    X_numerical = scaler.fit_transform(imputer.fit_transform(train_data[numerical_features]))

    X_train = np.hstack([X_categorical, X_binary, X_numerical])
    y_train = train_data[['is_clicked', 'is_installed']].values

    X_categorical_test = encoder.transform(test_data[categorical_features])
    X_binary_test = test_data[binary_features].values
    X_numerical_test = scaler.transform(imputer.transform(test_data[numerical_features]))

    X_test = np.hstack([X_categorical_test, X_binary_test, X_numerical_test])

    return X_train, y_train, X_test, test_data

def add_laplace_noise_to_dataset(X, feature_name, sensitivities):
    n, d = X.shape # 获取数据集的形状（数据点数量和特征数量）
    
    upperbounds=[] # 确定数据集的最大值，最小值
    lowerbounds=[] 
    count_zero=[] # 若 0 的占比过大，不予处理
    scale = [] # 用于计算 eplison
    epsilons =[]

    for feature in feature_name:
        data = X[feature]
        upperbounds.append(max(data))
        lowerbounds.append(min(data))
        zero = (data == 0).sum()
        count_zero.append(zero>(len(data)/2))
        scale.append(data.sum()/(len(data)-zero))

    for s in scale: # 计算 epsilons
        e = 1
        while s/10>=10:
            s /= 10
            e *= 0.1
        epsilons.append(e)


    for j in range(d):
        if count_zero[j]:
            continue
        upper = upperbounds[j]
        lower = lowerbounds[j]

        sensitivity = sensitivities[j] # 获取第j个特征的敏感度
        epsilon = epsilons[j] # 获取第j个特征的隐私预算
        scale = sensitivity / epsilon # 计算比例

        for i in range(n):
            
            noise = np.random.laplace(0, scale) # 生成Laplace噪声
            if (X.iloc[i][j]+noise) < upper and (X.iloc[i][j]+noise) >lower:
                X.iloc[i][j] = X.iloc[i][j] + noise # 将噪声添加到数据点的特征值中
            else:
                X.iloc[i][j] = X.iloc[i][j]
            
    return X

def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))

    optimizer = Adam(lr=0.001)

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model


def lr_schedule(epoch):
    initial_lr = 0.001
    if epoch > 75:
        lr = initial_lr * 0.01
    elif epoch > 50:
        lr = initial_lr * 0.1
    else:
        lr = initial_lr
    return lr

def main():
    train_files = 'data//data2.csv'  # Update with the training data file path
    test_file = 'toy_test.csv'  # Update with the testing data file path

    X_train, y_train, X_test, test_data = load_data(train_files, test_file)

    model = create_model(X_train.shape[1])

    lr_scheduler = LearningRateScheduler(lr_schedule)

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model.fit(X_train_split, y_train_split,
              batch_size=32,
              epochs=100,
              validation_data=(X_val_split, y_val_split),
              shuffle=True,
              callbacks=[lr_scheduler])

    y_pred = model.predict(X_test)

    with open('submission.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['RowId', 'is_clicked', 'is_installed'])
        for row_id, (is_clicked, is_installed) in zip(test_data['f_0'], y_pred):
            writer.writerow([row_id, is_clicked, is_installed])


if __name__ == '__main__':
    main()



import pandas as pd
from sklearn.metrics import accuracy_score

result = pd.read_csv('submission.csv')
# result.head()
result[['is_clicked','is_installed']] = result[['is_clicked','is_installed']].apply(lambda x: round(x, 0), axis=0)
test_data = pd.read_csv('toy_test.csv')
mdl = test_data[['f_0','is_clicked','is_installed']].head(40000)
acc1 = accuracy_score(mdl['is_clicked'],result['is_clicked'])
print("Acc1 is: ", acc1)
acc2 = accuracy_score(mdl['is_installed'],result['is_installed'])
print("Acc2 is: ", acc2)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1494933650.py, line 51)

In [6]:
train_files = 'data//data2.csv'  # Update with the training data file path
test_file = 'toy_test.csv'  # Update with the testing data file path

train_data = pd.read_csv(train_files)
train_data = train_data.head(20000)
test_data = pd.read_csv(test_file)
test_data = test_data.head(20000)

categorical_features = train_data.columns[2:33]
binary_features = train_data.columns[33:43]
numerical_features = train_data.columns[43:81]
numerical_features


Index(['f_42', 'f_43', 'f_44', 'f_45', 'f_46', 'f_47', 'f_48', 'f_49', 'f_50',
       'f_51', 'f_52', 'f_53', 'f_54', 'f_55', 'f_56', 'f_57', 'f_58', 'f_59',
       'f_60', 'f_61', 'f_62', 'f_63', 'f_64', 'f_65', 'f_66', 'f_67', 'f_68',
       'f_69', 'f_70', 'f_71', 'f_72', 'f_73', 'f_74', 'f_75', 'f_76', 'f_77',
       'f_78', 'f_79'],
      dtype='object')

In [8]:
numerical_data = train_data[numerical_features]
numerical_data

Unnamed: 0,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,f_50,f_51,...,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79
0,9.448197,1.063783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.336054,...,1.697190,0.000000,1.142243,0.571121,0.000000,1.156922,0.269948,0.0,0.0,0.0
1,1.696819,2.618062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.657535,...,2.550069,0.000000,0.000000,0.000000,0.000000,0.269948,0.231384,0.0,0.0,0.0
2,30.234230,2.603093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031093,...,2.924602,0.000000,0.571121,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0
3,9.062556,0.684893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.110586,...,0.728635,0.000000,0.571121,0.000000,0.077128,1.156922,0.269948,0.0,0.0,0.0
4,9.679581,2.523316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.403480,...,2.426196,0.571121,5.711215,1.713364,0.115692,1.156922,0.269948,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,11.145016,2.225864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.762581,...,2.177379,0.000000,0.000000,0.000000,0.038564,1.156922,0.269948,0.0,0.0,0.0
19996,11.145016,1.222605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531290,...,1.086849,0.000000,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0
19997,31.776792,0.000240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.745542,...,0.000000,0.000000,1.142243,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0
19998,39.296786,0.000740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.621443,...,0.000000,0.000000,0.000000,0.000000,0.038564,1.156922,0.269948,0.0,0.0,0.0


In [None]:
sensitivities = [1]*len(numerical_features) # 假设所有特征的敏感度都是1
epsilons = [10, 10, 10] # 为每个特征分配相同的隐私预算

# 为数据集添加Laplace噪声
noisy_X = add_laplace_noise_to_dataset(X, sensitivities, epsilons, upperbounds, lowerbounds, count_zero, scale)
print(noisy_X)

In [53]:
# numerical_data = train_data[numerical_features]

def add_laplace_noise_to_dataset(X, feature_name, sensitivities):
    n, d = X.shape # 获取数据集的形状（数据点数量和特征数量）
    
    upperbounds=[] # 确定数据集的最大值，最小值
    lowerbounds=[] 
    count_zero=[] # 若 0 的占比过大，不予处理
    scale = [] # 用于计算 eplison
    epsilons =[]

    for feature in feature_name:
        data = X[feature]
        upperbounds.append(max(data))
        lowerbounds.append(min(data))
        zero = (data == 0).sum()
        count_zero.append(zero>(len(data)/2))
        scale.append(data.sum()/(len(data)-zero))

    for s in scale: # 计算 epsilons
        e = 1
        while s/10>=10:
            s /= 10
            e *= 0.1
        epsilons.append(e)


    for j in range(d):
        if count_zero[j]:
            continue
        upper = upperbounds[j]
        lower = lowerbounds[j]

        sensitivity = sensitivities[j] # 获取第j个特征的敏感度
        epsilon = epsilons[j] # 获取第j个特征的隐私预算
        scale = sensitivity / epsilon # 计算比例

        for i in range(n):
            
            noise = np.random.laplace(0, scale) # 生成Laplace噪声
            if (X.iloc[i][j]+noise) < upper and (X.iloc[i][j]+noise) >lower:
                X.iloc[i][j] = X.iloc[i][j] + noise # 将噪声添加到数据点的特征值中
            else:
                X.iloc[i][j] = X.iloc[i][j]
            
    return X

# X = np.array([[1,2,3],[10,20,300],[0,0,3],[0.1, 0.5, 1]])
# print(X)
# 为数据集添加Laplace噪声
numerical_data = train_data[numerical_features]
noisy_X = add_laplace_noise_to_dataset(numerical_data, feature_name=numerical_features, sensitivities=[1]*len(numerical_features))
print(noisy_X)


            f_42      f_43  f_44  f_45  f_46  f_47  f_48  f_49  f_50  \
0       6.121817  1.388597   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
1       1.396884  2.618062   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
2      30.845212  3.569395   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
3      11.017996  0.887068   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
4       9.631855  2.523316   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
...          ...       ...   ...   ...   ...   ...   ...   ...   ...   
19995  11.284755  1.654147   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
19996  11.874244  0.472014   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
19997  31.990946  0.000240   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
19998  37.340115  0.599809   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
19999   3.036537  0.262650   0.0   0.0   0.0   0.0   0.0   0.0   0.0   

            f_51  ...      f_70      f_71      f_72      f_73      f_74  \
0       2.389301  ...  3.203280  0.000000  2.197225  0.57112

In [54]:
train_data[numerical_features]=noisy_X

In [58]:
numerical_data_test = test_data[numerical_features]
noisy_X_test = add_laplace_noise_to_dataset(numerical_data_test, feature_name=numerical_features, sensitivities=[1]*len(numerical_features))
test_data[numerical_features]=noisy_X_test

In [60]:
train_data.to_csv("noisy_train.csv")
test_data.to_csv("noisy_test.csv")

In [6]:
import pandas as pd

df = pd.read_csv("toy_test.csv")
df1 = df[df.columns[1:]]
df1.head(10)


Unnamed: 0,RowId,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed
0,2541770,64,13658,22294,7003,25604,29975,27941,21218,21533,...,1.713364,1.142243,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
1,2542002,62,20095,563,31686,15908,590,27941,18800,23218,...,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
2,2542476,64,9285,22294,12851,25604,31702,27941,19606,21533,...,0.0,0.0,0.038564,1.156922,0.231384,0.0,0.0,0.0,1,0
3,2542692,45,30131,7152,16170,15908,25613,27941,21621,6675,...,0.571121,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,1,1
4,2543505,65,30256,22294,18267,21545,20366,27941,19606,6675,...,4.568972,3.426729,0.115692,1.156922,0.269948,0.0,0.0,0.0,1,0
5,2330710,49,20095,563,22861,25604,22651,27941,21218,869,...,0.571121,0.571121,0.077128,1.156922,0.269948,0.0,0.0,0.0,0,1
6,2331196,57,20095,563,22861,25604,21280,27941,21218,21533,...,5.140093,1.713364,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0
7,2331692,64,3346,22294,6767,19475,14579,27941,18800,21533,...,0.571121,0.571121,0.038564,1.156922,0.269948,0.0,0.0,0.0,0,0
8,2333152,54,27537,22294,24919,15908,12808,27941,21218,9638,...,0.0,0.0,0.0,1.156922,0.269948,0.0,0.0,0.0,1,0
9,2333652,55,23642,22294,13076,25604,26154,27941,21621,23218,...,0.0,0.0,0.077128,1.156922,0.269948,0.0,0.0,0.0,0,0


In [7]:
df1.describe

<bound method NDFrame.describe of           RowId  f_1    f_2    f_3    f_4    f_5    f_6    f_7    f_8    f_9  \
0       2541770   64  13658  22294   7003  25604  29975  27941  21218  21533   
1       2542002   62  20095    563  31686  15908    590  27941  18800  23218   
2       2542476   64   9285  22294  12851  25604  31702  27941  19606  21533   
3       2542692   45  30131   7152  16170  15908  25613  27941  21621   6675   
4       2543505   65  30256  22294  18267  21545  20366  27941  19606   6675   
...         ...  ...    ...    ...    ...    ...    ...    ...    ...    ...   
173471  1986557   55  27426  22294  11338  15908   8247  27941  18800  23218   
173472  1988282   61  30131   7152  16170  29304   5303  27941  21621  23218   
173473  1989096   52  20095    563  22861  21545  22651  27941  19203   6675   
173474  1989191   49   2809  22294   4079  25604   6283  27941  19203    869   
173475  1989403   58  23611  22294   9727  21545  21295  27941  18800  31372   

     