# Data and service model

In [None]:
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch 
import torchvision 
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
import os 
import os.path 
from torch.utils.data import DataLoader 
from tqdm.auto import tqdm 
from copy import deepcopy 
from resnet import ResNet18 
# Select the compute device. Use the GPU when one is visible and fall back to
# the CPU otherwise, so `device` never names hardware that is not present.
# NOTE: other cells in this file still call `.cuda()` directly and therefore
# continue to require a GPU.
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# MNIST preprocessing
def get_trans():
    """Return the preprocessing pipeline applied to every MNIST image.

    The pipeline is ``ToTensor`` followed by ``Normalize([0.5], [0.5])``:
    pixel values are first scaled from [0, 255] to [0, 1] and then shifted
    and scaled as ``(x - 0.5) / 0.5``, which lands them in [-1, 1].
    """
    to_tensor = torchvision.transforms.ToTensor()
    normalize = torchvision.transforms.Normalize([0.5], [0.5])
    return torchvision.transforms.Compose([to_tensor, normalize])
 
# Fetch the data on first use; files are stored under ./mnist.
DOWNLOAD_MNIST = True

# train=True selects the training split, train=False the held-out split.
# Each dataset gets its own preprocessing pipeline instance.
train_set = torchvision.datasets.MNIST(
    root="./mnist",
    train=True,
    transform=get_trans(),
    download=DOWNLOAD_MNIST,
)
val_set = torchvision.datasets.MNIST(
    root="./mnist",
    train=False,
    transform=get_trans(),
    download=DOWNLOAD_MNIST,
)

# Report how many samples each split contains.
print(len(train_set))
print(len(val_set))

In [None]:
def _balanced_subset(dataset, indices, per_class, num_classes=10):
    """Collect the first ``per_class`` samples of every class from ``dataset``.

    Args:
        dataset: indexable dataset whose items are ``(image_tensor, label)``.
        indices: iterable of dataset indices to scan, in order.
        per_class: number of samples to keep for each class.
        num_classes: labels ``0 .. num_classes - 1`` are collected.

    Returns:
        ``(images, labels)`` tensors. Samples are grouped class by class
        (class 0 first) and, inside a class, kept in scan order. A class
        with fewer than ``per_class`` samples in ``indices`` contributes
        what it has.
    """
    buckets = [[] for _ in range(num_classes)]
    unfilled = num_classes if per_class > 0 else 0
    for idx in indices:
        if unfilled == 0:
            break  # every class is complete; no need to scan further
        # Index the dataset once per sample: each access applies the
        # dataset's transform again.
        image, label = dataset[idx]
        label = int(label)
        if not 0 <= label < num_classes:
            continue
        bucket = buckets[label]
        if len(bucket) >= per_class:
            continue
        bucket.append(image)
        if len(bucket) == per_class:
            unfilled -= 1

    images = [image for bucket in buckets for image in bucket]
    labels = [cls for cls, bucket in enumerate(buckets) for _ in bucket]
    if not images:
        return torch.tensor([]), torch.tensor([], dtype=torch.long)
    # Stack the tensors directly instead of round-tripping through a list of
    # NumPy arrays, which is the slow path of `torch.tensor`.
    return torch.stack(images), torch.tensor(labels)


# Training-set size for the classifier (same number of samples per class),
# drawn from the first 50,000 entries of the MNIST training split.
train_size = 100
train_data, train_data_label = _balanced_subset(
    train_set, range(50000), train_size // 10)
train_dataset = TensorDataset(train_data, train_data_label)
print("训练集大小：",len(train_dataset))

# Validation-set size (also used as the explainer's validation set), drawn
# from entries 50,000-59,999 of the MNIST training split.
val_size = 100
val_data, val_data_label = _balanced_subset(
    train_set, range(50000, 60000), val_size // 10)
val_dataset = TensorDataset(val_data, val_data_label)
print("验证集大小：",len(val_dataset))

# Test-set size (also used as the explainer's training set), drawn from the
# first 10,000 entries of the MNIST test split.
test_size = 1000
test_data, test_data_label = _balanced_subset(
    val_set, range(10000), test_size // 10)
test_dataset = TensorDataset(test_data, test_data_label)
print("测试集大小：",len(test_dataset))



In [None]:

# Shapes of the classifier's training tensors (the message previously said
# "test set" although the training tensors are what is printed).
print("分类模型训练集大小",train_data.size(), train_data_label.size())
import torch.nn as nn
import torch
import time

class CNN(nn.Module):
    """Small two-stage convolutional classifier.

    Takes inputs of shape (batch, 1, 28, 28) and returns (batch, 10) logits.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: (1, 28, 28) -> 5x5 conv, padding 2 -> (16, 28, 28)
        #          -> ReLU -> 2x2 max-pool -> (16, 14, 14)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # Stage 2: (16, 14, 14) -> 5x5 conv, padding 2 -> (32, 14, 14)
        #          -> ReLU -> 2x2 max-pool -> (32, 7, 7)
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # Linear read-out over the flattened (32, 7, 7) feature map.
        self.out = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        """Run both convolutional stages, flatten, and return the logits."""
        features = self.conv2(self.conv1(x))  # (batch, 32, 7, 7)
        flattened = features.view(features.size(0), -1)  # (batch, 32*7*7)
        return self.out(flattened)
print("start")
EPOCH = 50        # number of passes over the training subset
BATCH_SIZE = 20   # mini-batch size
LR = 0.0001       # Adam learning rate

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# "grand" model = classifier trained on the full training subset;
# "null" model  = the same network before any training.
grand_model_path = 'model/grand_mnist_{}_shuffle.pt'.format(train_size)
null_model_path = 'model/null_mnist_{}_shuffle.pt'.format(train_size)

if os.path.isfile(grand_model_path):
    print('Loading saved class model')
    # NOTE(review): these files hold whole pickled modules. Recent PyTorch
    # releases default `torch.load` to `weights_only=True`, which rejects
    # such files; pass `weights_only=False` there if loading fails.
    grand_model = torch.load(grand_model_path).cuda()
    null_model = torch.load(null_model_path).cuda()

else:
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(os.path.dirname(grand_model_path), exist_ok=True)

    cnn = CNN().cuda()
    # Snapshot the untrained network. A plain `null_model = cnn` would alias
    # the object that is trained below, so the in-memory "null" model would
    # silently become the trained one.
    null_model = deepcopy(cnn)
    torch.save(null_model, null_model_path)
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
    loss_function = nn.CrossEntropyLoss()

    for ep in range(EPOCH):
        # Time one full train + validation pass.
        startTick = time.perf_counter()
        for img, label in train_loader:
            img = img.cuda()
            label = label.cuda()

            out = cnn(img)
            loss = loss_function(out, label)
            optimizer.zero_grad()
            loss.backward()   # compute gradients
            optimizer.step()  # apply the update

        # Validation accuracy; no gradients are needed for evaluation.
        num_correct = 0
        with torch.no_grad():
            for img, label in val_loader:
                img = img.cuda()
                label = label.cuda()
                out = cnn(img)
                # torch.max over dim 1 returns (max value, arg-max index);
                # the index is the predicted class.
                _, prediction = torch.max(out, 1)
                num_correct += (prediction == label).sum().item()

        accuracy = num_correct / val_size
        timeSpan = time.perf_counter() - startTick
        print( "第%d迭代期，验证集准确率为%f,耗时%dS" % (ep + 1, accuracy, timeSpan) )

    grand_model = cnn
    torch.save(grand_model, grand_model_path)


# Final report: grand model on the training subset, grand and null model on
# the test subset.
num_correct = 0
num_correct1 = 0
num_correct2 = 0
with torch.no_grad():
    for img, label in train_loader:
        img = img.cuda()
        label = label.cuda()
        _, prediction1 = torch.max(grand_model(img), 1)
        num_correct += (prediction1 == label).sum().item()

    for img, label in test_loader:
        img = img.cuda()
        label = label.cuda()
        _, prediction1 = torch.max(grand_model(img), 1)
        _, prediction2 = torch.max(null_model(img), 1)
        num_correct1 += (prediction1 == label).sum().item()
        num_correct2 += (prediction2 == label).sum().item()

accuracy = num_correct / train_size
accuracy1 = num_correct1 / test_size
accuracy2 = num_correct2 / test_size
print( "grand_model在训练集上准确率为%f,在测试集上准确率为%f,null_model准确率为%f" % (accuracy,accuracy1, accuracy2) )
       
    

# Train explainer model

In [None]:
from unet import UNet
from AFDS import DataFastSHAP
print("cuda:",torch.cuda.is_available(),torch.cuda.device_count(),"个")
# One explainer output per training sample.
num_features = train_size
# Reload the grand and null models from disk.
grand_model = torch.load(grand_model_path).cuda()
null_model = torch.load(null_model_path).cuda()

# Build the explainer's training set: for every class keep the first
# `test_size // 10` images of the MNIST test split that the grand model
# classifies correctly. Samples end up grouped class by class.
test_size = 1000
_per_class = test_size // 10
_selected = [[] for _ in range(10)]
_unfilled = 10 if _per_class > 0 else 0
with torch.no_grad():  # selection only needs predictions, not gradients
    for i in range(len(val_set)):
        if _unfilled == 0:
            break
        # Index once: every dataset access re-applies the transform.
        _image, _label = val_set[i]
        if len(_selected[_label]) >= _per_class:
            continue
        _logits = grand_model(_image.reshape(1, 1, 28, 28).cuda())
        if torch.max(_logits, 1)[1].item() != _label:
            continue
        _selected[_label].append(_image)
        if len(_selected[_label]) == _per_class:
            _unfilled -= 1

test_data = [img for bucket in _selected for img in bucket]
test_data_label = [c for c, bucket in enumerate(_selected) for _ in bucket]
test_data = torch.stack(test_data) if test_data else torch.tensor([])
test_data_label = torch.tensor(test_data_label)

test_dataset = TensorDataset(test_data,test_data_label)
print("解释器训练集大小：",len(test_dataset))
explainer_path = 'model/AFDS_mnist_100_explainer_10.pt'
loss_path = 'exp_fast_datashapley/AFDS_mnist_{}_explainer_10'.format(train_size)

# explainer_path = 'model/AFDS_mnist_100_explainer_value_avg_10.pt'
# loss_path = 'exp_fast_datashapley/AFDS_mnist_{}_explainer'.format(train_size)

# Reuse a saved explainer when one exists, otherwise start from a fresh UNet.
_explainer_cached = os.path.isfile(explainer_path)
if _explainer_cached:
    print('Loading saved explainer model')
    explainer = torch.load(explainer_path).cuda()
else:
    explainer = UNet(n_classes=num_features, num_down=2, num_up=1, num_convs=3).to(device)

# The FastSHAP wrapper is built identically in both cases.
fastshap = DataFastSHAP(explainer, explainer_path, loss_path, grand_model, null_model, train_data, train_data_label, num_features, normalization='additive',
                    link=nn.Softmax(dim=-1))

if not _explainer_cached:
    # Train the explainer and report the wall-clock time.
    startTick =  time.perf_counter()
    fastshap.train(
        test_dataset,
        val_data,
        val_data_label,
        class_lr = 10*LR,
        batch_size=10,
        num_samples=1,
        max_epochs=300,
        stop_epoch = 10,
        validation_samples=1,
        verbose=True)
    timeSpan =  time.perf_counter() - startTick
    print( "训练解释器耗时: %dS" % (timeSpan) )
    

# Visualize results

In [None]:
# Select `shap_test_size // 10` image(s) per class from the MNIST test split,
# keeping only images the grand model classifies correctly. Samples end up
# grouped class by class.
shap_test_size = 10
_per_class = shap_test_size // 10
_picked = [[] for _ in range(10)]
_unfilled = 10 if _per_class > 0 else 0
with torch.no_grad():  # selection only needs predictions, not gradients
    for i in range(len(val_set)):
        if _unfilled == 0:
            break
        # Index once: every dataset access re-applies the transform.
        _image, _label = val_set[i]
        if len(_picked[_label]) >= _per_class:
            continue
        _logits = grand_model(_image.reshape(1, 1, 28, 28).cuda())
        if torch.max(_logits, 1)[1].item() != _label:
            continue
        _picked[_label].append(_image)
        if len(_picked[_label]) == _per_class:
            _unfilled -= 1

test_x = [img for bucket in _picked for img in bucket]
test_x_label = [c for c, bucket in enumerate(_picked) for _ in bucket]
test_x = torch.stack(test_x) if test_x else torch.tensor([])
test_x_label = torch.tensor(test_x_label)

print(test_x.size())
print(test_x_label)
# Get explanations
# values = fastshap.shap_values(test_x.cuda(), test_x_label.cuda(), grand_model, null_model)
values = fastshap.shap_values(test_x.cuda(), grand_model, null_model)


In [None]:
print(values.shape)
# For each of the 10 explained test images keep the slice of `values` that
# belongs to the image's own label.
# NOTE(review): presumably `values` is indexed as
# (test image, training sample, class) - confirm against DataFastSHAP.
label_values = [values[k, :, test_x_label[k]] for k in range(10)]
print(np.array(label_values).shape)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
x = test_x
y = test_x_label
top_k = 10  # number of highest-valued training samples shown per test image

# For every explained test image, indices of the `top_k` training samples with
# the largest value, in descending order.
idxs = [np.array(scores).argsort()[::-1][0:top_k] for scores in label_values]

_train_images = train_data.numpy()
_train_labels = train_data_label.numpy()
train_data_topk = np.array([_train_images[list(sel)] for sel in idxs])
train_data_label_topk = np.array([_train_labels[list(sel)] for sel in idxs])
label_values_topk = np.array(
    [label_values[i][list(sel)] for i, sel in enumerate(idxs)])

# One figure row per test image: the test image itself, followed by its
# `top_k` most valuable training images. Only the first `num_classes` test
# images are drawn.
num_classes = 3
classes = ['top1', 'top2', 'top3', 'top4', 'top5', 'top6', 'top7', 'top8', 'top9', 'top10']
# Constants used to undo Normalize([0.5], [0.5]) before display.
mean = np.array([0.5])[:, np.newaxis, np.newaxis]
std = np.array([0.5])[:, np.newaxis, np.newaxis]

fig, axarr = plt.subplots(num_classes, top_k + 1, figsize=(22, 2*num_classes))
for row in range(num_classes):
    # Column 0: the test image, de-normalised and moved to (H, W, C).
    im = test_x[row].numpy() * std + mean
    im = im.transpose(1, 2, 0).astype(float)
    im = np.clip(im, a_min=0, a_max=1)
    axarr[row, 0].imshow(im, vmin=0, vmax=1)
    axarr[row, 0].set_xticks([])
    axarr[row, 0].set_yticks([])
    # The row label is the test image's class.
    axarr[row, 0].set_ylabel('{}'.format(y[row]), fontsize=20)
    if row == 0:
        axarr[row, 0].set_title('test', fontsize=20)

    # Columns 1..top_k: the highest-valued training images.
    for col in range(top_k):
        im = train_data_topk[row, col,:,:,:] * std + mean
        im = im.transpose(1, 2, 0).astype(float)
        axarr[row, col + 1].imshow(im, vmin=0, vmax=1)
        axarr[row, col + 1].set_xticks([])
        axarr[row, col + 1].set_yticks([])
        if row == 0:
            # Title by rank. Indexing with `col` rather than the test label
            # `y[col]` keeps the titles right whatever labels `y` holds.
            axarr[row, col + 1].set_title('{}'.format(classes[col]), fontsize=20)
        # o: original training label, p: grand-model prediction, s: value.
        with torch.no_grad():
            _pred = torch.max(
                grand_model(torch.tensor(train_data_topk[row, col]).reshape(1, 1, 28, 28).cuda()),
                1)[1].item()
        axarr[row, col + 1].set_xlabel('o:{},p:{},s:{:.3f}'.format(train_data_label_topk[row, col],_pred,label_values_topk[row, col]), fontsize=15, fontweight='bold')

plt.tight_layout()
# Make sure the output directory exists before saving the figure.
os.makedirs("exp_fast_datashapley", exist_ok=True)
plt.savefig("exp_fast_datashapley/Visualize_AFDS_mnist_{}.png".format(train_size))
plt.show()

# Test performance

In [None]:
def performance_plots(X, y, vals, link,test_x, y_ture, name=None, 
                        num_plot_markers=20, sources=None):
    """Plot the effect of removing the most valuable training points.

    For every valuation in ``vals`` the training points (or sources) are
    removed from most to least valuable and ``portion_performance`` is
    evaluated at each removal level. A random removal order is evaluated as
    a baseline. All curves are drawn onto the current pyplot figure.

    Args:
        X: training data.
        y: training labels.
        vals: one valuation, or a list/tuple of valuations, each with one
            value per training point. The legend assumes the order
            ours, DataShapley, LOO.
        link: forwarded unchanged to ``portion_performance``.
        test_x: the test sample the loss is measured on.
        y_ture: true label of ``test_x``.
        name: currently unused (saving the figure is disabled).
        num_plot_markers: desired number of points on each curve.
        sources: optional grouping of data points into sources, as a dict
            ``{source_id: index_array}`` or an assignment array. Source ids
            are expected to be ``0 .. n - 1``. ``None`` means one source
            per data point.

    Returns:
        A list with one score array per valuation, followed by the score
        array of the random baseline as the last element.
    """
    plt.rcParams['figure.figsize'] = 8,8
    plt.rcParams['font.size'] = 25
    plt.xlabel('Fraction of training data removed (%)')
    plt.ylabel('Value loss', fontsize=20)

    if not isinstance(vals, (list, tuple)):
        vals = [vals]
    if sources is None:
        sources = {i:np.array([i]) for i in range(len(X))}
    elif not isinstance(sources, dict):
        sources = {i:np.where(sources==i)[0] for i in set(sources)}
    num_sources = len(sources)

    # Aggregate the per-point values into one value per source.
    vals_sources = [
        np.array([np.sum(np.array(val)[sources[i]]) for i in range(num_sources)])
        for val in vals
    ]

    if num_sources < num_plot_markers:
        num_plot_markers = num_sources - 1
    # With a single source (or a non-positive request) the marker count would
    # be 0 and the step computation below would divide by zero.
    num_plot_markers = max(num_plot_markers, 1)

    # Numbers of removed sources at which the model is re-evaluated.
    plot_points = np.arange(
        0,
        max(num_sources - 10, num_plot_markers),
        max(num_sources // num_plot_markers, 1)
    )

    # np.argsort is ascending; [::-1] yields most-valuable-first order.
    perfs = [
        portion_performance(
            X, y, np.argsort(vals_source)[::-1], plot_points, link,
            test_x, y_ture, sources=sources)
        for vals_source in vals_sources
    ]
    # Baseline: the same evaluation with a random removal order.
    rnd = np.mean([portion_performance(X,y,
        np.random.permutation(np.argsort(vals_sources[0])[::-1]),
        plot_points,link, test_x, y_ture, sources=sources) for _ in range(1)], 0)

    removed_pct = plot_points/len(X) * 100
    plt.plot(removed_pct, perfs[0], '-', lw=5, ms=10, color='b')
    plt.plot(removed_pct, rnd, ':', lw=5, ms=10, color='r')

    # Legend entries follow the order in which the curves are drawn.
    if len(vals)==3:
        plt.plot(removed_pct, perfs[1], '-.', lw=5, ms=10, color='g')
        plt.plot(removed_pct, perfs[-1], '--', lw=5, ms=10, color='orange')
        legends = ['GFDS+', 'Random','DataShapley ', 'LOO']
    elif len(vals)==2:
        legends = ['GFDS+', 'Random','DataShapley ']
        plt.plot(removed_pct, perfs[1], '-.', lw=5, ms=10, color='g')
    else:
        legends = ['GFDS+', 'Random']
    plt.legend(legends)

    perfs.append(rnd)
    return perfs
       
def portion_performance(X,y,idxs, plot_points, link,test_x, y_ture, sources=None):
    """Retrain after removing leading sources and score one test sample.

    For every ``p`` in ``plot_points`` the first ``p`` sources of ``idxs``
    are dropped, a fresh copy of the untrained model stored at the
    module-level ``null_model_path`` is trained on the remaining data
    (50 epochs, Adam, lr 1e-4, batch size 20), and the cross-entropy loss on
    ``test_x`` is recorded.

    Args:
        X: training data, indexable by an integer index array.
        y: training labels, indexable the same way.
        idxs: source ids in removal order (first element is removed first).
        plot_points: numbers of removed sources to evaluate.
        link: unused here; kept for interface compatibility.
        test_x: a single test image, reshaped to (1, 1, 28, 28).
        y_ture: label tensor of ``test_x``, reshaped to (1,).
        sources: dict ``{source_id: index_array}`` or an assignment array;
            ``None`` means one source per data point.

    Returns:
        ``np.ndarray`` with one loss value per entry of ``plot_points``.
    """
    from copy import deepcopy

    if sources is None:
        sources = {i:np.array([i]) for i in range(len(X))}
    elif not isinstance(sources, dict):
        sources = {i:np.where(sources==i)[0] for i in set(sources)}

    # Read the untrained model from disk once; each evaluation below starts
    # from its own copy instead of re-reading the file.
    initial_model = torch.load(null_model_path)
    loss_function = nn.CrossEntropyLoss()
    probe_x = test_x.view(1,1,28,28).cuda()
    probe_y = y_ture.view(1).cuda()

    scores = []
    for removed in plot_points:
        keep_idxs = np.concatenate([sources[idx] for idx
                                    in idxs[removed:]], -1)
        # Restore the original data order of the kept samples.
        origin_idxs = np.sort(keep_idxs)
        # as_tensor avoids the copy-construct warning when X / y are tensors.
        X_S = torch.as_tensor(X[origin_idxs]).cuda()
        y_X = torch.as_tensor(y[origin_idxs]).cuda()
        class_train_loader = DataLoader(
            TensorDataset(X_S, y_X), batch_size=20, shuffle=True)

        net = deepcopy(initial_model).cuda()
        class_optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
        for ep in range(50):
            for img, label in class_train_loader:
                loss = loss_function(net(img), label)
                class_optimizer.zero_grad()
                loss.backward()
                class_optimizer.step()

        # Scoring needs no gradients.
        with torch.no_grad():
            scores.append(loss_function(net(probe_x), probe_y).item())
    return np.array(scores)


In [None]:
# Run the removal experiment for each explained test sample and aggregate.
_compare_baselines = train_size==100
if _compare_baselines:
    # Pre-computed DataShapley (TMC) and leave-one-out values, one float per
    # line. They do not depend on the test sample, so read them once, and use
    # `with` so the files are closed again.
    with open("exp_fast_datashapley/vals_tmc_mnist_{}.txt".format(train_size),"r") as f:
        tmc = [float(line.strip('\n')) for line in f]
    with open("exp_fast_datashapley/vals_loo_mnist_{}.txt".format(train_size),"r") as f:
        loo = [float(line.strip('\n')) for line in f]

_plot_name = "exp_fast_datashapley/AFDS_mnist__{}_shuffle_compare".format(train_size)
H_eta = []
for i in range(shap_test_size):
    # Values of all training samples for the i-th test image's own label.
    _own_values = values[i,:,test_x_label[i]]
    if _compare_baselines:
        _candidates = [_own_values, tmc, loo]
    else:
        _candidates = [_own_values]
    perfs = performance_plots(train_data,train_data_label,_candidates, nn.Softmax(dim=-1), test_x[i], test_x_label[i], name=_plot_name, num_plot_markers=20)
    H_eta.append(perfs)

# Mean and standard deviation over the test samples, per method and per
# removal level. The random baseline is always the last method.
H_std = np.std(np.array(H_eta),axis=0)
H_mean = np.mean(np.array(H_eta),axis=0)
print('ours:',H_mean[0])
print('std:',H_std[0])
if train_size==100:
    print('DataShapley:',H_mean[1])
    print('std:',H_std[1])
    print('loo:',H_mean[2])
    print('std:',H_std[2])
print('random:',H_mean[-1])    
print('std:',H_std[-1])