In [13]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

### 搭建网络模块

In [14]:
from read_file import read_file, getTensorDataset
X, y = read_file(filepath='./4tasks-encode.xlsx', iScaler=False)
X_train, X_test, y_train, y_test = train_test_split(X.astype(np.float32), y, test_size=0.33, random_state=25)
train_loader = DataLoader(dataset=getTensorDataset(X_train, y_train), batch_size=54)
val_loader = DataLoader(dataset=getTensorDataset(X_test,y_test),batch_size=37)

In [62]:
class scaled_dot_product_attention(nn.Module):

    def __init__(self, att_dropout=0.0):
        super(scaled_dot_product_attention, self).__init__()
        self.dropout = nn.Dropout(att_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None):
        '''
        args:
            q: [batch_size, q_length, q_dimension]
            k: [batch_size, k_length, k_dimension]
            v: [batch_size, v_length, v_dimension]
            q_dimension = k_dimension = v_dimension
            scale: 缩放因子
        return:
            attention, alpha
        '''
        # 快使用神奇的爱因斯坦求和约定吧！
        alpha = torch.einsum('ijk,ilk->ijl', [q, k])# query和key向量相乘
        if scale:
            alpha = alpha * scale
        alpha = self.softmax(alpha)
        alpha = self.dropout(alpha)
        attention = torch.einsum('ijl,ilk->ijk', [alpha, v])
        return attention, alpha

In [4]:
class TaskNet(nn.Module):
    
    def __init__(self, input_dim, hidden_dim):
        '''
        input_dim:输入特征的维度
        hidden_dim:隐藏层维度
        query_dim:query向量的维度,因为其直接由隐藏层输出,故query_dim = output_dim
        此网络模块的最终输出是query向量
        '''
        super(TaskNet, self).__init__()
        self.query_dim = hidden_dim
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
    def forward(self,X):
        query = self.fc1(X)
        return query
        

In [None]:
tasknet = TaskNet(18,64)
for train_X, train_y in train_loader:
    out = tasknet(train_X)
    print(out.shape)
    print(out)

In [9]:
class Expert(nn.Module):
    
    def __init__(self, input_dim, hidden_dim):
        '''
        input_dim:输入特征的维度
        hidden_dim:隐藏层维度
        key_dim:key向量的维度,因为其直接由隐藏层输出,故key_dim = output_dim
        此网络模块的最终输出是key向量
        '''
        super(Expert, self).__init__()
        self.key_dim = hidden_dim
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
    def forward(self,X):
        key = self.fc1(X)
        return key
        

In [12]:
tasknet = TaskNet(18,64)
exp = Expert(18,64)
for train_X, train_y in train_loader:
    query = tasknet(train_X)
    key = exp(train_X)
    alpha = query@key.T
    print(alpha.shape)

torch.Size([54, 54])


In [65]:
class Tower(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(Tower, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16,output_dim)
        )

    def forward(self, X):
        out = self.fc1(X)
        return out

### 拼接网络模块，构建完整网络

In [71]:
class AOE(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, task_num, expert_num):
        super(AOE, self).__init__()
        # model
        self.tasknet_list = nn.ModuleList([TaskNet(18,64) for i in range(task_num)])
        self.expert_list = nn.ModuleList([Expert(18,64) for i in range(expert_num)])
        self.tower_list = nn.ModuleList([Tower(64,1) for i in range(task_num)])
        # vector
        self.query = []
        self.key = []
        self.value = []
        self.alpha = []
        self.attention = []
        # function
        self.softmax = nn.Softmax(dim=1)# dim=0代表作用于行向量，dim=1代表作用于列向量

    def forward(self, X):
        
        # 构建query
        self.query = [task_net(X) for task_net in self.tasknet_list]
        self.query = torch.stack(self.query)

        # 构建key
        self.key = [expert(X) for expert in self.expert_list]
        self.key = torch.stack(self.key)  #shape:torch.Size([5, 64])

        # 计算权重alpha
        for q in self.query:
            a = [(q@k.T) for k in self.key]
            print(a[1].shape)
            a = self.softmax(torch.tensor(a))
            self.alpha.append(a)

        # 构建value
        self.value = self.key.clone()
        
        # 计算attention值
        for a in self.alpha:
            a.view(1,5)
            self.attention.append(a@self.value)
        self.attention = torch.cat(self.attention, dim=0).reshape(4,64) #将attention列表转为tensor类型

        # 拼接attention值和TaskNet输出值，然后送入Tower
        # 其实TaskNet的输出值就是query
        query_add_attention = self.query + self.attention

        ## 传入TOWER
        tower_input = query_add_attention
        tower_output = [tower(ti) for tower, ti in zip(self.tower_list, tower_input)]

        return tower_output

### 读取数据，构建数据集

In [72]:
from read_file import read_file, getTensorDataset
X, y = read_file(filepath='./4tasks-encode.xlsx', iScaler=False)
X_train, X_test, y_train, y_test = train_test_split(X.astype(np.float32), y, test_size=0.33, random_state=25)
train_loader = DataLoader(dataset=getTensorDataset(X_train, y_train), batch_size=54)
val_loader = DataLoader(dataset=getTensorDataset(X_test,y_test),batch_size=37)

In [73]:
model = AOE(18, 64, 1, 4, 5)
for train_X, train_y in val_loader:
    out = model(train_X)
    print(out)
    break

TypeError: expected Tensor as element 0 in argument 0, but got list

In [31]:
X_train.shape
out[1].shape

torch.Size([1])

### 构建q,k,v矩阵，且qkv三个矩阵不参与梯度更新，因此要with torch.no_grad()

In [16]:
from read_file import read_file, getTensorDataset
X, y = read_file(filepath='./4tasks-encode.xlsx', iScaler=False)
X_train, X_test, y_train, y_test = train_test_split(X.astype(np.float32), y, test_size=0.33, random_state=25)
train_loader = DataLoader(dataset=getTensorDataset(X_train, y_train), batch_size=54)
val_loader = DataLoader(dataset=getTensorDataset(X_test,y_test),batch_size=37)

In [34]:
# 构建query矩阵
'''
1. 确定任务数量
2. 初始化task_num个网络,可存入了列表
3. 将每个网络输出的query向量拼接起来
'''
task_num = 4
tasknet_list = nn.ModuleList([TaskNet(18,64) for i in range(task_num)])
query = []
for train_X, train_y in train_loader:
    query = [task_net(train_X) for task_net in tasknet_list]
    query = torch.stack(query) # shape:torch.Size([4, 64])
print(query.shape)  # [task_num, batchsize, hidden_dim]

torch.Size([4, 54, 64])


In [35]:
# 构建key矩阵
'''
1. 确定专家网络数量
2. 初始化expert_num个网络
3. 拼接每个专家网络输出的key向量
'''
expert_num = 6
expert_list = nn.ModuleList([Expert(18,64) for i in range(expert_num)])
key = []
for train_X, train_y in train_loader:
    key = [expert(train_X) for expert in expert_list]
    key = torch.stack(key)  #shape:torch.Size([5, 64])
print(key.shape)  # [task_num, batchsize, hidden_dim]

torch.Size([6, 54, 64])


In [51]:
from torch import cosine_similarity
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import pandas as pd


In [92]:
## 计算alpha矩阵的第二种方法
softmax = nn.Softmax(dim=1)
alpha = []
for q in query:
    alpha_temp = []
    for k in key:
        cos_similarity = cosine_similarity(q, k).mean() # 采用余弦相似度计算行向量之间的相似度（得到54个值， 然后取平均）
        alpha_temp.append(cos_similarity)
    alpha_temp = torch.stack(alpha_temp)  
    print(type(alpha))   
    alpha.append(alpha_temp)
alpha = torch.stack(alpha)
alpha = softmax(alpha)
alpha


<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


tensor([[0.1460, 0.1452, 0.1774, 0.1804, 0.1597, 0.1914],
        [0.1646, 0.1623, 0.1839, 0.1578, 0.1667, 0.1647],
        [0.1659, 0.1582, 0.1660, 0.1759, 0.1710, 0.1631],
        [0.1619, 0.1561, 0.1691, 0.1730, 0.1633, 0.1766]],
       grad_fn=<SoftmaxBackward>)

In [93]:
# 构建value矩阵
value = key.clone()
value[1].shape

torch.Size([54, 64])

In [94]:
attention = []
for a_i in alpha:
    # 计算第一个任务关于专家网络的attention值
    temp = []
    for a_ij, vi in zip(a_i, value):
        # 权重和value值分别相乘, 存入列表后求和
        attention_i = a_ij*vi
        temp.append(attention_i)
    attention.append(sum(temp))
attention = torch.stack(attention)
attention.shape

torch.Size([4, 54, 64])

In [15]:
# 计算attention值
'''
1. alpha[0]:[0.2104, 0.1306, 0.2965, 0.1911, 0.1713]
   针对任务A来说,这五个值分别代表五个专家网络所占的权重
   针对任务B来说,则需要查看alpha[1]

2. value矩阵的形状为[5, 54 64],5代表5个专家网络, 54代表batchsize, 64代表专家网络的输出维度

3. 若想输出针对任务A的attention值,则
'''
attention = []
for a in alpha:
    a.view(1,5)
    attention.append(a@value)

In [16]:
m1 = torch.tensor([1,2,3,4]).reshape(2,2)
m2 = torch.tensor([1,2,3,4]).reshape(2,2)
m1+m2

tensor([[2, 4],
        [6, 8]])