In [4]:
import torch
import torch.nn as nn 
import torch.nn.functional as F

* 时间维度上的线性加和

In [5]:
class ASP(torch.nn.Module):
    ''' Attentive Statistics Pooling.

        Pools a [B, C, T] feature map over the time axis into an
        attention-weighted mean and standard deviation, returning [B, 2C].

        Okabe K , Koshinaka T , Shinoda K . Attentive Statistics Pooling for Deep Speaker Embedding[J]. 2018.

        Args:
            channel: number of input channels C.
            eps: lower bound applied to the weighted variance before the
                square root (keeps the output and its gradient finite).
    '''
    def __init__(self, channel, eps=1.0e-10):

        super().__init__()

        self.eps = eps

        # Two kernel-size-1 convolutions, i.e. per-frame linear maps:
        # conv1 keeps C channels, conv2 reduces them to one score per frame.
        self.conv1 = torch.nn.Conv1d(in_channels=channel, out_channels=channel, kernel_size=1)
        self.conv2 = torch.nn.Conv1d(in_channels=channel, out_channels=1, kernel_size=1)

        self.conv1 = torch.nn.utils.weight_norm(self.conv1)
        self.conv2 = torch.nn.utils.weight_norm(self.conv2)

    def forward(self, x):
        """Return the concatenated weighted mean and std of x ([B, C, T] -> [B, 2C]).

        The shape prints are kept on purpose: this cell is used to inspect
        intermediate shapes.
        """
        print("x:",x.shape)
        o = self.conv1(x).relu()     # [B, C, T]
        print("conv1:",o.shape)
        # conv2 (C -> 1, kernel 1) maps every frame to one scalar score.
        o = self.conv2(o).softmax(2) # [B, 1, T]; softmax over time, so the weights sum to 1 along T
        print("conv2:",o.shape)

        # o is broadcast from [B, 1, T] to [B, C, T]: every channel of x is
        # scaled by the same per-frame weight, then summed over time.
        u = (o*x).sum(2)          # [B, C] weighted mean
        print("u:",u.shape)
        s = (o*x**2).sum(2)-u**2  # [B, C] weighted variance, E[x^2] - E[x]^2
        print("s:",s.shape)

        # Rounding can make the variance zero or slightly negative; clamp it so
        # sqrt yields neither NaN values nor infinite gradients.
        return torch.cat([u, s.clamp(min=self.eps).sqrt()], 1)     # [B, 2C]


In [6]:
# Build the pooling layer for inputs with 39 channels.
model = ASP(39)

In [7]:
# Random input of shape [42, 39, 135]; ASP.forward labels these axes [B, C, T].
x = torch.rand(42,39,135)

In [8]:
# Run the layer: forward() prints the intermediate shapes and the cell displays the pooled tensor.
model(x)

x: torch.Size([42, 39, 135])
conv1: torch.Size([42, 39, 135])
conv2: torch.Size([42, 1, 135])
u: torch.Size([42, 39])
s: torch.Size([42, 39])


tensor([[0.5338, 0.4669, 0.5200,  ..., 0.3101, 0.2956, 0.2848],
        [0.4621, 0.4887, 0.5111,  ..., 0.2922, 0.2771, 0.2813],
        [0.5171, 0.4744, 0.4782,  ..., 0.2864, 0.2823, 0.2931],
        ...,
        [0.4885, 0.4930, 0.5019,  ..., 0.2850, 0.2883, 0.3005],
        [0.4788, 0.4366, 0.4422,  ..., 0.2658, 0.2730, 0.2883],
        [0.4829, 0.5137, 0.5379,  ..., 0.2701, 0.2771, 0.2840]],
       grad_fn=<CatBackward>)

In [9]:
x.shape

torch.Size([42, 39, 135])

In [10]:
# Random tensor of shape [42, 1, 135] -- the same shape as the conv2 output printed by ASP.forward above.
weight = torch.rand(42,1,135)

In [11]:
# Element-wise product: the size-1 middle axis of `weight` is broadcast across the 39 channels of `x`.
v = x * weight
v.shape

torch.Size([42, 39, 135])

In [12]:
# Small integer tensor of shape [2, 2, 3] used for the toy experiments in the following cells.
x_1 = torch.tensor([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])

In [13]:
x_1.shape

torch.Size([2, 2, 3])

In [14]:
x_1

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [15]:
# Softmax along the last axis (dim=2); x_1 holds integers, so it is cast to float first.
# Every row differs from the others only by a constant offset, hence the identical outputs.
x_1.float().softmax(dim = 2)

tensor([[[0.0900, 0.2447, 0.6652],
         [0.0900, 0.2447, 0.6652]],

        [[0.0900, 0.2447, 0.6652],
         [0.0900, 0.2447, 0.6652]]])

In [16]:
x_1

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [17]:
# Sum over the last axis (dim=2); the reduced axis is dropped, giving shape [2, 2].
x_1.sum(dim = 2)

tensor([[ 6, 15],
        [24, 33]])

In [18]:
x_1.shape

torch.Size([2, 2, 3])

In [19]:
# Toy weights of shape [2, 1, 3]: one weight per position along the last axis, shared by all rows.
test_weight = torch.tensor([[[1,2,3]],[[4,5,6]]])

In [20]:
x_1

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [21]:
test_weight

tensor([[[1, 2, 3]],

        [[4, 5, 6]]])

In [22]:
# The size-1 middle axis of test_weight is broadcast, so both rows of each sample are scaled by the same weights.
x_1*test_weight

tensor([[[ 1,  4,  9],
         [ 4, 10, 18]],

        [[28, 40, 54],
         [40, 55, 72]]])

In [23]:
# Toy weights of shape [2, 2, 1]: one weight per row, to be broadcast along the last axis.
test_weight2 = torch.tensor([[[1],[2]],[[3],[4]]])
test_weight2

tensor([[[1],
         [2]],

        [[3],
         [4]]])

In [24]:
x_1

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [25]:
# The size-1 last axis of test_weight2 is broadcast, so each row is scaled by a single weight.
x_1 * test_weight2

tensor([[[ 1,  2,  3],
         [ 8, 10, 12]],

        [[21, 24, 27],
         [40, 44, 48]]])

In [26]:
class myASP(torch.nn.Module):
    ''' Attentive Statistics Pooling (annotated copy of ASP).

        Pools a [B, C, T] feature map over the time axis into an
        attention-weighted mean and standard deviation, returning [B, 2C].

        Okabe K , Koshinaka T , Shinoda K . Attentive Statistics Pooling for Deep Speaker Embedding[J]. 2018.

        Args:
            channel: number of input channels C.
            eps: lower bound applied to the weighted variance before the
                square root (keeps the output and its gradient finite).
    '''
    def __init__(self, channel, eps=1.0e-10):

        super().__init__()

        self.eps = eps

        self.conv1 = torch.nn.Conv1d(in_channels=channel, out_channels=channel, kernel_size=1)
        self.conv2 = torch.nn.Conv1d(in_channels=channel, out_channels= 1 , kernel_size=1)
        self.conv1 = torch.nn.utils.weight_norm(self.conv1)
        self.conv2 = torch.nn.utils.weight_norm(self.conv2)

    def forward(self, x):
        """Return the concatenated weighted mean and std of x ([B, C, T] -> [B, 2C])."""
        print("x:",x.shape)
        o = self.conv1(x).relu()     # [B, C, T]
        print("conv1:",o.shape)
        # The attention weights come from conv1 followed by conv2. conv2 looks
        # like the vector v of the paper: it visits every time step and
        # linearly combines the channels into one value.
        o = self.conv2(o).softmax(2) # [B, 1, T]; softmax over time, so the weights sum to 1 along T
        print("conv2:",o.shape)
        # NOTE(author): the x used below should perhaps be replaced by e_t -- unresolved.
        # o is broadcast from [B, 1, T] to [B, C, T]: every channel of x is
        # scaled by the same per-frame weight, then summed over time.
        u = (o*x).sum(2)          # [B, C] weighted mean
        print("u:",u.shape)
        s = (o*x**2).sum(2)-u**2  # [B, C] weighted variance, E[x^2] - E[x]^2
        print("s:",s.shape)

        # Rounding can make the variance zero or slightly negative; clamp it so
        # sqrt yields neither NaN values nor infinite gradients.
        return torch.cat([u, s.clamp(min=self.eps).sqrt()], 1)     # [B, 2C]


In [27]:
class myASP2(torch.nn.Module):
    ''' Attentive Statistics Pooling with per-channel attention weights.

        Unlike ASP/myASP, conv2 keeps C output channels, so every channel
        gets its own softmax distribution over time. Input [B, C, T],
        output [B, 2C] (weighted mean and standard deviation).

        Okabe K , Koshinaka T , Shinoda K . Attentive Statistics Pooling for Deep Speaker Embedding[J]. 2018.

        Args:
            channel: number of input channels C.
            eps: lower bound applied to the weighted variance before the
                square root (keeps the output and its gradient finite).
    '''
    def __init__(self, channel, eps=1.0e-10):

        super().__init__()

        self.eps = eps

        self.conv1 = torch.nn.Conv1d(in_channels=channel, out_channels=channel, kernel_size=1)
        self.conv2 = torch.nn.Conv1d(in_channels=channel, out_channels= channel , kernel_size=1)
        self.conv1 = torch.nn.utils.weight_norm(self.conv1)
        self.conv2 = torch.nn.utils.weight_norm(self.conv2)

    def forward(self, x):
        """Return the concatenated weighted mean and std of x ([B, C, T] -> [B, 2C])."""
        print("x:",x.shape)
        o = self.conv1(x).relu()     # [B, C, T]
        print("conv1:",o.shape)
        # The attention weights come from conv1 followed by conv2.
        o = self.conv2(o).softmax(2) # [B, C, T]; softmax over time, so each channel's weights sum to 1 along T
        print("conv2:",o.shape)
        # NOTE(author): the x used below should perhaps be replaced by e_t -- unresolved.
        # o already has shape [B, C, T], so no broadcasting happens here:
        # each channel is weighted by its own attention distribution.
        u = (o*x).sum(2)          # [B, C] weighted mean
        print("u:",u.shape)
        s = (o*x**2).sum(2)-u**2  # [B, C] weighted variance, E[x^2] - E[x]^2
        print("s:",s.shape)

        # Rounding can make the variance zero or slightly negative; clamp it so
        # sqrt yields neither NaN values nor infinite gradients.
        return torch.cat([u, s.clamp(min=self.eps).sqrt()], 1)     # [B, 2C]


In [28]:
# Build the per-channel-attention variant for 39 input channels.
model2 = myASP2(39)

In [29]:
# Run it on the same input `x`; here the conv2 output printed by forward() is [42, 39, 135] instead of [42, 1, 135].
model2(x)

x: torch.Size([42, 39, 135])
conv1: torch.Size([42, 39, 135])
conv2: torch.Size([42, 39, 135])
u: torch.Size([42, 39])
s: torch.Size([42, 39])


tensor([[0.5270, 0.4705, 0.5324,  ..., 0.3098, 0.2953, 0.2856],
        [0.4577, 0.4902, 0.5157,  ..., 0.2905, 0.2775, 0.2822],
        [0.5090, 0.4753, 0.4802,  ..., 0.2862, 0.2827, 0.2914],
        ...,
        [0.4829, 0.4920, 0.5042,  ..., 0.2845, 0.2883, 0.3016],
        [0.4726, 0.4381, 0.4485,  ..., 0.2669, 0.2723, 0.2894],
        [0.4757, 0.5119, 0.5442,  ..., 0.2702, 0.2745, 0.2847]],
       grad_fn=<CatBackward>)

In [44]:

import numpy as np

import torch
import torch.nn.functional as F



### There are some basic custom components/layers. ###

## Base ✿
class TdnnAffine(torch.nn.Module):
    """ A TDNN affine component implemented with conv1d.
        y = splice(w * x, context) + b
    @input_dim: number of dims of frame <=> inputs channels of conv
    @output_dim: number of layer nodes <=> outputs channels of conv
    @context: a strictly increasing list of frame offsets
        e.g.  [-2,0,2]
    @bias: add a learnable bias when True
    @pad: zero-pad the time axis by the left/right context before convolving
    @stride: conv stride, used to subsample frames
    @groups: conv groups; input_dim must be divisible by it
    @norm_w: L2-normalize the filters along dim 1 in forward
    @norm_f: L2-normalize the inputs along dim 1 in forward (only for a single-frame context)
    If context is [0], then the TdnnAffine is equal to linear layer.
    """
    def __init__(self, input_dim, output_dim, context=[0], bias=True, pad=True, stride=1, groups=1, norm_w=False, norm_f=False):
        super(TdnnAffine, self).__init__()
        assert input_dim % groups == 0
        # Check to make sure the context sorted and has no duplicated values
        for index in range(0, len(context) - 1):
            if(context[index] >= context[index + 1]):
                raise ValueError("Context tuple {} is invalid, such as the order.".format(context))

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.context = context
        self.bool_bias = bias
        self.pad = pad
        self.groups = groups

        self.norm_w = norm_w
        self.norm_f = norm_f

        # It is used to subsample frames with this factor
        self.stride = stride

        # The kernel always covers offset 0, even if the context does not list it.
        self.left_context = context[0] if context[0] < 0 else 0 
        self.right_context = context[-1] if context[-1] > 0 else 0 

        self.tot_context = self.right_context - self.left_context + 1

        # Do not support sphereConv now.
        if self.tot_context > 1 and self.norm_f:
            self.norm_f = False
            print("Warning: do not support sphereConv now and set norm_f=False.")

        kernel_size = (self.tot_context,)

        self.weight = torch.nn.Parameter(torch.randn(output_dim, input_dim//groups, *kernel_size))

        if self.bool_bias:
            self.bias = torch.nn.Parameter(torch.randn(output_dim))
        else:
            self.register_parameter('bias', None)

        # init weight and bias. It is important
        self.init_weight()

        # Only build a mask when some offsets inside the kernel span are skipped.
        if len(context) != self.tot_context:
            # Shape [1, 1, tot_context]; it is broadcast over the output and
            # input channel axes of the weight, which keeps it small.
            self.mask = torch.tensor([[[ 1 if index in context else 0 \
                                        for index in range(self.left_context, self.right_context + 1) ]]])
        else:
            self.mask = None

        # Kept for backward compatibility: set to True once forward() has
        # moved the mask to the device of the inputs.
        self.selected_device = False

    def init_weight(self):
        """Initialise the weight from N(0, 0.01) and the bias (if any) with zeros."""
        # Note, var should be small to avoid slow-shrinking
        torch.nn.init.normal_(self.weight, 0., 0.01)

        if self.bias is not None:
            torch.nn.init.constant_(self.bias, 0.)


    def forward(self, inputs):
        """
        @inputs: a 3-dimensional tensor (a batch), including [samples-index, frames-dim-index, frames-index]
        @return: the conv1d output, a 3-dimensional tensor with output_dim channels
        """
        assert len(inputs.shape) == 3
        assert inputs.shape[1] == self.input_dim

        # Do not use conv1d.padding for self.left_context + self.right_context != 0 case.
        if self.pad:
            inputs = F.pad(inputs, (-self.left_context, self.right_context), mode="constant", value=0)

        assert inputs.shape[2] >=  self.tot_context

        # The mask is a plain tensor attribute (not a parameter or buffer), so
        # module.to()/cuda() does not move it. Move it here whenever its device
        # differs from the inputs; after the first move this is only a cheap
        # device comparison.
        if self.mask is not None and self.mask.device != inputs.device:
            self.mask = self.mask.to(inputs.device)
            self.selected_device = True

        # Zero the kernel taps whose offsets are not listed in the context.
        filters = self.weight  * self.mask if self.mask is not None else self.weight

        if self.norm_w:
            filters = F.normalize(filters, dim=1)

        if self.norm_f:
            inputs = F.normalize(inputs, dim=1)

        outputs = F.conv1d(inputs, filters, self.bias, stride=self.stride, padding=0, dilation=1, groups=self.groups)

        return outputs

    def extra_repr(self):
        """Return the constructor settings shown in the module's repr."""
        return '{input_dim}, {output_dim}, context={context}, bias={bool_bias}, stride={stride}, ' \
               'pad={pad}, groups={groups}, norm_w={norm_w}, norm_f={norm_f}'.format(**self.__dict__)

    @classmethod
    def thop_count(cls, m, x, y):
        """Operation-count hook: accumulate the op count of module `m` into m.total_ops.

        @m: the TdnnAffine instance being profiled (must already have a total_ops attribute)
        @x: tuple of inputs passed to the module (unused apart from unpacking)
        @y: the module output
        """
        x = x[0]

        kernel_ops = torch.zeros(m.weight.size()[2:]).numel()  # number of kernel taps
        bias_ops = 1 if m.bias is not None else 0

        # Each output element reads input_dim // groups input channels per tap,
        # which matches the second dim of m.weight.
        total_ops = y.nelement() * (m.input_dim // m.groups * kernel_ops + bias_ops)

        m.total_ops += torch.DoubleTensor([int(total_ops)])



In [41]:
# Attention-based
class AttentionAlphaComponent(torch.nn.Module):
    """Compute the alpha with attention module.
            alpha = softmax(v'·f(w·x + b) + k) or softmax(v'·x + k)
    where f is relu here and bias could be lost.
    Support: 
            1. Single or Multi-head attention
            2. One affine or two affine
            3. Share weight (last affine = vector) or un-shared weight (last affine = matrix)
            4. Self-attention or time context attention (supported by context parameter of TdnnAffine)
            5. Different temperatures for different heads.

    @input_dim: number of input channels
    @num_head: number of attention heads (>= 1)
    @split_input: with several heads, give each head its own slice of the input channels (grouped affine)
    @share: True -> one weight per frame and head; False -> one weight per frame, head and channel
    @affine_layers: 1 (softmax(v'x + k)) or 2 (relu-affine followed by the last affine)
    @hidden_size: output size per head of the first affine when affine_layers=2
    @context, @bias: forwarded to the TdnnAffine layers
    @temperature: divide the scores of each head by a temperature (only used when num_head > 1)
    @fixed: use fixed temperatures max(1, (i // 2) * 5) for head i instead of learnable 1 + t**2
    """
    def __init__(self, input_dim, num_head=1, split_input=True, share=True, affine_layers=2, 
                 hidden_size=64, context=[0], bias=True, temperature=False, fixed=True):
        super(AttentionAlphaComponent, self).__init__()
        assert num_head >= 1
        # Multi-head case.
        if num_head > 1:
            if split_input:
                # Make sure features/planes with input_dim dims could be split to num_head parts.
                assert input_dim % num_head == 0
            if temperature:
                if fixed:
                    t_list = []
                    for i in range(num_head):
                        t_list.append([[max(1, (i // 2) * 5)]])
                    # shape [1, num_head, 1, 1]
                    self.register_buffer('t', torch.tensor([t_list]))
                else:
                    # Different heads have different temperature.
                    # Use 1 + self.t**2 in forward to make sure temperature >= 1.
                    self.t = torch.nn.Parameter(torch.zeros(1, num_head, 1, 1))

        self.input_dim = input_dim
        self.num_head = num_head
        self.split_input = split_input
        self.share = share
        self.temperature = temperature
        self.fixed = fixed

        if share:
            # weight: [input_dim, 1] or [input_dim, hidden_size] -> [hidden_size, 1]
            final_dim = 1
        else:
            # weight: [input_dim, input_dim] or [input_dim, hidden_size] -> [hidden_size, input_dim]
            final_dim = input_dim

        first_groups = 1
        last_groups = 1

        if affine_layers == 1:
            last_affine_input_dim = input_dim
            # (x, 1) for global case and (x, h) for split case.
            if num_head > 1 and split_input:
                last_groups = num_head
            self.relu_affine = False
        elif affine_layers == 2:
            last_affine_input_dim = hidden_size * num_head
            if num_head > 1:
                # (1, h) for global case and (h, h) for split case.
                last_groups = num_head
                if split_input:
                    first_groups = num_head
            # Add a relu-affine with affine_layers=2.
            self.relu_affine = True
            self.first_affine = TdnnAffine(input_dim, last_affine_input_dim, context=context, bias=bias, groups=first_groups)
            self.relu = torch.nn.ReLU(inplace=True)
        else:
            # Was `"...",format(...)`: the comma passed the unformatted template
            # and the value as two separate exception arguments.
            raise ValueError("Expected 1 or 2 affine layers, but got {}.".format(affine_layers))

        self.last_affine = TdnnAffine(last_affine_input_dim, final_dim * num_head, context=context, bias=bias, groups=last_groups)
        # Dim=2 means to apply softmax in different frames-index (batch is a 3-dim tensor in this case).
        self.softmax = torch.nn.Softmax(dim=2)

    def forward(self, inputs):
        """
        @inputs: a 3-dimensional tensor (a batch), including [samples-index, frames-dim-index, frames-index]
        @return: attention weights normalised by softmax along the frames axis (dim=2)
        """
        assert len(inputs.shape) == 3
        assert inputs.shape[1] == self.input_dim

        if self.temperature:
            batch_size = inputs.shape[0]
            chunk_size = inputs.shape[2]

        x = inputs
        if self.relu_affine:
            x = self.relu(self.first_affine(x))
        if self.num_head > 1 and self.temperature:
            if self.fixed:
                t = self.t
            else:
                t = 1 + self.t**2
            # Expose the head axis so the [1, num_head, 1, 1] temperature
            # broadcasts per head, then flatten back to three dims for softmax.
            x = self.last_affine(x).reshape(batch_size, self.num_head, -1, chunk_size) / t
            return self.softmax(x.reshape(batch_size, -1, chunk_size))
        else:
            return self.softmax(self.last_affine(x))


In [32]:

class AttentiveStatisticsPooling(torch.nn.Module):
    """ Attentive statistics pooling over the frames axis.

    Produces an attention-weighted mean of the input and, optionally, a
    standard deviation concatenated to it along the channel axis.

    Reference: Okabe, Koji, Takafumi Koshinaka, and Koichi Shinoda. 2018. "Attentive Statistics Pooling 
               for Deep Speaker Embedding." ArXiv Preprint ArXiv:1803.10963.
    """
    def __init__(self, input_dim, affine_layers=2, hidden_size=64, context=[0], stddev=True, stddev_attention=True, eps=1.0e-10):
        super(AttentiveStatisticsPooling, self).__init__()

        self.stddev = stddev
        self.input_dim = input_dim

        # Mean only -> input_dim channels; mean + std -> twice as many.
        self.output_dim = 2 * input_dim if self.stddev else input_dim

        self.eps = eps
        self.stddev_attention = stddev_attention

        # Single-head attention with one shared weight per frame.
        self.attention = AttentionAlphaComponent(
            input_dim,
            num_head=1,
            share=True,
            affine_layers=affine_layers,
            hidden_size=hidden_size,
            context=context,
        )

    def forward(self, inputs):
        """
        @inputs: a 3-dimensional tensor (a batch), including [samples-index, frames-dim-index, frames-index]
        @return: the weighted mean (frames axis kept with size 1), followed along dim 1
                 by the standard deviation when stddev is enabled
        """
        assert len(inputs.shape) == 3
        assert inputs.shape[1] == self.input_dim

        weights = self.attention(inputs)

        # Attention-weighted average over the frames axis.
        mean = (weights * inputs).sum(dim=2, keepdim=True)

        if not self.stddev:
            return mean

        if self.stddev_attention:
            # Weighted second moment minus the squared weighted mean.
            var = (weights * inputs**2).sum(dim=2, keepdim=True) - mean**2
        else:
            # Unweighted spread around the weighted mean.
            var = ((inputs - mean)**2).mean(dim=2, keepdim=True)

        # Clamp before the square root so it never sees a value below eps.
        std = var.clamp(min=self.eps).sqrt()
        return torch.cat((mean, std), dim=1)

    def get_output_dim(self):
        """Return the number of output channels (input_dim, or 2 * input_dim with stddev)."""
        return self.output_dim


In [35]:
x_1.shape

torch.Size([2, 2, 3])

In [36]:
x_1

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [38]:
# Same reduction as x_1.sum(dim=2), but keepdim=True leaves the reduced axis in place with size 1.
x_1.sum(dim = 2, keepdim = True)

tensor([[[ 6],
         [15]],

        [[24],
         [33]]])

In [40]:
# Shape check: [2, 2, 3] becomes [2, 2, 1], so the result is still 3-dimensional.
x_1.sum(dim = 2, keepdim = True).shape

torch.Size([2, 2, 1])

* keepdim=True 时，被求和的维度不会被去掉，而是保留为大小 1，因此张量的维数不变（例如上面的 [2, 2, 3] → [2, 2, 1]）

In [45]:
# Single-head attention with a shared weight (share=True -> one weight per frame) for 39-channel inputs.
attention = AttentionAlphaComponent(39, num_head=1, share=True)

In [46]:
# Random batch laid out as [samples, channels, frames] = [42, 39, 125].
input_ = torch.rand(42,39,125)

In [51]:
# Attention weights, normalised by softmax along the frames axis (dim=2).
alpha = attention(input_)

In [52]:
# Attention-weighted mean over frames -- the same expression AttentiveStatisticsPooling.forward uses.
mean = torch.sum(alpha * input_, dim=2, keepdim=True)

In [53]:
# keepdim=True keeps the frames axis with size 1.
mean.shape

torch.Size([42, 39, 1])

In [90]:
import pandas as pd

# Load the score sheet.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines="skip"` -- switch once the pandas version in use is
# confirmed to support it.
df = pd.read_csv(r"C:\Users\admin\Desktop\打分表.csv",error_bad_lines=False, encoding = "utf8")

# Number the rows from 1. The length comes from the data instead of a
# hard-coded 14, so the cell no longer fails when the row count changes.
df.index = list(range(1, len(df) + 1))

# Drop the '组号' column so that only the remaining columns are averaged.
d = df.drop(['组号'],axis = 1)

# Per-row mean, wrapped in a one-column DataFrame whose column is named 0.
x = pd.DataFrame(pd.Series(d.T.mean()))
# Display with the column renamed; rename returns a copy, `x` itself is unchanged.
x.rename(columns = {0:"平均分"})

In [93]:
x.rename(columns = {0:"平均分"})

Unnamed: 0,平均分
1,3.5
2,1.0
3,1.0
4,1.0
5,3.5
6,1.0
7,1.0
8,1.0
9,1.0
10,1.0
