In [5]:
def _tokenize_chinese_chars(text):
    """Adds whitespace around any CJK character."""
    output = []
    for char in text:
        cp = ord(char)
        if _is_chinese_char(cp):
            output.append(" ")
            output.append(char)
            output.append(" ")
        else:
            output.append(char)
    # return "".join(output)
    return output
def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)  #
        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False

In [6]:
a = "中国人民attention中过"
res = _tokenize_chinese_chars(a)

In [8]:
print(res)

[' ', '中', ' ', ' ', '国', ' ', ' ', '人', ' ', ' ', '民', ' ', 'a', 't', 't', 'e', 'n', 't', 'i', 'o', 'n', ' ', '中', ' ', ' ', '过', ' ']


In [11]:
a = "  中  过  att"
a.split()

['中', '过', 'att']

In [14]:
import unicodedata

def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _run_split_on_punc(text, never_split=None):
    """Splits punctuation on a piece of text."""
    if never_split is not None and text in never_split:
        return [text]
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
        char = chars[i]
        if _is_punctuation(char):
            output.append([char])
            start_new_word = True
        else:
            if start_new_word:
                output.append([])
            start_new_word = False
            output[-1].append(char)
        i += 1

    # return ["".join(x) for x in output]
    return output

In [15]:
_run_split_on_punc("daf,dfa*daf")

[['d', 'a', 'f'], [','], ['d', 'f', 'a'], ['*'], ['d', 'a', 'f']]

In [18]:
def sw_run_split_on_punc(text):
    """Split `text` on punctuation, returning a list of character lists.

    Each punctuation character becomes its own one-element list; each run of
    other characters becomes one list. Empty groups created between adjacent
    punctuation marks (or at either end) are dropped from the result.
    """
    groups = [[]]
    for ch in text:
        if _is_punctuation(ch):
            # Close the current run: emit the mark, then open a fresh run.
            groups.extend(([ch], []))
        else:
            groups[-1].append(ch)
    return [group for group in groups if group]

In [21]:
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace.

    Returns the list of non-empty tokens; an empty or all-whitespace input
    yields an empty list.
    """
    # str.split() with no separator already returns [] for an empty string.
    return text.strip().split()


In [36]:
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization with a greedy longest-match-first search."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # vocab: any container supporting `in` over the known word pieces.
        self.vocab = vocab
        # unk_token: emitted in place of a word that cannot be fully split.
        self.unk_token = unk_token
        # Words longer than this are mapped straight to unk_token.
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Split whitespace-separated words of `text` into word pieces.

        For each word, the outer loop moves a start position forward from 0.
        For each start, the inner search shrinks the end position from the end
        of the word towards the start until the enclosed substring is in the
        vocabulary; that substring is cut off and the start jumps to its end.
        The first piece of a word is looked up as-is; every later piece is
        looked up (and emitted) with a "##" prefix.

        If at any start position no substring is in the vocabulary, the whole
        word is replaced by `unk_token`.
        """
        output_tokens = []
        for word in whitespace_tokenize(text):
            word_len = len(word)
            if word_len > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = []
            begin = 0
            while begin < word_len:
                # Try the longest candidate first, shrinking from the right.
                for stop in range(word_len, begin, -1):
                    candidate = word[begin:stop]
                    if begin > 0:
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        pieces.append(candidate)
                        # Continue right after the piece just cut off.
                        begin = stop
                        break
                else:
                    # Nothing starting at `begin` is known: the word is unknown.
                    pieces = None
                    break

            if pieces is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(pieces)
        return output_tokens


In [39]:
wpt = WordpieceTokenizer(["at", "##ten", "##tion", "plea", "##s", "##e"], "[unk]")

In [43]:
wpt.tokenize("attention please")

['at', '##ten', '##tion', 'plea', '##s', '##e']

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
a = torch.randn(1,3)
a

tensor([[-0.6312,  1.2852, -1.8901]])

In [8]:
a.expand(3,3)

tensor([[-0.6312,  1.2852, -1.8901],
        [-0.6312,  1.2852, -1.8901],
        [-0.6312,  1.2852, -1.8901]])

In [7]:
a.expand(2,3,3)

tensor([[[-0.6312,  1.2852, -1.8901],
         [-0.6312,  1.2852, -1.8901],
         [-0.6312,  1.2852, -1.8901]],

        [[-0.6312,  1.2852, -1.8901],
         [-0.6312,  1.2852, -1.8901],
         [-0.6312,  1.2852, -1.8901]]])

In [12]:
import torch

In [13]:
a = torch.ones(3,4)
a

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [14]:
a[0] = 0
a

tensor([[0., 0., 0., 0.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [17]:
a.view(-1).contiguous().eq(1)

tensor([False, False, False, False,  True,  True,  True,  True,  True,  True,
         True,  True])

In [18]:
mask = a.view(-1).contiguous().eq(1)
mask

tensor([False, False, False, False,  True,  True,  True,  True,  True,  True,
         True,  True])

In [19]:
torch.arange(len(mask))[mask].long()

tensor([ 4,  5,  6,  7,  8,  9, 10, 11])

In [20]:
layer = torch.nn.Linear(4,3)

In [None]:
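# input [5, 4] through a Linear whose weight is (3, 4) -> output [5, 3]
# input [5, 4] through a Linear whose weight is (2, 4) -> output [5, 2]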

In [25]:
layer.weight

Parameter containing:
tensor([[ 0.1996,  0.4603, -0.2883,  0.1713],
        [-0.1496,  0.1463, -0.2608,  0.0918],
        [-0.2600,  0.1853, -0.0806,  0.2524]], requires_grad=True)

In [23]:
layer.weight.index_select(0, torch.tensor([0,1]))

tensor([[ 0.1996,  0.4603, -0.2883,  0.1713],
        [-0.1496,  0.1463, -0.2608,  0.0918]], grad_fn=<IndexSelectBackward0>)

In [27]:
layer = torch.nn.Linear(5,5)
layer.weight

Parameter containing:
tensor([[-0.2172,  0.0892,  0.1601,  0.1991, -0.1320],
        [-0.0744,  0.3415, -0.2958, -0.1294,  0.1244],
        [-0.3874, -0.2104, -0.1770, -0.2696, -0.2908],
        [-0.0500, -0.1520,  0.4151, -0.2474,  0.0696],
        [-0.2954,  0.2502, -0.1591, -0.2732,  0.0095]], requires_grad=True)

In [30]:
layer.bias

Parameter containing:
tensor([-0.4277, -0.4221,  0.3778, -0.1997,  0.4031], requires_grad=True)

In [48]:
import torch
from torch import nn

def prune_linear_layer(layer, index, dim=0):
    """Build a smaller ``nn.Linear`` that keeps only the entries of `layer` listed in `index`.

    Used to remove attention heads.

    Args:
        layer: the ``nn.Linear`` to prune; it is left unmodified.
        index: 1-D integer tensor with the positions to keep along `dim` of
            ``layer.weight``.
        dim: axis of ``layer.weight`` to select along. The weight is stored as
            (out_features, in_features), so ``dim=0`` keeps output rows together
            with the matching bias entries, while ``dim=1`` keeps input
            columns and the whole bias.

    Returns:
        A new ``nn.Linear`` on the same device as `layer`, holding copies of
        the selected weight (and bias, if any), with ``requires_grad=True``.
    """
    device = layer.weight.device
    index = index.to(device)
    has_bias = layer.bias is not None

    # Detached copies, so the new layer shares neither storage nor graph with `layer`.
    kept_weight = layer.weight.index_select(dim, index).clone().detach()
    if has_bias:
        source_bias = layer.bias if dim == 1 else layer.bias[index]
        kept_bias = source_bias.clone().detach()

    # Same shape as the original weight, except along the pruned axis.
    pruned_shape = list(layer.weight.size())
    pruned_shape[dim] = len(index)
    new_layer = nn.Linear(pruned_shape[1], pruned_shape[0], bias=has_bias).to(device)

    # Gradient tracking is switched off around the in-place copies and then
    # switched back on for the returned parameters.
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(kept_weight.contiguous())
    new_layer.weight.requires_grad = True
    if has_bias:
        new_layer.bias.requires_grad = False
        new_layer.bias.copy_(kept_bias.contiguous())
        new_layer.bias.requires_grad = True
    return new_layer

In [46]:
layer = nn.Linear(4,5)
index = torch.tensor([1,2])

In [58]:
# Re-index a head that is about to be pruned, given the heads already removed.
pruned_heads = {2, 3, 4}
heads = [4]
for head in heads:
    # One flag per already-pruned head: 1 if its index is below `head`, else 0.
    a = [int(h < head) for h in pruned_heads]
    print(a)
    # Shift the index down by the number of lower-indexed heads already pruned.
    head -= sum(a)
    print(head)
pruned_heads = pruned_heads | set(heads)
# print(pruned_heads)

[1, 1, 0]
2
{2, 3, 4}


In [1]:
import torch
from torch import nn
import math

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    `config` must provide ``hidden_size``, ``num_attention_heads`` and
    ``attention_probs_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        num_heads = config.num_attention_heads
        if hidden_size % num_heads != 0:
            raise ValueError(
                "the hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, num_heads)
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(hidden_size / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Separate projections for queries, keys and values.
        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Reshape [B, S, H] into per-head form [B, num_heads, S, head_size]."""
        per_head = x.view(*x.size()[:-1], self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        option,
        attention_mask=None,
        head_mask=None,
    ):
        """Apply self-attention to `hidden_states` of shape [B, S, H].

        Args:
            hidden_states: input tensor, [B, S, H].
            option: selects how the heads are merged back. ``1`` uses
                ``contiguous()`` followed by ``view``; any other value uses
                ``reshape``.
            attention_mask: optional tensor added to the raw scores.
            head_mask: optional tensor multiplied into the attention weights.

        Returns:
            Context tensor of shape [B, S, all_head_size].
        """
        # Project, then split into heads: [B, num_heads, S, head_size].
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # query * key gives the score matrix, scaled by sqrt(head_size).
        scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            scores = scores + attention_mask

        # Attention weights.
        probs = self.dropout(torch.softmax(scores, dim=-1))

        # Mask heads.
        if head_mask is not None:
            probs = probs * head_mask

        # [B, num_heads, S, S] x [B, num_heads, S, head_size] -> [B, num_heads, S, head_size]
        context = torch.matmul(probs, value_layer)

        # Merge the heads: [B, S, num_heads, head_size] -> [B, S, all_head_size].
        merged = context.permute(0, 2, 1, 3)
        merged_shape = merged.size()[:-2] + (self.all_head_size,)
        if option == 1:
            return merged.contiguous().view(*merged_shape)
        return merged.reshape(*merged_shape)

In [3]:
class Config:
    """Minimal configuration object with the fields BertSelfAttention reads.

    Args:
        hidden_size: width of the hidden states.
        num_attention_heads: number of attention heads.
        attention_probs_dropout_prob: dropout probability applied to the
            attention weights.

    The defaults reproduce the values that used to be hard-coded, so
    ``Config()`` behaves exactly as before.
    """

    def __init__(self, hidden_size=100, num_attention_heads=5, attention_probs_dropout_prob=0):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
config = Config()

In [6]:
model = BertSelfAttention(config)

In [9]:
input= torch.randn(3,10, 100)
res1 = model(input, option=1)
res2 = model(input, option=2)


In [10]:
res1.equal(res2)

True

In [11]:
a = torch.randn(3,4)
a

tensor([[-0.2324, -0.7838, -0.1866, -1.8796],
        [-0.1204, -0.4605,  0.3686, -0.6623],
        [-0.4158,  0.3718, -1.0046, -2.1423]])

In [12]:
a[:, None, None, :]

tensor([[[[-0.2324, -0.7838, -0.1866, -1.8796]]],


        [[[-0.1204, -0.4605,  0.3686, -0.6623]]],


        [[[-0.4158,  0.3718, -1.0046, -2.1423]]]])

In [16]:
a = torch.randn(3)
a

tensor([ 0.9577, -0.0650,  0.9996])

In [18]:
a.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1).shape

torch.Size([1, 1, 3, 1, 1])