In [1]:
from argparse import Namespace
import sys, os
import json
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from apex import amp

from transformers import AutoTokenizer, AutoModelForSequenceClassification


# from Classes import Node, Adjacency_sp, Question_Paragraph

# Question_Paragraph

In [25]:
class Question_Paragraph(object):
    '''Q-P pair and label. for BERT and node init.
    返回q-p对和q-s对, 还要确保能够初始化Node类.
    每个问句有10个paragraph,即10个此类.'''
    def __init__(self, ques_id, para_id, question_tokens, para_title_tokens, para_label, sents_in_para, sentences_label):
        self.question_tokens = question_tokens
        self.para_title_tokens = para_title_tokens
        self.sents_in_para = sents_in_para
        self.sentences_offsets = self.cal_offsets(self.sents_in_para)
        self.sentences_label = sentences_label

        self.ques_id = ques_id
        self.para_id = para_id 
        self.para_label = para_label # 段落label

        self.question_features = None # [N, dim]
        self.para_features = None
#         self.paragraph_features = None

    @classmethod
    def build(cls, ques_id, para_id, question, para_title_tokens, para_label, node_list):
        Snodes = [n for n in node_list if n.paragraph_id == para_id and n.node_type == 'Sentence']
        question_tokens = tokensizer_in_Model(question)
        # para_title_tokens = tokensizer_in_Model(para_title)
        sents_in_para = [n.content_tokens for n in Snodes]
        sentences_label = [int(n.is_support) for n in Snodes]
        return cls(ques_id, para_id, question_tokens, para_title_tokens, para_label, sents_in_para, sentences_label)

    @staticmethod
    def cal_offsets(sents_list):
        cursor = 0
        offsets = []
        for sent_tokens in sents_list:
            len_sent = len(sent_tokens)
            offsets.append((cursor, cursor+len_sent))
            cursor += len_sent
        return offsets
    
    # content tokens
    def get_para_tokens(self, contain_title = False):
        para_token = self.para_title_tokens if contain_title else []
        for i in self.sents_in_para: para_token.extend(i)
        return para_token

    def get_ques_para_label_tuple(self, contain_title = False):
        '''问句-段落对'''
        return (self.question_tokens, self.get_para_tokens(contain_title), self.para_label)

    def get_ques_sent_label_list(self, contain_title = False):
        '''问句-句子对'''
        if contain_title:
            return [(self.question_tokens, self.para_title_tokens.extend(sent_tokens), sent_label)\
                for sent_tokens,sent_label in zip(self.sents_in_para, self.sentences_label)]
        else:
            return [(self.question_tokens, sent_tokens, sent_label)\
                for sent_tokens,sent_label in zip(self.sents_in_para, self.sentences_label)]

    def format_sents_in_para(self):
        return ' '.join([f'{index}:{word}' for index,word in enumerate(self.sents_in_para)])

    # features
    def build_features(self):
        '''build features from LM models'''
    
        self.question_features = get_feature_from_model(self.question_tokens)

        para_features = 0
        for one_line in self.get_ques_sent_label_list():

            sent_features = get_feature_from_model(one_line[0], one_line[1], 'second')

            if type(para_features) == int: para_features = sent_features.clone()
            else: para_features = torch.cat((para_features, sent_features), dim=0)
       
        self.para_features = para_features
        

    def get_question_features(self):
        return self.question_features

    def get_paragraph_features(self):
        return self.para_features

    # other
    def __str__(self):
        return f'Q_P. p_id: {self.para_id}'
    
    def __repr__(self):
        return f'Q_P. p_id: {self.para_id}'

# class Node


In [26]:
class Node(object):
    '''Node class for graph'''
    
    def __init__(self, node_id, node_type, content_raw, content_tokens, \
                 content_NER_list, parent_id, content_features=None, is_support=False):
        self.node_id = node_id
        self.node_type = node_type
        self.content_raw = content_raw
        self.content_tokens = content_tokens
        self.content_NER_list = content_NER_list
        self.parent_id = parent_id
        
        # Q_node doesn't have.
        self.paragraph_id = -1
        self.start_in_paragraph = -1
        self.end_in_paragraph = -1

        # only for E_node
        # E节点能够通过parent_id找到S节点.
        self.start_in_sentence = -1
        self.end_in_sentence = -1

        self.is_support = is_support # 段落 句子 

        self.content_features = content_features
        self.cls_feature = None # final features. [1,dim]
        
    @classmethod
    def build(cls, node_id, node_type, content_raw, parent_id, content_tokens=None):
        '''content_tokens能加快计算速度.'''
        # content_tokens = tokensize_and_repr_in_BERT(content_raw, flatten=True)
        if node_type != 'Entity':
            content_tokens_NOCLS, content_NER_list = find_NER_in_Model(content_raw, content_tokens)
        else:
            content_tokens_NOCLS = content_tokens
            content_NER_list = None
        # print(f'id:{node_id}\n{content_raw}\n{content_NER_list}\n')
        return cls(node_id, node_type, content_raw, content_tokens_NOCLS, content_NER_list, parent_id)
    
    def set_support(self):
        self.is_support = True
    
    def set_span_in_paragraph(self, para_id, start):
        self.paragraph_id = para_id
        self.start_in_paragraph = start
        self.end_in_paragraph = start + len(self.content_tokens)

    # only for E_node.
    def set_span_in_sentence(self, start):
        self.start_in_sentence = start
        self.end_in_sentence = start + len(self.content_tokens)
        
    def __str__(self):
        return f'Node: {self.node_type} {self.node_id}'
    
    def __repr__(self):
        return f'Node: {self.node_type} {self.node_id}'

    def get_NER_tuples_list(self):
        '''返回NER元组. e.g. [('ALLPE',id), ('DELL',id)]'''
        return [(i['content'], self.node_id) for i in self.content_NER_list]


# class Adjacency_sp

In [27]:
import scipy.sparse as sp
import numpy as np

class Adjacency_sp(object):
    '''无重复稀疏邻接矩阵'''
    def __init__(self):
        self.v_i_j = []
        self.i_j_find_table = []

    def append(self, v, i, j):
        if not (i,j) in self.i_j_find_table:
            self.v_i_j.append([v,i,j])
            self.i_j_find_table.append((i,j))
    
    def to_dense(self):
        '''return numpy ndarray.'''
        _len = max([i[0] for i in self.i_j_find_table] + [i[1] for i in self.i_j_find_table]) + 1
        shape = (_len,_len)
        np_adj = np.array(self.v_i_j)
        full_adj = sp.coo_matrix((np_adj[:, 0], (np_adj[:, 1], np_adj[:, 2])), shape=shape, dtype=np.float32).todense()
        full_adj = np.array(full_adj)
        return full_adj

    def to_dense_symmetric(self):
        _len = max([i[0] for i in self.i_j_find_table] + [i[1] for i in self.i_j_find_table]) + 1
        shape = (_len,_len)
        np_adj = np.array(self.v_i_j)
        adj = sp.coo_matrix((np_adj[:, 0], (np_adj[:, 1], np_adj[:, 2])), shape=shape, dtype=np.float32)
        adj_symm = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj).todense()
        adj_symm = np.array(adj_symm)
        return adj_symm

    def __repr__(self):
        return f'Adjacency_sp has {len(self.v_i_j)} edges'
    def __str__(self):
        return self.__repr__()
    def __len__(self):
        return len(self.v_i_j)

In [5]:
try:
    from google.colab import drive
    # drive.mount('/content/folders/')
    json_train_path = '/content/folders/My Drive/HotpotQA/hotpot_train_v1.1.json'
    save_cache_path = '/content/folders/My Drive/save_cache'
    HotpotQA_path = '/content/folders/My Drive/HotpotQA'
except:
    json_train_path = 'data/hotpot_train_v1.1.json'
    save_cache_path = 'save_cache/'
    HotpotQA_path = './'

sys.path.insert(0,HotpotQA_path)

args = Namespace(
    # Data and path information
    json_train_path=json_train_path,
    model_state_file = "model_HotpotQA.pth",
    save_dir = save_cache_path,
    HotpotQA_preprocess_file = 'hotpotQA_train_preprocess100.pkl',

    # Model hyper parameter
    use_proxy = False,
    proxies={"http_proxy": "127.0.0.1:10802",
             "https_proxy": "127.0.0.1:10802"},
    tokenizer_type = "bert-large-cased-whole-word-masking",
    model_type = 'bert-base-uncased',

    # Dataset parameter
    max_seq = 512,
    pad_to_max = True,

    # Training hyper parameter
    num_epochs=2,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    freeze_layer_name = 'bert.encoder.layer.10',

    # Runtime hyper parameter
    cuda=True,
    device=None,
    tpu=False,
    catch_keyboard_interrupt=True,
    reload_from_files=True,
    expand_filepaths_to_save_dir=True,
)

proxies = args.proxies if args.use_proxy else None

args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

if args.expand_filepaths_to_save_dir:
    args.model_state_file = os.path.join(args.save_dir,args.model_state_file)
    args.HotpotQA_preprocess_file = os.path.join(args.save_dir,args.HotpotQA_preprocess_file)

# 方法1: 使用BERT+offset

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_type,proxies=proxies)

config = AutoConfig.from_pretrained(args.model_type)
config.output_hidden_states = True

model = AutoModelForSequenceClassification.from_config(config)
model.load_state_dict(torch.load(args.model_state_file))
_ = model.eval()

if args.cuda: model.to(args.device)

In [7]:
model.config.output_hidden_states

True

# 函数封装

## get_feature_from_model

In [8]:
# 辅助函数
def get_feature_from_model(text, text_pair=None, return_type = 'first', \
                           max_length=512, tokenizer=tokenizer, LM_model=model):
    '''输入[seq_len], 返回[seq_len, dim]'''
    assert return_type in ['first','second']
    assert LM_model.config.output_hidden_states == True
    
    if text_pair == None: return_type = 'first'
    
    model_input = tokenizer.encode_plus(
            text = text,
            text_pair = text_pair,
            add_special_tokens = True,
            max_length = max_length,
            truncation_strategy = 'only_second',
            pad_to_max_length = False,
            return_tensors = 'pt'
        )
    
    if args.cuda:
        model_input = {k:v.to(args.device) for k,v in model_input.items()}

    with torch.no_grad():
        res_tuple=LM_model(**model_input)
        first_part_len = model_input['token_type_ids'].flatten().tolist().count(0)
    
    # last hidden layer.
    seq_hidden = res_tuple[1][-1].squeeze().to('cpu')
    
    if text_pair == None or return_type == 'first': 
        return seq_hidden[1:-1]
    else:
        # first_part_len-1 is [SEP]
        return seq_hidden[first_part_len:-1]

In [9]:
get_feature_from_model('hello i am dog').shape

torch.Size([4, 768])

In [10]:
get_feature_from_model('hello i dog','what are you doing ?', 'second').shape

torch.Size([5, 768])

In [11]:
res = tokenizer.encode_plus('hello i dog','what are you doing ?',return_tensors='pt')
res

{'input_ids': tensor([[  101, 19082,   178,  3676,   102,  1184,  1132,  1128,  1833,   136,
            102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
res = tokenizer.encode_plus('hello, i am jim', return_tensors='pt')
res

{'input_ids': tensor([[  101, 19082,   117,   178,  1821, 23220,  1306,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
res['token_type_ids'].flatten().tolist().count(0)

8

## build_features

In [14]:
# 给定LM_model, tokenizer, 对Q-P list先计算feature. 
# 然后初始化 Q P-->S-->
lines = [
    [['hello','i','am','Jim'],['hello','i','am','Jim','dog']],
    [['hello','i','am','Jim'],['i','am','Jim','dog']],
    [['hello','i','am','Jim'],['hello','i','am','Jim','dog','very','old']],
]

# 建立features

In [22]:
with open(args.HotpotQA_preprocess_file,'rb')as fp:
    hotpotQA_train_preprocess = pickle.load(fp)

In [23]:
hotpotQA_train_preprocess[0].keys()

dict_keys(['id', 'node_list', 'sp_adj', 'ques_para_list'])

In [24]:
ques_para_list = hotpotQA_train_preprocess[0]['ques_para_list']

In [25]:
ques_para_list[0].build_features()

In [26]:
ques_para_list[0].get_paragraph_features().shape

torch.Size([167, 768])

In [18]:
for ques_item in tqdm_notebook(hotpotQA_train_preprocess):
    ques_para_list = ques_item['ques_para_list']
    for q_p_item in ques_para_list:
        q_p_item.build_features()

HBox(children=(IntProgress(value=0), HTML(value='')))




In [19]:
hotpotQA_train_preprocess[7]['ques_para_list'][6].get_paragraph_features().shape

torch.Size([92, 768])

# 获得features

In [20]:
node_list = hotpotQA_train_preprocess[0]['node_list']

In [21]:
for ques_item in tqdm_notebook(hotpotQA_train_preprocess):
    node_list = ques_item['node_list']
    ques_para_list = ques_item['ques_para_list']

    # Q node
    Q_node = node_list[0]
    Q_node.content_features = ques_para_list[0].get_question_features()

    # P node
    for P_node in [i for i in node_list if i.node_type == 'Paragraph']:
        P_node.content_features = [q for q in ques_para_list \
                                   if P_node.paragraph_id == q.para_id][0].get_paragraph_features()

        P_node.content_tokens = [q for q in ques_para_list \
                                   if P_node.paragraph_id == q.para_id][0].get_para_tokens()
        

    # S node
    for S_node in [i for i in node_list if i.node_type == 'Sentence']:
        start = S_node.start_in_paragraph
        end = S_node.end_in_paragraph
        S_node.content_features = node_list[S_node.parent_id].content_features[start:end]
        
        # 验证OK
#         print(S_node.content_tokens)
#         print(start, end)
#         print(node_list[S_node.parent_id].content_tokens[start:end])
#         print('')

    # E node
    for E_node in [i for i in node_list if i.node_type == 'Entity']:
        start = E_node.start_in_sentence
        end = E_node.end_in_sentence
        E_node.content_features = node_list[E_node.parent_id].content_features[start:end]        
        
        # 验证OK
#         print(E_node.content_tokens)
#         print(start, end)
#         print(node_list[E_node.parent_id].content_tokens[start:end])
#         print('')


HBox(children=(IntProgress(value=0), HTML(value='')))




In [22]:
hotpotQA_train_preprocess[0]['node_list'][0].content_features.shape

torch.Size([14, 768])

In [23]:
hotpotQA_train_preprocess[0]['node_list'][1].content_features.shape

torch.Size([167, 768])

In [24]:
hotpotQA_train_preprocess[6]['node_list'][7].content_features.shape

torch.Size([27, 768])

# 建立features矩阵

**目标**: 获取`torch.Size([10, 768])`.

## 使用GRU获取

使用GRU将变长序列缩减为[1,dim],需要考虑padding导致的效率问题. 最好使用`torch.nn.utils.rnn.pack_padded_sequence()`进行加速计算.

难点在于,使用上述包要求输入的可变长度矩阵排列为上三角矩阵, 这要求同时改变邻接矩阵.

**缺点**: 句子长度依然有600+词, 使用GRU依然存在梯度问题.


In [25]:
test_tokens = [
    [1,2,3],
    [1],
    [9,2,3,4,5],
    [1,2],
    [100,2,3],
]
test_adj=[
    [1,0,1,0,0],
    [0,1,1,1,0],
    [1,1,1,0,0],
    [0,1,0,1,1],
    [0,0,0,1,1],
]
np_test_len2 = np.array([len(i) for i in test_tokens])
np_adj = np.array(test_adj)

np_test_len2

array([3, 1, 5, 2, 3])

In [26]:
index_argsort = np_test_len2.argsort(axis=0).tolist()
index_argsort

[1, 3, 0, 4, 2]

In [28]:
# def sort_features_matrix(batch_)

test_adj_sort = np.zeros_like(test_adj)
for index_from,index_to in zip(range(len(np_test_len2)), index_argsort):
    test_adj_sort[index_to,:] = np_adj[index_from,:]
    test_adj_sort[:,index_to] = np_adj[:,index_from]
test_adj_sort

array([[1, 1, 0, 0, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 0, 1, 1],
       [0, 1, 1, 1, 1],
       [0, 1, 1, 1, 1]])

In [29]:
# 模拟变长序列.
seq_max = 512

x_test = [torch.randn((np.random.randint(10,seq_max),768)) for i in range(5)]
np_test_len = np.array([i.size(-2) for i in x_test])

np_test_len

array([126, 325, 438, 188, 338])

In [30]:
test_adj=[
    [1,0,1,0,0],
    [0,1,1,1,0],
    [1,1,1,0,0],
    [0,1,0,1,1],
    [0,0,0,1,1],
]

In [31]:
index_argsort = np_test_len.argsort(axis=0).tolist()
index_argsort

[0, 3, 1, 4, 2]

### 封装adj 排序

In [32]:
def sort_features_adj(bsz_seq_dim, adj):
    test_adj_sort = np.zeros_like(test_adj)
    for index_from,index_to in zip(range(len(x_test)), index_argsort):
        test_adj_sort[index_to,:] = np_adj[index_from,:]
        test_adj_sort[:,index_to] = np_adj[:,index_from]
    test_adj_sort

## 使用线性层

使用线性层也要将sentence进行padding.

In [33]:
# 模拟变长序列.
seq_max = 1024
x_test_padding = []
x_test_mask = []
batch_seq_dim = [torch.randn((1,np.random.randint(0,seq_max),768)) for i in range(10)]
for t in batch_seq_dim:
    # 左边不填, 右边填满0.
    tokens_len = t.shape[-2]
    x_test_mask.append([1]*tokens_len + [0]*(seq_max - tokens_len))
    pd = (0,0,0, seq_max - tokens_len)
    x_test_padding.append(F.pad(t, pd ,"constant", 0))

x_test = torch.cat(x_test_padding, dim=0)
x_test_mask = torch.tensor(x_test_mask)

In [34]:
x_test[0].transpose(-1,-2).shape

torch.Size([768, 1024])

In [35]:
x_test_mask[3]

tensor([1, 1, 1,  ..., 0, 0, 0])

In [36]:
# 封装
def build_padding(batch_seq_dim, seq_max = 19):
    x_padding = []
    x_mask = []
    for t in batch_seq_dim:
        tokens_len = t.shape[-2]
        x_mask.append([1]*tokens_len + [0]*(seq_max - tokens_len))
        pd = (0,0,0, seq_max - tokens_len)
        x_padding.append(F.pad(t, pd ,"constant", 0))

    x_padding = torch.cat(x_padding, dim=0)
    x_mask = torch.tensor(x_mask)
    
    return x_padding, x_mask

In [37]:
torch.matmul(x_test.transpose(-1,-2), torch.randn(seq_max,1).unsqueeze(0)).shape

torch.Size([10, 768, 1])

In [38]:
class Simple_MLP(nn.Module):
    def __init__(self, seq_max, input_dim):
        super(Simple_MLP, self).__init__()
        
        self.reducer = nn.Parameter(torch.randn([seq_max,1]))
        
    def forward(self, x): # [B,N,D]
        out = torch.matmul(x.transpose(-1,-2), self.reducer.unsqueeze(0)).squeeze() # [10, 768, 1]        
        return out
    
simple_mlp = Simple_MLP(seq_max,768)

In [39]:
simple_mlp(x_test).shape

torch.Size([10, 768])

## 直接使用max mean

In [40]:
# 模拟变长序列.
seq_max = 512
x_test_padding = []
for t in [torch.randn((1,np.random.randint(0,seq_max),768)) for i in range(10)]:
    # 左边不填, 右边填满0.
    pd = (0,0,0, seq_max - t.shape[-2])
    x_test_padding.append(F.pad(t, pd ,"constant", 0))
x_test = torch.cat(x_test_padding, dim=0)

In [41]:
torch.mean(x_test, dim=1).shape

torch.Size([10, 768])

In [42]:
values, indices = torch.max(x_test, dim=1)
values.shape

torch.Size([10, 768])

## 使用transformer

获取transformer中`[CLS]`的表达.

In [43]:
from transformers import AutoConfig, AutoModel

model_name = 'bert-base-uncased'
my_config = AutoConfig.from_pretrained(model_name)
model_bert = AutoModel.from_config(my_config)
_ = model_bert.to('cuda')

In [44]:
x_test = [
    [1,2,3,4,5,6],
    [6,7,8,9,0,0],
    [12,14,16,0,0,0]
]
x_test = torch.tensor(x_test, device='cuda')

In [45]:
last_hidden_state , pooler_output  = model_bert(x_test)

In [46]:
pooler_output.shape

torch.Size([3, 768])

# 方法2: 使用CLS

1. 使用`question-sentence features`的`[CLS]`作为**S节点**的初始化, 同时保存`hidden status`.
2. 使用`question-sentence CLS`的`[CLS]`作为**P节点**的初始化.
3. 使用**E节点**的offet获取S节点对应部分的`hidden status`, 使用MLP初始化.
4. 使用`question-P node CLS`的`[CLS]`作为**Q节点**的初始化

**注意** 此方法无需`class Question_Paragraph`

In [28]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

import pickle
import torch
from tqdm import tqdm_notebook

In [29]:
HotpotQA_preprocess_file = 'save_cache/hotpotQA_train_preprocess100_new.pkl'
with open(HotpotQA_preprocess_file,'rb')as fp:
    hotpotQA_train_preprocess = pickle.load(fp)

In [30]:
model_name = 'xlnet-base-cased'
proxies={"http_proxy": "127.0.0.1:10802",
         "https_proxy": "127.0.0.1:10802"}

config = AutoConfig.from_pretrained(model_name,proxies=proxies)


tokenizer_XLNET = AutoTokenizer.from_pretrained(model_name,proxies=proxies)
model_XLNET = AutoModel.from_config(config)
DEVICE = 'cuda:1'
# DEVICE = 'cpu'
_ = model_XLNET.to(DEVICE)
_ = model_XLNET.train()

## 冻结

In [7]:
def freeze_to_layer(model, layer_name):
    '''冻结层. 从0到layer_name.'''
    
    if layer_name == 'all':
        index_start = len(model.state_dict())
    else:
        index_start = -1
        for index, (key, _value) in enumerate(model.state_dict().items()):
            if key.startswith(layer_name): 
                index_start = index
                break

    if index_start < 0:
        print(f"Don't find layer name: {layer_name}")
        return
    
    no_grad_nums = index_start + 1
    grad_nums = 0

    for index, i in enumerate(model.parameters()):
        if index >= index_start:
            i.requires_grad = True
            grad_nums += 1
        else:
            i.requires_grad = False
    
    print(f"freeze layers num: {no_grad_nums}, active layers num: {grad_nums}.")
    # no need to return.

In [8]:
freeze_to_layer(model_XLNET, 'all')

freeze layers num: 207, active layers num: 0.


## 获得特征

In [9]:
def get_features_from_XLNET(text,text_pair=None,
                            tokenizer = None,
                            model = None,
                            add_special_tokens = True,
                           device = 'cuda'):
    '''XLNET在512张TPU v3上训练5.5天得到. 一张TPU 8核心 128GB内存.'''
    
    assert model
    model_input = tokenizer_XLNET.encode_plus(text,text_pair,
                                        add_special_tokens=add_special_tokens,
                                        return_tensors='pt')
    
    model_input = {k:v.to(device) for k,v in model_input.items()}
    
    # 不能在函数里面设置device.
    # model.to(device)
    with torch.no_grad():
        last_hidden_state = model(**model_input)[0]
    
    return last_hidden_state

In [10]:
node_list = hotpotQA_train_preprocess[0]['node_list']

In [31]:
sp_adj = hotpotQA_train_preprocess[0]['sp_adj']

In [33]:
sp_adj.to_dense_symmetric().

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 4., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 7., 0.],
       [0., 0., 0., ..., 7., 0., 7.],
       [0., 0., 0., ..., 0., 7., 0.]], dtype=float32)

In [34]:
for ques_item in tqdm_notebook(hotpotQA_train_preprocess, desc = 'building features'):
    node_list = ques_item['node_list']
#     ques_para_list = ques_item['ques_para_list']

    # Q node
    Q_node = node_list[0]
    Q_node.content_features = get_features_from_XLNET(Q_node.content_raw,
                                                     tokenizer = tokenizer_XLNET,
                                                      model = model_XLNET,
                                                     device = DEVICE) # [1,N,D]
    Q_node.cls_feature = Q_node.content_features[:,-1,:]

    # S node
    for S_node in [i for i in node_list if i.node_type == 'Sentence']:
        # content_features不能包含特殊字符.
        S_node.content_features = get_features_from_XLNET(S_node.content_raw,
                                                            add_special_tokens=False,
                                                            tokenizer = tokenizer_XLNET,
                                                            model = model_XLNET,
                                                            device = DEVICE)
        
        S_node.cls_feature = get_features_from_XLNET(Q_node.content_raw, 
                                                        S_node.content_raw,
                                                        add_special_tokens=True,
                                                        tokenizer = tokenizer_XLNET,
                                                        model = model_XLNET,
                                                        device = DEVICE)[:,-1,:]    
    
    # P node
    for P_i, P_node in [(i,n) for i,n in enumerate(node_list) if n.node_type == 'Paragraph']:
            S_in_P = [n for n in node_list if n.parent_id == P_i]
            all_S_raw = ' '.join([n.content_raw for n in S_in_P])
            P_node.content_features = [n.content_features for n in S_in_P]
            P_node.cls_feature = get_features_from_XLNET(Q_node.content_raw, 
                                                            all_S_raw,
                                                            add_special_tokens=True,
                                                            tokenizer = tokenizer_XLNET,
                                                            model = model_XLNET,
                                                            device = DEVICE)[:,-1,:]

    # E node
    for E_node in [i for i in node_list if i.node_type == 'Entity']:
        start = E_node.start_in_sentence
        end = E_node.end_in_sentence
        E_node.content_features = node_list[E_node.parent_id].content_features[:,start:end,:]
        E_node.cls_feature = torch.mean(E_node.content_features, dim = 1)
        

HBox(children=(IntProgress(value=0, description='building features', style=ProgressStyle(description_width='in…




## 检查

In [13]:
node_list = hotpotQA_train_preprocess[0]['node_list']

In [14]:
for i,n in enumerate(node_list):
    print(f"{i}\t{n.node_type}:\t\t{n.cls_feature.shape}")

0	Question:		torch.Size([1, 768])
1	Paragraph:		torch.Size([1, 768])
2	Sentence:		torch.Size([1, 768])
3	Entity:		torch.Size([1, 768])
4	Sentence:		torch.Size([1, 768])
5	Entity:		torch.Size([1, 768])
6	Sentence:		torch.Size([1, 768])
7	Entity:		torch.Size([1, 768])
8	Sentence:		torch.Size([1, 768])
9	Entity:		torch.Size([1, 768])
10	Entity:		torch.Size([1, 768])
11	Sentence:		torch.Size([1, 768])
12	Entity:		torch.Size([1, 768])
13	Entity:		torch.Size([1, 768])
14	Entity:		torch.Size([1, 768])
15	Sentence:		torch.Size([1, 768])
16	Entity:		torch.Size([1, 768])
17	Sentence:		torch.Size([1, 768])
18	Entity:		torch.Size([1, 768])
19	Paragraph:		torch.Size([1, 768])
20	Sentence:		torch.Size([1, 768])
21	Entity:		torch.Size([1, 768])
22	Entity:		torch.Size([1, 768])
23	Sentence:		torch.Size([1, 768])
24	Entity:		torch.Size([1, 768])
25	Sentence:		torch.Size([1, 768])
26	Entity:		torch.Size([1, 768])
27	Entity:		torch.Size([1, 768])
28	Entity:		torch.Size([1, 768])
29	Sentence:		torch.Size(

In [15]:
for i,n in enumerate(node_list):
    if n.node_type != 'Sentence': continue
    print(f"{i}\t{n.node_type}:\t\t{n.content_features.shape}")

2	Sentence:		torch.Size([1, 19, 768])
4	Sentence:		torch.Size([1, 53, 768])
6	Sentence:		torch.Size([1, 9, 768])
8	Sentence:		torch.Size([1, 24, 768])
11	Sentence:		torch.Size([1, 43, 768])
15	Sentence:		torch.Size([1, 13, 768])
17	Sentence:		torch.Size([1, 9, 768])
20	Sentence:		torch.Size([1, 17, 768])
23	Sentence:		torch.Size([1, 53, 768])
25	Sentence:		torch.Size([1, 31, 768])
29	Sentence:		torch.Size([1, 31, 768])
33	Sentence:		torch.Size([1, 21, 768])
36	Sentence:		torch.Size([1, 33, 768])
38	Sentence:		torch.Size([1, 13, 768])
41	Sentence:		torch.Size([1, 7, 768])
43	Sentence:		torch.Size([1, 5, 768])
45	Sentence:		torch.Size([1, 57, 768])
49	Sentence:		torch.Size([1, 6, 768])
51	Sentence:		torch.Size([1, 22, 768])
53	Sentence:		torch.Size([1, 23, 768])
56	Sentence:		torch.Size([1, 45, 768])
59	Sentence:		torch.Size([1, 13, 768])
60	Sentence:		torch.Size([1, 28, 768])
63	Sentence:		torch.Size([1, 22, 768])
66	Sentence:		torch.Size([1, 27, 768])
70	Sentence:		torch.Size([1, 26, 7

## 保存

直接保存太大.

```python
save_cache_path = 'save_cache/'

with open(save_cache_path+'hotpotQA_train_preprocess100_features.pkl', 'wb') as fp:
    pickle.dump(hotpotQA_train_preprocess, fp, protocol=-1)
!ls -hl $save_cache_path

# -rw-r--r-- 1 root root 2.4G Mar  9 21:06 hotpotQA_train_preprocess100_features.pkl
```

直接保存为`matrix`和`adj`

In [35]:
from copy import deepcopy
from traceback import print_exc

In [36]:
hotpotQA_train_preprocess2 = deepcopy(hotpotQA_train_preprocess)

In [18]:
hotpotQA_train_preprocess[10]['node_list'][0].content_features.shape

torch.Size([1, 24, 768])

In [45]:
for item in tqdm_notebook(hotpotQA_train_preprocess2):
    for node in item['node_list']:
        node.content_features = None

HBox(children=(IntProgress(value=0), HTML(value='')))




In [46]:
save_cache_path = 'save_cache/'

with open(save_cache_path+'hotpotQA_train_preprocess100_features.pkl', 'wb') as fp:
    pickle.dump(hotpotQA_train_preprocess2, fp, protocol=-1)
!ls -hl $save_cache_path

# -rw-r--r-- 1 root root 2.4G Mar  9 21:06 hotpotQA_train_preprocess100_features.pkl

total 1.7G
-rw-r--r-- 1 root root 1.3G Mar  9 21:39 hotpotQA_train_preprocess100_features.pkl
-rw-r--r-- 1 root root 3.5M Mar  9 10:16 hotpotQA_train_preprocess100.pkl
-rw-r--r-- 1 root root 418M Mar  8 11:34 model_HotpotQA.pth


### 只保存特征矩阵和邻接矩阵

**不能!** 因为之后要用到整个句子的tokens, 需要保存整个对象.

对100个ques_item, `node.content_features`部分:

- 保存为tensor, 占用**2.4G**.
- 保存为list, 占用**3.6**.
- 删掉, 占用**1.3G**

In [41]:
matrix_adj_label = []
for ques_item in tqdm_notebook(hotpotQA_train_preprocess2):
    adj = ques_item['sp_adj']
    features_matrix = torch.cat([n.cls_feature for n in ques_item['node_list']], dim = 0)
    labels = torch.stack([torch.tensor([n.is_support]) \
                          if n.node_type != 'Paragraph' else torch.tensor([False])\
                          for n in ques_item['node_list']], dim=0).bool()
    matrix_adj_label.append((features_matrix, adj, labels))

HBox(children=(IntProgress(value=0), HTML(value='')))




In [42]:
save_cache_path = 'save_cache/'

with open(save_cache_path+'hotpotQA_train_preprocess100_feat_adj.pkl', 'wb') as fp:
    pickle.dump(matrix_adj_label, fp, protocol=-1)
!ls -hl $save_cache_path

# -rw-r--r-- 1 root root 2.4G Mar  9 21:06 hotpotQA_train_preprocess100_features.pkl

total 458M
-rw-r--r-- 1 root root  37M Mar 10 10:33 hotpotQA_train_preprocess100_feat_adj.pkl
-rw-r--r-- 1 root root 3.5M Mar  9 10:16 hotpotQA_train_preprocess100.pkl
-rw-r--r-- 1 root root 418M Mar  8 11:34 model_HotpotQA.pth


## 并行(失败)

cpu模型下能跑通, 但非常慢. GPU模式失败.