In [2]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
import joblib
import numpy as np
from tqdm import tqdm
import itertools
import scipy.sparse as sp
import stanza

In [3]:
# English stanza pipeline used for tokenization, POS tagging and dependency parsing.
# Per the stanza documentation, tokenize_no_ssplit=True turns off sentence segmentation
# (text is only split on blank lines), which is why later cells read just `doc.sentences[0]`
# for each document.
nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma, depparse', tokenize_no_ssplit=True)

2021-06-14 15:29:16 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-06-14 15:29:16 INFO: Use device: cpu
2021-06-14 15:29:16 INFO: Loading: tokenize
2021-06-14 15:29:16 INFO: Loading: pos
2021-06-14 15:29:16 INFO: Loading: lemma
2021-06-14 15:29:17 INFO: Loading: depparse
2021-06-14 15:29:17 INFO: Done loading processors!


In [5]:
# Dataset
dataset = "ohsumed"          # corpus name; used as the file-name prefix for every input and output
save_path = 'nlp_temp_v2'    # directory that receives the intermediate and final artifacts
corpus_path = 'corpus_v1'    # directory holding <dataset>.texts.txt and <dataset>.labels.txt

In [6]:
# Parameters
window_size = 3      # tokens within this distance of each other are connected in the graph
embedding_dim = 300  # size of the word vectors and of the POS vectors
max_text_len = 300   # documents are truncated to this many tokens when the graphs are built

In [7]:
s = ['.']

In [8]:
# Stop-word and frequency-filter parameters.
# The extra '.' stop word is written inline rather than read from the notebook-level
# variable `s`: a later cell rebinds `s` to a string, and re-running this cell after that
# would fail with a TypeError (list + str).
stop_words = set(stopwords.words('english') + ['.'])
least_freq = 5
# Short sentiment corpora keep every token: no stop-word removal, no frequency cut-off.
if dataset == "mr" or "SST" in dataset:
    stop_words = set()
    least_freq = 0

In [9]:
# func load texts & labels
def load_dataset(dataset):
    """Read the raw documents and their labels for ``dataset``.

    Both files live under the module-level ``corpus_path`` and hold one entry per line.
    The text file is decoded as latin1; the label file is opened with the platform
    default encoding.

    Args:
        dataset: corpus name used as the file-name prefix.

    Returns:
        A ``(texts, labels)`` tuple of two lists of strings.
    """
    with open(f"{corpus_path}/{dataset}.texts.txt", "r", encoding="latin1") as texts_handle:
        raw_texts = texts_handle.read()
    with open(f"{corpus_path}/{dataset}.labels.txt", "r") as labels_handle:
        raw_labels = labels_handle.read()

    # Surrounding whitespace is stripped before splitting so a trailing newline
    # does not produce an empty final entry.
    return raw_texts.strip().split("\n"), raw_labels.strip().split("\n")

In [10]:
def pos_text(text: str):
    """Split ``text`` on whitespace and POS-tag the tokens with ``nltk.pos_tag``.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the token list.
    """
    tokens = text.split()
    tagged = nltk.pos_tag(tokens)
    return tagged

In [11]:
def clean_str_simple_version(string):
    """Return ``string`` unchanged.

    Placeholder cleaning hook: the substitutions that would strip backslashes and
    quote characters are disabled (kept below for reference), so this is currently
    an identity function.
    """
    # string = re.sub(r"\\", "", string)
    # string = re.sub(r"\'", "", string)
    # string = re.sub(r"\"", "", string)
    unchanged = string
    return unchanged

In [12]:
s = "dome petroleum's \\reaffirms dome mines stake for sale at right price spokesman 'says'"

In [13]:
clean_str_simple_version(s)

"dome petroleum's \\reaffirms dome mines stake for sale at right price spokesman 'says'"

In [14]:
# def filter_text(string):
#     """
#     Tokenization/string cleaning for all datasets except for SST.
#     Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
#     """
#     string = re.sub(r"[^A-Za-z0-9().,!?\'\`]", " ", string)
#     string = re.sub(r"\'s", " \'s", string)
#     string = re.sub(r"\'ve", " \'ve", string)
#     string = re.sub(r"n\'t", " n\'t", string)
#     string = re.sub(r"\'re", " \'re", string)
#     string = re.sub(r"\'d", " \'d", string)
#     string = re.sub(r"\'ll", " \'ll", string)
#     string = re.sub(r",", " , ", string)
#     string = re.sub(r"!", " ! ", string)
#     string = re.sub(r"\(", " \( ", string)
#     string = re.sub(r"\)", " \) ", string)
#     string = re.sub(r"\?", " \? ", string)
#     string = re.sub(r"\s{2,}", " ", string)
#     return string.strip().lower()

In [15]:
def filter_text(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9().,!?'`` and the backtick becomes a space;
      2. the clitics 's, 've, 'm, 're, 'd, 'll are split off with a leading space;
      3. negative contractions are expanded ("can't" -> "can not",
         "ain't" -> "are not", any other "...n't" -> "... not");
      4. ``, ! ( ) ?`` are surrounded by spaces;
      5. whitespace runs are collapsed and the result is stripped and lower-cased.

    Contractions are matched on word boundaries and case-insensitively, so they are
    also expanded at the start or end of the text, before punctuation, and when
    capitalised. (Matching only the space-delimited lower-case forms turned
    "Can't stop" into "ca not stop" and left "don't." untouched.)

    Args:
        string: raw document text.

    Returns:
        The cleaned, lower-cased text.
    """
    string = re.sub(r"[^A-Za-z0-9().,!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # The two irregular forms must be handled before the generic "n't" rule.
    string = re.sub(r"\bcan't\b", "can not", string, flags=re.IGNORECASE)
    string = re.sub(r"\bain't\b", "are not", string, flags=re.IGNORECASE)
    string = re.sub(r"n't\b", " not", string, flags=re.IGNORECASE)
    string = string.replace(",", " , ")
    string = string.replace("!", " ! ")
    string = string.replace("(", " ( ")
    string = string.replace(")", " ) ")
    string = string.replace("?", " ? ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [16]:
texts, labels = load_dataset(dataset)

In [17]:
# handle texts
texts_clean = [clean_str_simple_version(filter_text(t)) for t in texts]

In [18]:
texts_clean[1150]

'surfactant treatment of full term newborns with respiratory failure. surfactant inactivation has been shown to be a significant factor in animal models of lung injury and may also be important in some forms of respiratory failure in full term newborns. fourteen full term newborns with respiratory failure associated with pneumonia ( 7 patients ) and meconium aspiration syndrome ( 7 patients ) were treated with 90 mg kg of a calf lung surfactant extract , given intratracheally up to every 6 hours for a maximum of four doses. the group mean fraction of inspired oxygen ( fi02 ) before treatment was 0.99 0.01 sem , and the mean airway pressure ( map ) was 14.6 1.0 cm h2o. patients showed significant improvement in oxygenation after initial surfactant treatment , with the arterial alveolar oxygenation ratio ( a a ratio ) rising from 0.09 0.01 before surfactant treatment to 0.22 0.05 by 15 minutes ( p .03 ) and remaining improved for 6 hours. the oxygenation index , incorporating map as well

In [19]:
texts[1150]

'Surfactant treatment of full-term newborns with respiratory failure.  Surfactant inactivation has been shown to be a significant factor in animal models of lung injury and may also be important in some forms of respiratory failure in full-term newborns.  Fourteen full-term newborns with respiratory failure associated with pneumonia (7 patients) and meconium aspiration syndrome (7 patients) were treated with 90 mg/kg of a calf lung surfactant extract, given intratracheally up to every 6 hours for a maximum of four doses.  The group mean fraction of inspired oxygen (FI02) before treatment was 0.99 +/- 0.01 SEM, and the mean airway pressure (MAP) was 14.6 +/- 1.0 cm H2O.  Patients showed significant improvement in oxygenation after initial surfactant treatment, with the arterial-alveolar oxygenation ratio (a/A ratio) rising from 0.09 +/- 0.01 before surfactant treatment to 0.22 +/- 0.05 by 15 minutes (P = .03) and remaining improved for 6 hours.  The oxygenation index, incorporating MAP 

In [20]:
def text_split_pos_edge(texts_clean):
    """Parse every document with the module-level stanza pipeline ``nlp``.

    Only the first sentence of each parsed document is read. A document that is
    blank, or for which the parser returns no sentence, yields three empty lists
    instead of raising ``IndexError``.

    Args:
        texts_clean: iterable of cleaned document strings.

    Returns:
        Three parallel lists with one entry per document:
          * the token texts,
          * the universal POS tag of each token,
          * ``(head, id)`` pairs, using the parser's word ids, for every token whose
            dependency relation is not listed in ``anti_rel``.
    """
    # Dependency relations whose edges are dropped. 'root' is included, so no kept
    # edge has head 0.
    anti_rel = {'punct', 'cop', 'case', 'det', 'root', 'cc', 'mark', 'aux',
                'advmod', 'fixed', 'nummod', 'aux:pass'}
    texts_split = []
    texts_pos = []
    texts_edge = []
    for text in tqdm(texts_clean):
        # Skip the parser entirely for blank input; otherwise guard against an empty parse.
        doc = nlp(text) if text.strip() else None
        words = doc.sentences[0].words if doc is not None and doc.sentences else []
        texts_split.append([word.text for word in words])
        texts_pos.append([word.upos for word in words])
        texts_edge.append([(word.head, word.id) for word in words if word.deprel not in anti_rel])
    return texts_split, texts_pos, texts_edge

In [None]:
split, pos, edge = text_split_pos_edge(texts_clean)

In [None]:
joblib.dump(split, f"{save_path}/{dataset}.split.pkl")
joblib.dump(pos, f"{save_path}/{dataset}.pos.pkl")
joblib.dump(edge, f"{save_path}/{dataset}.edge.pkl")

#### 读取

In [21]:
split = joblib.load(f"{save_path}/{dataset}.split.pkl")
edge = joblib.load(f"{save_path}/{dataset}.edge.pkl")
pos= joblib.load(f"{save_path}/{dataset}.pos.pkl")

In [22]:
# Corpus-wide frequency of every token produced by the parser.
word2count = Counter([w for t in split for w in t])
# Keep tokens that occur at least `least_freq` times and are not stop words.
word_count = [[w, c] for w, c in word2count.items() if c >= least_freq and w not in stop_words]
# Kept token -> contiguous id, following the order of `word_count`.
word2index = {w: i for i, (w, c) in enumerate(word_count)}

In [23]:
def words_split_pos_edge(split, pos, edge, word2index):
    """Drop out-of-vocabulary tokens and renumber the dependency edges to match.

    For every document the tokens missing from ``word2index`` are removed together
    with their POS tags. Edges touching a removed token are discarded; the endpoints
    of the remaining edges are shifted down by the number of removed tokens that
    precede them, so they keep pointing at the same tokens in the shortened
    sequence. Endpoints are 1-based token positions; an endpoint of 0 is left as 0.

    The renumbering is done in a single pass per document rather than rebuilding the
    whole edge list once for every removed token.

    Args:
        split: list of token lists, one per document.
        pos: list of POS-tag lists, parallel to ``split``.
        edge: list of ``(head, dependent)`` pair lists, parallel to ``split``.
        word2index: vocabulary; only membership is checked.

    Returns:
        ``(split_list, pos_list, edge_list)`` with the filtered tokens, tags and
        edges. A document without removed tokens keeps its original edge list object.
    """
    split_list = []
    pos_list = []
    edge_list = []
    for tokens, tags, edges in tqdm(zip(split, pos, edge)):
        n_tokens = len(tokens)
        kept_tokens = []
        kept_tags = []
        dropped = set()  # 1-based positions of removed tokens
        # removed_before[j] = number of removed tokens at 1-based positions < j
        removed_before = [0] * (n_tokens + 2)
        removed = 0
        for i in range(n_tokens):
            removed_before[i + 1] = removed
            if tokens[i] in word2index:
                kept_tokens.append(tokens[i])
                kept_tags.append(tags[i])
            else:
                dropped.add(i + 1)
                removed += 1
        removed_before[n_tokens + 1] = removed

        if dropped:
            # Clamp the lookup so endpoints outside 1..n_tokens are shifted by the
            # number of removed tokens before them, like in-range endpoints.
            edges = [
                (head - removed_before[min(max(head, 0), n_tokens + 1)],
                 dep - removed_before[min(max(dep, 0), n_tokens + 1)])
                for head, dep in edges
                if head not in dropped and dep not in dropped
            ]
        split_list.append(kept_tokens)
        pos_list.append(kept_tags)
        edge_list.append(edges)
    return split_list, pos_list, edge_list

In [24]:
# normalize
def normalize_adj(adj):
    """Scale an adjacency matrix by the inverse square root of its row sums.

    Computes ``D^-1/2 . A . D^-1/2`` where ``D`` is the diagonal matrix of row sums of
    ``adj``, so entry ``[x][y]`` of the result is the scaled weight of the edge stored
    at ``adj[x, y]``. Rows that sum to zero get a scaling factor of 0.

    The product is formed without transposing ``adj``. For a symmetric matrix the
    transpose makes no difference, but for a directed graph it would place the weight
    of edge (x, y) at position ``[y][x]`` and leave ``[x][y]`` at 0 whenever the
    reverse edge is absent.

    Args:
        adj: square adjacency matrix (scipy sparse matrix or dense array).

    Returns:
        The normalized adjacency; dense when ``adj`` is a scipy sparse matrix.
    """
    row_sum = np.array(adj.sum(1))
    # ignore divide-by-zero: zero row sums produce inf, which is reset to 0 below
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = np.diag(d_inv_sqrt)
    # adj.dot(dense) is evaluated first so a sparse `adj` performs the product itself.
    adj_normalized = d_mat_inv_sqrt.dot(adj.dot(d_mat_inv_sqrt))
    return adj_normalized

In [25]:
def pad_seq(seq, pad_len):
    """Return ``seq`` cut or zero-padded on the right to ``pad_len`` items.

    Args:
        seq: list to adjust.
        pad_len: target length.

    Returns:
        ``seq[:pad_len]`` when ``seq`` is longer than ``pad_len``; otherwise a new
        list made of ``seq`` followed by as many ``0`` entries as are missing.
    """
    missing = pad_len - len(seq)
    if missing < 0:
        return seq[:pad_len]
    return seq + [0] * missing

In [26]:
# 7400 bit
split_list, pos_list, edge_list = words_split_pos_edge(split, pos, edge, word2index)

7400it [01:00, 121.58it/s]


In [27]:
# Frequency of every POS tag that remains after vocabulary filtering.
pos2count = Counter([w for t in pos_list for w in t])
pos_count = [[w, c] for w, c in pos2count.items()]
# POS tag -> contiguous id, following the order of `pos_count`.
pos2index = {w: i for i, (w, c) in enumerate(pos_count)}

In [28]:
# Documents re-joined from their in-vocabulary tokens.
texts_remove = [" ".join(ws) for ws in split_list]

# labels 2 targets
# The distinct labels are sorted before being numbered. Iterating a bare set of strings
# depends on string hashing, which Python randomizes per process, so without the sort
# the class index assigned to each label could change between kernel runs.
label2index = {l: i for i, l in enumerate(sorted(set(labels)))}
targets = [label2index[l] for l in labels]

In [29]:
# save
with open(f"{save_path}/{dataset}.texts.clean.txt", "w") as f:
    f.write("\n".join(texts_clean))

with open(f"{save_path}/{dataset}.texts.remove.txt", "w") as f:
    f.write("\n".join(texts_remove))

np.save(f"{save_path}/{dataset}.targets.npy", targets)
joblib.dump(word2index, f"{save_path}/{dataset}.word2index.pkl")
joblib.dump(pos2index, f"{save_path}/{dataset}.pos2index.pkl")
joblib.dump(pos_list, f"{save_path}/{dataset}.texts.pos.pkl")

['nlp_temp_v2/ohsumed.texts.pos.pkl']

In [30]:
# Vocabulary: token -> index
word2index = joblib.load(f"{save_path}/{dataset}.word2index.pkl")
# POS vocabulary: tag -> index
pos2index = joblib.load(f"{save_path}/{dataset}.pos2index.pkl")
# # Dataset
# with open(f"nlp_temp/{dataset}.texts.remove.txt", "r") as f:
#     texts = f.read().strip().split("\n")
# Per-document POS tag sequences saved by the earlier save cell
pos = joblib.load(f"{save_path}/{dataset}.texts.pos.pkl")

In [31]:
# glove
all_vectors = np.load(f"source/glove.6B.{embedding_dim}d.npy")
all_words = joblib.load(f"source/glove.6B.words.pkl")
all_word2index = {w: i for i, w in enumerate(all_words)}
all_word2index_keys = list(all_word2index.keys())

In [32]:
same_remain2index = []
for aw in all_word2index_keys:
    if word2index.get(aw) is not None:
        same_remain2index.append(word2index[aw])

In [33]:
same_del = [',', '!', '(', ')', '?','\'']
same_del2index = []
for sd in same_del:
    if word2index.get(sd) is not None:
        same_del2index.append(word2index[sd])

In [34]:
def check_same(target, words):
    """Return the positions in ``words`` whose value equals ``target``, in ascending order."""
    return [position for position, value in enumerate(words) if value == target]

In [35]:
def connect_same(words, same_del2index, same_remain2index):
    """Build edges between all occurrences of the same word inside one document.

    A word id is considered only if it is absent from ``same_del2index`` and present
    in ``same_remain2index``. For each such id occurring more than once, every
    ordered pair of its positions is emitted.

    Args:
        words: sequence of word ids for one document.
        same_del2index: ids that must never be linked.
        same_remain2index: ids that are eligible for linking.

    Returns:
        List of ``(position_a, position_b)`` tuples.
    """
    same_word_edges = []
    for word_id in set(words):
        if word_id in same_del2index or word_id not in same_remain2index:
            continue
        positions = check_same(word_id, words)
        if len(positions) > 1:
            same_word_edges.extend(itertools.permutations(positions, 2))
    return same_word_edges

In [36]:
# 建图
inputs = []
graphs = []
inputs_pos = []

In [37]:
# Build one graph per document and append the results to the module-level lists
# `inputs`, `graphs` and `inputs_pos`. Because the cell appends, running it twice
# without re-initialising those lists duplicates every entry.
for i in tqdm(range(0, len(split_list))):
    parse = edge_list[i]
    # Token ids and POS ids, both truncated to max_text_len.
    words = [word2index[w] for w in split_list[i]]
    words = words[:max_text_len]
    
    poses = [pos2index[w] for w in pos_list[i]]
    poses = poses[:max_text_len]
    
    nodes = words
    edges = []
    # Sliding-window edges: each token is linked to every other token at distance
    # <= window_size. NOTE: this inner loop reuses the name `i`; that is harmless only
    # because the outer index is not read again after this point.
    for i in range(len(words)):
        center = i
        for j in range(i - window_size, i + window_size + 1):
            if i != j and 0 <= j < len(words):
                neighbor = j
                edges.append((center, neighbor))
                
#     cs = connect_same(words, same_del2index, same_remain2index)
#     edges += cs
    
    # Dependency edges: the parser pairs are shifted by -1 to become 0-based. Pairs with
    # an endpoint of -1 after the shift (i.e. 0 before it) or with an endpoint beyond the
    # truncation limit are skipped. The pair is stored reversed, as (p[1], p[0]).
    for p in parse:
        p = list(p)
        p[0] = p[0] - 1
        p[1] = p[1] - 1
        if p[0] != -1 and p[1] != -1 and p[0] < max_text_len and p[1] < max_text_len:
            edges.append((p[1], p[0]))
    # De-duplicate edges
    edges = list(set(edges))
    
    # The edges were just de-duplicated, so every count produced here is 1.
    edge_count = Counter(edges).items()
    # Adjacency matrix
    row = [x for (x, y), c in edge_count]
    col = [y for (x, y), c in edge_count]
    weight = [c for (x, y), c in edge_count]
    # Normalize
    adj = sp.csr_matrix((weight, (row, col)), shape=(len(nodes), len(nodes)))
    adj_normalized = normalize_adj(adj)
    # Read back the normalized value at each stored (row, col) position.
    weight_normalized = [adj_normalized[x][y] for (x, y), c in edge_count]
    # Store nodes and graph
    inputs.append(nodes)
    graphs.append([row, col, weight_normalized])
    inputs_pos.append(poses)

100%|██████████| 7400/7400 [00:16<00:00, 457.08it/s]


In [38]:
len_inputs = [len(e) for e in inputs]
len_graphs = [len(x) for x, y, c in graphs]

In [39]:
# padding input
pad_len_inputs = max(len_inputs)
pad_len_graphs = max(len_graphs)
inputs_pad = [pad_seq(e, pad_len_inputs) for e in tqdm(inputs)]
graphs_pad = [[pad_seq(ee, pad_len_graphs) for ee in e] for e in tqdm(graphs)]
poses_pad = [pad_seq(e, pad_len_inputs) for e in tqdm(inputs_pos)]

100%|██████████| 7400/7400 [00:00<00:00, 190241.19it/s]
100%|██████████| 7400/7400 [00:00<00:00, 9344.82it/s] 
100%|██████████| 7400/7400 [00:00<00:00, 168639.06it/s]


In [40]:
inputs_pad = np.array(inputs_pad)
weights_pad = np.array([c for x, y, c in graphs_pad])
graphs_pad = np.array([[x, y] for x, y, c in graphs_pad])
poses_pad = np.array(poses_pad)

In [41]:
# Word embedding table aligned with word2index, filled from the pretrained GloVe vectors
index2word = {i: w for w, i in word2index.items()}
word_set = [index2word[i] for i in range(len(index2word))]
# A dedicated seeded generator keeps the saved embedding file identical across runs;
# drawing from the unseeded global RNG gave a different vector on every execution.
oov_rng = np.random.RandomState(0)
# One shared vector is used for every vocabulary word that has no pretrained vector.
oov = oov_rng.normal(-0.01, 0.01, embedding_dim)
word2vec = [all_vectors[all_word2index[w]] if w in all_word2index else oov for w in word_set]
# word2vec = [all_vectors[all_word2index[w]] if w in all_word2index else np.random.normal(-0.01, 0.01, embedding_dim) for w in word_set]
# Extra all-zero row appended at index len(word_set).
# NOTE(review): pad_seq pads with 0, which is also the id of a real word - confirm which
# index the downstream model treats as padding.
word2vec.append(np.zeros(embedding_dim))

In [42]:
# pos2vec: one randomly initialised vector per POS tag, ordered by pos2index
index2pos = {i: w for w, i in pos2index.items()}
pos_set = [index2pos[i] for i in range(len(index2pos))]
# A dedicated seeded generator keeps the saved POS vectors identical across runs;
# drawing from the unseeded global RNG gave different vectors on every execution.
pos_rng = np.random.RandomState(0)
pos2vec = [pos_rng.normal(-0.1, 0.1, embedding_dim) for w in pos_set]

In [43]:
# save
joblib.dump(len_inputs, f"{save_path}/{dataset}.len.inputs.pkl")
joblib.dump(len_graphs, f"{save_path}/{dataset}.len.graphs.pkl")
np.save(f"{save_path}/{dataset}.inputs.npy", inputs_pad)
np.save(f"{save_path}/{dataset}.graphs.npy", graphs_pad)
np.save(f"{save_path}/{dataset}.weights.npy", weights_pad)
np.save(f"{save_path}/{dataset}.word2vec.npy", word2vec)
np.save(f"{save_path}/{dataset}.poses_pad.npy", poses_pad)
np.save(f"{save_path}/{dataset}.pos2vec.npy", pos2vec)

In [36]:
doc = nlp(texts_clean[0])

In [37]:
print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

id: 1	word: superficial	head id: 2	head: cultures	deprel: amod
id: 2	word: cultures	head id: 0	head: root	deprel: root
id: 3	word: in	head id: 6	head: evaluations	deprel: case
id: 4	word: neonatal	head id: 6	head: evaluations	deprel: amod
id: 5	word: sepsis	head id: 6	head: evaluations	deprel: compound
id: 6	word: evaluations	head id: 2	head: cultures	deprel: nmod
id: 7	word: .	head id: 2	head: cultures	deprel: punct
id: 8	word: impact	head id: 2	head: cultures	deprel: appos
id: 9	word: on	head id: 11	head: decision	deprel: case
id: 10	word: antibiotic	head id: 11	head: decision	deprel: amod
id: 11	word: decision	head id: 8	head: impact	deprel: nmod
id: 12	word: making	head id: 8	head: impact	deprel: acl
id: 13	word: .	head id: 16	head: performed	deprel: punct
id: 14	word: the	head id: 15	head: authors	deprel: det
id: 15	word: authors	head id: 16	head: performed	deprel: nsubj
id: 16	word: performed	head id: 2	head: cultures	deprel: parataxis
id: 17	word: a	head id: 19	head: analysis	de

In [31]:
for word in doc.sentences[0].words:
    print(f'word.id:{word.id} head.id:{word.head}')

word.id:1 head.id:2
word.id:2 head.id:0
word.id:3 head.id:6
word.id:4 head.id:6
word.id:5 head.id:6
word.id:6 head.id:2
word.id:7 head.id:2
word.id:8 head.id:2
word.id:9 head.id:11
word.id:10 head.id:11
word.id:11 head.id:8
word.id:12 head.id:8
word.id:13 head.id:16
word.id:14 head.id:15
word.id:15 head.id:16
word.id:16 head.id:2
word.id:17 head.id:19
word.id:18 head.id:19
word.id:19 head.id:16
word.id:20 head.id:23
word.id:21 head.id:23
word.id:22 head.id:23
word.id:23 head.id:19
word.id:24 head.id:26
word.id:25 head.id:26
word.id:26 head.id:19
word.id:27 head.id:29
word.id:28 head.id:29
word.id:29 head.id:26
word.id:30 head.id:26
word.id:31 head.id:35
word.id:32 head.id:35
word.id:33 head.id:34
word.id:34 head.id:35
word.id:35 head.id:30
word.id:36 head.id:39
word.id:37 head.id:39
word.id:38 head.id:39
word.id:39 head.id:35
word.id:40 head.id:2
word.id:41 head.id:42
word.id:42 head.id:16
word.id:43 head.id:44
word.id:44 head.id:42
word.id:45 head.id:42
word.id:46 head.id:47
word.id:4

In [34]:
[word.upos for word in doc.sentences[0].words]

['ADJ',
 'NOUN',
 'ADP',
 'ADJ',
 'NOUN',
 'NOUN',
 'PUNCT',
 'NOUN',
 'ADP',
 'ADJ',
 'NOUN',
 'VERB',
 'PUNCT',
 'DET',
 'NOUN',
 'VERB',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'ADJ',
 'ADJ',
 'NOUN',
 'CCONJ',
 'PRON',
 'NOUN',
 'ADP',
 'ADJ',
 'NOUN',
 'VERB',
 'ADP',
 'DET',
 'NUM',
 'NOUN',
 'NOUN',
 'ADP',
 'PROPN',
 'ADJ',
 'NOUN',
 'PUNCT',
 'PRON',
 'VERB',
 'CCONJ',
 'VERB',
 'NOUN',
 'ADP',
 'NOUN',
 'PUNCT',
 'NOUN',
 'NUM',
 'PUNCT',
 'AUX',
 'VERB',
 'ADJ',
 'PUNCT',
 'NOUN',
 'CCONJ',
 'CCONJ',
 'ADJ',
 'NOUN',
 'PUNCT',
 'CCONJ',
 'ADJ',
 'PUNCT',
 'NOUN',
 'CCONJ',
 'ADJ',
 'NOUN',
 'PUNCT',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'ADP',
 'ADJ',
 'ADJ',
 'NOUN',
 'PUNCT',
 'ADJ',
 'NOUN',
 'AUX',
 'ADJ',
 'ADP',
 'NOUN',
 'PUNCT',
 'DET',
 'NOUN',
 'CCONJ',
 'ADJ',
 'NOUN',
 'NOUN',
 'PUNCT',
 'ADP',
 'NUM',
 'PUNCT',
 'NUM',
 'NUM',
 'PUNCT',
 'ADP',
 'NOUN',
 'PUNCT',
 'ADJ',
 'NOUN',
 'NOUN',
 'AUX',
 'VERB',
 'ADP',
 'ADV',
 'NUM',
 'ADP',
 'DET',
 'NOUN',
 'PUNCT',
 'CCONJ

In [20]:
[word.text for word in doc.sentences[0].words]

['superficial',
 'cultures',
 'in',
 'neonatal',
 'sepsis',
 'evaluations',
 '.',
 'impact',
 'on',
 'antibiotic',
 'decision',
 'making',
 '.',
 'the',
 'authors',
 'performed',
 'a',
 'retrospective',
 'analysis',
 'of',
 'neonatal',
 'superficial',
 'cultures',
 'and',
 'their',
 'effect',
 'on',
 'antimicrobial',
 'decision',
 'making',
 'during',
 'a',
 'nine',
 'month',
 'period',
 'at',
 'nashville',
 'general',
 'hospital',
 '.',
 'they',
 'obtained',
 'and',
 'reviewed',
 'charts',
 'of',
 'infants',
 '(',
 'n',
 '66',
 ')',
 'having',
 'paired',
 'superficial',
 '(',
 'skin',
 'and',
 'or',
 'gastric',
 'aspirate',
 ')',
 'and',
 'deep',
 '(',
 'blood',
 'and',
 'cerebrospinal',
 'fluid',
 ')',
 'cultures',
 'for',
 'the',
 'evaluation',
 'of',
 'early',
 'onset',
 'sepsis',
 '.',
 'superficial',
 'cultures',
 'were',
 'positive',
 'for',
 'pathogens',
 '(',
 'any',
 'streptococcus',
 'or',
 'enteric',
 'gram',
 'negative',
 ')',
 'in',
 '15',
 '(',
 '10',
 '66',
 ')',
 'of',