In [None]:
import joblib
from tqdm import tqdm
import scipy.sparse as sp
from collections import Counter
import numpy as np
import itertools

In [2]:
# Dataset name and the directory that holds the preprocessed artifacts
dataset = "ohsumed"
save_path = 'temp_v1'

In [3]:
# Parameters
# window_size: how many positions on each side of a token are linked to it
window_size = 3
# embedding_dim: vector size; also selects the GloVe file that is loaded later
embedding_dim = 300
# max_text_len: documents are truncated to this many tokens before graph building
max_text_len = 300

In [4]:
# normalize
def normalize_adj(adj):
    """Scale an adjacency matrix by the inverse square root of its row sums.

    With ``D = diag(rowsum(adj) ** -0.5)`` the result is
    ``(adj . D)^T . D``, i.e. ``D . adj^T . D``; for a symmetric ``adj``
    this equals ``D . adj . D``.  Rows that sum to zero get a scaling
    factor of 0 rather than ``inf``.

    Args:
        adj: square matrix exposing ``.sum(1)`` and ``.dot`` (the notebook
            passes a ``scipy.sparse.csr_matrix``).

    Returns:
        The normalized matrix as produced by ``.dot`` with a dense diagonal
        matrix.
    """
    degree = np.array(adj.sum(1))
    # 0 ** -0.5 evaluates to inf; silence the warning and zero it just below.
    with np.errstate(divide='ignore'):
        inv_sqrt_degree = np.power(degree, -0.5).flatten()
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = np.diag(inv_sqrt_degree)
    column_scaled = adj.dot(scaling)
    return column_scaled.transpose().dot(scaling)

In [5]:
def pad_seq(seq, pad_len):
    """Return ``seq`` cut or right-padded with zeros to exactly ``pad_len`` items.

    Args:
        seq: list to adjust.
        pad_len: target length.

    Returns:
        A new list of length ``pad_len``: the first ``pad_len`` items when
        ``seq`` is longer, otherwise ``seq`` followed by ``0`` fillers.
    """
    missing = pad_len - len(seq)
    if missing < 0:
        return seq[:pad_len]
    return seq + [0] * missing

In [6]:
def check_same(target, words):
    """Return the positions in ``words`` whose element equals ``target``.

    Args:
        target: value to look for.
        words: sequence to scan.

    Returns:
        List of matching indices in ascending order (empty when none match).
    """
    return [position for position, item in enumerate(words) if item == target]

In [7]:
def connect_same(words):
    """Return directed edges between every pair of positions holding the same token.

    Args:
        words: sequence of hashable tokens (the notebook passes word indices).

    Returns:
        List of ``(i, j)`` position tuples with ``i != j`` and
        ``words[i] == words[j]``; both directions of each pair are included.
        Empty when no token repeats.
    """
    # Group positions by token in a single pass instead of rescanning the
    # whole sequence once per distinct token.
    positions = {}
    for index, token in enumerate(words):
        positions.setdefault(token, []).append(index)

    temp_edges = []
    # Walk the distinct tokens in set order so the emitted edge order matches
    # grouping via list(set(words)).
    for token in set(words):
        same = positions[token]
        if len(same) > 1:
            temp_edges.extend(itertools.permutations(same, 2))
    return temp_edges

In [8]:
# Vocabulary: word -> integer index
word2index = joblib.load(f"{save_path}/{dataset}.word2index.pkl")
# POS-tag vocabulary: tag -> integer index
pos2index = joblib.load(f"{save_path}/{dataset}.pos2index.pkl")
# Filtered corpus, one document per line
# NOTE(review): .strip() removes leading/trailing blank lines, so an empty first
# or last document would shift `texts` relative to `pos` — confirm none exist.
with open(f"{save_path}/{dataset}.texts.remove.txt", "r") as f:
    texts = f.read().strip().split("\n")
# Per-document POS tag lists; indexed in parallel with `texts` when building graphs
pos = joblib.load(f"{save_path}/{dataset}.texts.pos.pkl")

In [9]:
# Graph building: per-document accumulators filled by the loop in the next cell
inputs = []      # node (word id) sequence of each document
graphs = []      # [row, col, normalized weight] edge lists of each document
inputs_pos = []  # POS-tag id sequence of each document

In [10]:
# Build one token-position graph per document.  The document index has its own
# name so the window loops below cannot rebind it.
for doc_idx in tqdm(range(len(texts))):
    # Map tokens / POS tags to their integer ids and cap the document length.
    words = [word2index[w] for w in texts[doc_idx].split()][:max_text_len]
    poses = [pos2index[p] for p in pos[doc_idx]][:max_text_len]

    # One node per token position (a repeated word yields several nodes).
    nodes = words
    num_nodes = len(nodes)

    # Sliding-window edges: link each position to the positions within
    # `window_size` steps on either side, clipped to the document bounds.
    edges = []
    for center in range(num_nodes):
        first = max(0, center - window_size)
        last = min(num_nodes, center + window_size + 1)
        for neighbor in range(first, last):
            if neighbor != center:
                edges.append((center, neighbor))
    # Additionally link every pair of positions that hold the same word.
    cs = connect_same(words)
    edges += cs
    # Identical edges collapse into one entry; its count becomes the raw weight.
    edge_count = Counter(edges).items()

    # Adjacency matrix as (row, col, weight) triplets
    row = [x for (x, _), _ in edge_count]
    col = [y for (_, y), _ in edge_count]
    weight = [count for _, count in edge_count]

    # Normalize, then read back the normalized weight of each edge
    adj = sp.csr_matrix((weight, (row, col)), shape=(num_nodes, num_nodes))
    adj_normalized = normalize_adj(adj)
    # Direct [x, y] element access instead of building a row with [x] first.
    weight_normalized = [adj_normalized[x, y] for x, y in zip(row, col)]

    # Store nodes, graph and POS ids for this document
    inputs.append(nodes)
    graphs.append([row, col, weight_normalized])
    inputs_pos.append(poses)

100%|██████████| 7400/7400 [00:23<00:00, 311.21it/s]


In [11]:
# Number of nodes in each document
len_inputs = [len(e) for e in inputs]
# Number of edges in each document (length of the graph's row list)
len_graphs = [len(x) for x, y, c in graphs]

In [12]:
# padding input
# Pad everything to the longest document (nodes) / largest graph (edges) so the
# lists become rectangular; pad_seq fills with 0.
pad_len_inputs = max(len_inputs)
pad_len_graphs = max(len_graphs)
inputs_pad = [pad_seq(e, pad_len_inputs) for e in tqdm(inputs)]
# Each graph is [row, col, weight]; all three lists are padded to the same length.
graphs_pad = [[pad_seq(ee, pad_len_graphs) for ee in e] for e in tqdm(graphs)]
poses_pad = [pad_seq(e, pad_len_inputs) for e in tqdm(inputs_pos)]

100%|██████████| 7400/7400 [00:00<00:00, 150061.64it/s]
100%|██████████| 7400/7400 [00:01<00:00, 5717.04it/s]
100%|██████████| 7400/7400 [00:00<00:00, 240193.85it/s]


In [13]:
# Convert the padded lists to arrays.  Weights are split out of graphs_pad first,
# because the next line rebinds graphs_pad to the [row, col] index pairs only.
inputs_pad = np.array(inputs_pad)
weights_pad = np.array([c for x, y, c in graphs_pad])
graphs_pad = np.array([[x, y] for x, y, c in graphs_pad])
poses_pad = np.array(poses_pad)

In [14]:
# word2vec
# Build the embedding table for the corpus vocabulary from pre-converted GloVe files.
all_vectors = np.load(f"source/glove.6B.{embedding_dim}d.npy")
all_words = joblib.load(f"source/glove.6B.words.pkl")
all_word2index = {w: i for i, w in enumerate(all_words)}
# Invert word2index so vocabulary words can be listed in index order.
index2word = {i: w for w, i in word2index.items()}
word_set = [index2word[i] for i in range(len(index2word))]
# A single random vector shared by every out-of-vocabulary word.
# NOTE(review): normal(-0.01, 0.01) means mean=-0.01, std=0.01, and no seed is
# set — confirm this (rather than e.g. uniform(-0.01, 0.01)) is intended.
oov = np.random.normal(-0.01, 0.01, embedding_dim)
word2vec = [all_vectors[all_word2index[w]] if w in all_word2index else oov for w in word_set]
# Extra all-zero row at index len(word_set).
# NOTE(review): pad_seq pads with 0, not with this index — confirm which index
# the downstream model treats as padding.
word2vec.append(np.zeros(embedding_dim))

In [15]:
# pos2vec
# Invert pos2index so POS tags can be listed in index order.
index2pos = {i: w for w, i in pos2index.items()}

In [16]:
# POS tags ordered by their integer index
pos_set = [index2pos[i] for i in range(len(index2pos))]

In [17]:
# One independent random vector per POS tag.
# NOTE(review): normal(-0.1, 0.1) means mean=-0.1, std=0.1, and no seed is set,
# so the table differs on every run — confirm this is intended.
pos2vec = [np.random.normal(-0.1, 0.1, embedding_dim) for w in pos_set]

In [18]:
# save
# Unpadded lengths (node and edge counts per document)
joblib.dump(len_inputs, f"{save_path}/{dataset}.len.inputs.pkl")
joblib.dump(len_graphs, f"{save_path}/{dataset}.len.graphs.pkl")
# Padded node ids, edge index pairs and normalized edge weights
np.save(f"{save_path}/{dataset}.inputs.npy", inputs_pad)
np.save(f"{save_path}/{dataset}.graphs.npy", graphs_pad)
np.save(f"{save_path}/{dataset}.weights.npy", weights_pad)
# Embedding tables (lists of vectors; np.save converts them to arrays)
np.save(f"{save_path}/{dataset}.word2vec.npy", word2vec)
np.save(f"{save_path}/{dataset}.poses_pad.npy", poses_pad)
np.save(f"{save_path}/{dataset}.pos2vec.npy", pos2vec)

In [9]:
# Old approach (scratch): one graph node per distinct word

In [10]:
# Nodes: the set of distinct word ids.
# `words` is whatever the kernel currently holds (presumably the last document
# processed by the graph-building loop — verify before relying on it).
nodes = list(set(words))

In [11]:
# Node order: word id -> position in `nodes`
node2index = {e: i for i, e in enumerate(nodes)}

In [12]:
# Edges
edges = []

In [13]:
# Sliding-window edges between distinct-word nodes: every co-occurrence within
# `window_size` positions adds one (center, neighbor) pair.
for i in range(len(words)):
    # Node index of the center word in node2index
    center = node2index[words[i]]
    for j in range(i - window_size, i + window_size + 1):
        if i != j and 0 <= j < len(words):
            neighbor = node2index[words[j]]
            edges.append((center, neighbor))
# Repeated pairs are counted; the count is the edge weight.
edge_count = Counter(edges).items()

In [52]:
# New approach (scratch): one graph node per token position

In [9]:
# Nodes: one per token position (no de-duplication)
nodes = words
# Edges
edges = []

In [15]:
# Same edge construction as the per-document loop, run once on the current `words`:
# window edges between positions, then edges between positions sharing a word.
for i in range(len(words)):
    center = i
    for j in range(i - window_size, i + window_size + 1):
        if i != j and 0 <= j < len(words):
            neighbor = j
            edges.append((center, neighbor))
cs = connect_same(words)
edges += cs

In [16]:
# Collapse duplicate edges; each count becomes the raw edge weight.
edge_count = Counter(edges).items()

In [17]:
# Inspect the ((source, target), count) pairs; the recorded output is truncated.
edge_count

dict_items([((0, 1), 2), ((0, 2), 2), ((0, 3), 2), ((1, 0), 2), ((1, 2), 2), ((1, 3), 2), ((1, 4), 2), ((2, 0), 2), ((2, 1), 2), ((2, 3), 2), ((2, 4), 2), ((2, 5), 2), ((3, 0), 2), ((3, 1), 2), ((3, 2), 2), ((3, 4), 2), ((3, 5), 2), ((3, 6), 2), ((4, 1), 2), ((4, 2), 2), ((4, 3), 2), ((4, 5), 2), ((4, 6), 2), ((4, 7), 2), ((5, 2), 2), ((5, 3), 2), ((5, 4), 2), ((5, 6), 2), ((5, 7), 2), ((5, 8), 2), ((6, 3), 2), ((6, 4), 2), ((6, 5), 2), ((6, 7), 2), ((6, 8), 2), ((6, 9), 2), ((7, 4), 2), ((7, 5), 2), ((7, 6), 2), ((7, 8), 2), ((7, 9), 2), ((7, 10), 2), ((8, 5), 2), ((8, 6), 2), ((8, 7), 2), ((8, 9), 2), ((8, 10), 2), ((8, 11), 2), ((9, 6), 2), ((9, 7), 2), ((9, 8), 2), ((9, 10), 2), ((9, 11), 2), ((9, 12), 2), ((10, 7), 2), ((10, 8), 2), ((10, 9), 2), ((10, 11), 2), ((10, 12), 2), ((10, 13), 2), ((11, 8), 2), ((11, 9), 2), ((11, 10), 2), ((11, 12), 2), ((11, 13), 2), ((11, 14), 2), ((12, 9), 2), ((12, 10), 2), ((12, 11), 2), ((12, 13), 2), ((12, 14), 2), ((12, 15), 2), ((13, 10), 2), (

In [18]:
# Adjacency matrix as (row, col, weight) triplets
row = [x for (x, y), c in edge_count]
col = [y for (x, y), c in edge_count]
weight = [c for (x, y), c in edge_count]

In [19]:
# Normalization input: sparse adjacency matrix with one row/column per node
adj = sp.csr_matrix((weight, (row, col)), shape=(len(nodes), len(nodes)))

In [20]:
# Scale by the inverse square root of the row sums (see normalize_adj).
adj_normalized = normalize_adj(adj)

In [22]:
# Normalized weight of every edge, in the same order as row/col.
weight_normalized = [adj_normalized[x][y] for (x, y), c in edge_count]

In [48]:
# Scratch check: permutations yields both directions of a pair (what
# connect_same relies on), combinations yields each unordered pair once.
aa = [2, 9]
bb = list(itertools.permutations(aa, 2))
print(bb)
cc = list(itertools.combinations(aa, 2))
print(cc)

[(2, 9), (9, 2)]
[(2, 9)]


In [1]:
import spacy

In [4]:
# Load spaCy's 'en_core_web_sm' pipeline (the model package must be installed).
nlp = spacy.load('en_core_web_sm')

In [10]:
# Scratch: print each token with its tag, dependency label and head token.
doc = nlp( "I love natural language processing technology!" )
for token in doc:
    # token(tag) <-- dependency -- head(tag)
    print('{0}({1}) <-- {2} -- {3}({4})'.format(token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
    print(token.text, token.head.text)

I(PRP) <-- nsubj -- love(VBP)
I love
love(VBP) <-- ROOT -- love(VBP)
love love
natural(JJ) <-- amod -- language(NN)
natural language
language(NN) <-- compound -- technology(NN)
language technology
processing(NN) <-- compound -- technology(NN)
processing technology
technology(NN) <-- dobj -- love(VBP)
technology love
!(.) <-- punct -- love(VBP)
! love


In [56]:
import torch
import torch.nn.functional as F
from torch import nn

In [48]:
# Scratch tensor of shape (3, 2, 1) used by the probes below.
a = torch.Tensor([[[1],[8]],[[1],[9]],[[1],[10]]]) 

In [49]:
# Display the tensor.
a

tensor([[[ 1.],
         [ 8.]],

        [[ 1.],
         [ 9.]],

        [[ 1.],
         [10.]]])

In [50]:
# Display the tensor's shape.
a.shape

torch.Size([3, 2, 1])

In [38]:
# Merge the last two axes into one: (d0, d1, d2) -> (d0, d1 * d2).
# NOTE(review): the recorded 3x8 output cannot come from the (3, 2, 1) tensor
# defined above; this cell ran earlier (In [38]) against a previous `a` — re-run.
torch.reshape(a, [a.shape[0], a.shape[1]*a.shape[-1]])

tensor([[ 1.,  2.,  3.,  2.,  2.,  3.,  4.,  8.],
        [ 1.,  2.,  3.,  2.,  2.,  3.,  4.,  9.],
        [ 1.,  2.,  3.,  2.,  2.,  3.,  4., 10.]])

In [46]:
# First item along axis 0.
# NOTE(review): the recorded 2x4 output is stale as well (In [46], earlier `a`).
a[0,:,:]

tensor([[1., 2., 3., 2.],
        [2., 3., 4., 8.]])

In [52]:
# Softmax across axis 1, so each pair of values along that axis sums to 1.
F.softmax(a, dim=1)

tensor([[[9.1105e-04],
         [9.9909e-01]],

        [[3.3535e-04],
         [9.9966e-01]],

        [[1.2339e-04],
         [9.9988e-01]]])

In [53]:
# Scratch: 1-element tensor of ones.
torch.ones(1)

tensor([1.])

In [54]:
# Scratch: 10-element tensor of ones.
torch.ones(10)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [57]:
# Wrapping a tensor in nn.Parameter marks it requires_grad=True (see output).
nn.Parameter(torch.ones(10))

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)

In [59]:
# Element-wise product with a parameter of ones; the result carries a grad_fn.
torch.mul(torch.Tensor([1,4,3,4,5]), nn.Parameter(torch.ones(5)))

tensor([1., 4., 3., 4., 5.], grad_fn=<MulBackward0>)

In [22]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
import joblib
import numpy as np

# Dataset name, output directory and raw-corpus directory
dataset = "ohsumed"
save_path = 'temp_v1'
corpus_path = 'corpus_v1'

# param
# English stop-word set from the NLTK stopwords corpus
stop_words = set(stopwords.words('english'))
# with open(f"stop_words.txt", "r", encoding="latin1") as f:
#     stop_words = set(f.read().strip().split("\n"))

# Minimum corpus frequency for a word to enter the vocabulary
least_freq = 5
# For "mr" and any dataset whose name contains "SST", keep every word:
# no stop-word removal and no frequency cut-off.
if dataset == "mr" or "SST" in dataset:
    stop_words = set()
    least_freq = 0


# func load texts & labels
def load_dataset(dataset):
    """Read the raw documents and their labels for ``dataset``.

    Both files live under the module-level ``corpus_path`` and hold one entry
    per line.  The texts file is read as bytes and decoded as UTF-8 with
    undecodable bytes dropped.

    Args:
        dataset: dataset name used as the file-name prefix.

    Returns:
        Tuple ``(texts, labels)`` of string lists.
    """
    texts_file = f"{corpus_path}/{dataset}.texts.txt"
    labels_file = f"{corpus_path}/{dataset}.labels.txt"

    with open(texts_file, "rb") as f:
        raw = f.read()
    texts = raw.decode('utf-8', 'ignore').strip().split("\n")

    with open(labels_file, "r") as f:
        labels = f.read().strip().split("\n")

    return texts, labels


def filter_text_old(text: str):
    """Lowercase ``text``, expand common contractions and space out punctuation.

    Characters other than letters, digits and ``( ) , ! ? ' ``` become spaces,
    contractions followed by a space are expanded ("'ll " -> " will ",
    "n't " -> " not ", ...), and ``, ! ( ) ?`` are surrounded by spaces.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    cleaned = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", text.lower())
    # Applied strictly in this order: the specific " can't " / " ain't " rules
    # must run before the generic "n't " rule.
    substitutions = (
        ("'ll ", " will "),
        ("'d ", " would "),
        ("'m ", " am "),
        ("'s ", " is "),
        ("'re ", " are "),
        ("'ve ", " have "),
        (" can't ", " can not "),
        (" ain't ", " are not "),
        ("n't ", " not "),
        (",", " , "),
        ("!", " ! "),
        ("(", " ( "),
        (")", " ) "),
        ("?", " ? "),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    return " ".join(cleaned.split())

# def filter_text(string):
#     """
#     Tokenization/string cleaning for all datasets except for SST.
#     Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
#     """
#     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
#     string = re.sub(r"\'s", " \'s", string)
#     string = re.sub(r"\'ve", " \'ve", string)
#     string = re.sub(r"n\'t", " n\'t", string)
#     string = re.sub(r"\'re", " \'re", string)
#     string = re.sub(r"\'d", " \'d", string)
#     string = re.sub(r"\'ll", " \'ll", string)
#     string = re.sub(r",", " , ", string)
#     string = re.sub(r"!", " ! ", string)
#     string = re.sub(r"\(", " \( ", string)
#     string = re.sub(r"\)", " \) ", string)
#     string = re.sub(r"\?", " \? ", string)
#     string = re.sub(r"\s{2,}", " ", string)
#     return string.strip().lower()


def filter_text(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps:
      1. replace every character other than letters, digits and ( ) , ! ? ' `
         with a space, then lowercase;
      2. split the clitics 's 've 'm 're 'd 'll off the preceding word;
      3. expand "can't" / "ain't" / "...n't" into "can not" / "are not" / "... not";
      4. surround , ! ( ) ? with spaces and collapse runs of whitespace.

    Lowercasing happens before the contraction rules and the text is padded
    with one space on each side while they run, so capitalised contractions
    ("Can't") and contractions at the very start or end of the text are
    handled the same way as lowercase, mid-sentence ones.

    Returns:
        The cleaned, lowercased text with no leading/trailing whitespace.
    """
    # Strip disallowed characters first so .lower() only ever sees ASCII letters.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string).lower()
    # Boundary padding: the literal patterns below need surrounding spaces.
    string = f" {string} "
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Specific negations must be expanded before the generic "n't " rule.
    string = string.replace(" can't ", " can not ")
    string = string.replace(" ain't ", " are not ")
    string = string.replace("n't ", " not ")
    string = string.replace(",", " , ")
    string = string.replace("!", " ! ")
    string = string.replace("(", " ( ")
    string = string.replace(")", " ) ")
    string = string.replace("?", " ? ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()


def pos_text(text: str):
    """POS-tag the whitespace-separated tokens of ``text``.

    The tokens are taken from ``text.split()`` rather than re-tokenized, so
    the i-th returned pair always belongs to the i-th whitespace token.
    ``words_pos_list`` indexes the tags by that position; a tokenizer that
    splits or merges tokens differently would shift the tags or run out of
    them.

    Args:
        text: cleaned text whose tokens are separated by whitespace.

    Returns:
        List of ``(token, tag)`` tuples, one per whitespace token.
    """
    return nltk.pos_tag(text.split())


def words_pos_list(texts, text_pos, word2index):
    """Keep only in-vocabulary tokens of each text, together with their POS tags.

    Args:
        texts: cleaned texts; tokens are obtained with ``str.split()``.
        text_pos: per-text sequences of ``(token, tag)`` pairs, indexed by
            the token's position in the split text.
        word2index: vocabulary; a token is kept when it is a key of this mapping.

    Returns:
        Tuple ``(words_list, pos_list)``: for each text, the kept tokens and
        the tags found at the same positions.
    """
    words_list, pos_list = [], []
    for sentence, tagged in zip(texts, text_pos):
        kept = [
            (token, tagged[position][1])
            for position, token in enumerate(sentence.split())
            if token in word2index
        ]
        words_list.append([token for token, _ in kept])
        pos_list.append([tag for _, tag in kept])
    return words_list, pos_list


if __name__ == '__main__':
    texts, labels = load_dataset(dataset)

    # Clean every document, then POS-tag the cleaned text.
    texts_clean = [filter_text(t) for t in texts]
    text_pos = [pos_text(t) for t in texts_clean]

    # Vocabulary: words seen at least `least_freq` times that are not stop
    # words, indexed in first-occurrence order.
    word2count = Counter([w for t in texts_clean for w in t.split()])
    word_count = [[w, c] for w, c in word2count.items() if c >= least_freq and w not in stop_words]
    word2index = {w: i for i, (w, c) in enumerate(word_count)}

    # Drop out-of-vocabulary tokens, keeping the POS tag of every kept token.
    words_list, pos_list = words_pos_list(texts_clean, text_pos, word2index)

    # POS-tag vocabulary over the kept tokens, indexed in first-occurrence order.
    pos2count = Counter([w for t in pos_list for w in t])
    pos_count = [[w, c] for w, c in pos2count.items()]
    pos2index = {w: i for i, (w, c) in enumerate(pos_count)}

    texts_remove = [" ".join(ws) for ws in words_list]

    # labels 2 targets
    # Sort the distinct labels: iterating a bare set of strings follows hash
    # order, which changes between interpreter runs, so label ids would not be
    # reproducible.
    label2index = {l: i for i, l in enumerate(sorted(set(labels)))}
    targets = [label2index[l] for l in labels]

    # save
    with open(f"{save_path}/{dataset}.texts.clean.txt", "w") as f:
        f.write("\n".join(texts_clean))

    with open(f"{save_path}/{dataset}.texts.remove.txt", "w") as f:
        f.write("\n".join(texts_remove))

    np.save(f"{save_path}/{dataset}.targets.npy", targets)
    joblib.dump(word2index, f"{save_path}/{dataset}.word2index.pkl")
    joblib.dump(pos2index, f"{save_path}/{dataset}.pos2index.pkl")
    joblib.dump(pos_list, f"{save_path}/{dataset}.texts.pos.pkl")

    print('done')


done
