In [1]:
import torch
from torch import nn, optim
import numpy as np
import time
from pqgrams.tree import Node
from pqgrams.PQGram import Profile
import zss
import scripts.pq_gram, scripts.dist, scripts.visualize, scripts.w_pq
from collections import Counter
import pyconll
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from xml.etree import ElementTree as ET

In [7]:
np.ndarray(3)

array([0. , 0.5, 1. ])

# np.ndarrayとtorch.Tensorによる計算

In [3]:
tensor = torch.randn(2)
vector = tensor.detach().numpy()

print(tensor, vector)

tensor([ 0.3327, -0.3698]) [ 0.3326949 -0.3697978]


In [4]:
print(type(tensor), type(vector))

<class 'torch.Tensor'> <class 'numpy.ndarray'>


In [5]:
def softplus(x:np.ndarray|torch.Tensor):
  if (type(x)==np.ndarray):
    return np.log(1+np.exp(x))
  elif (type(x)==torch.Tensor):
    return torch.log(torch.ones(x.size()[0], dtype=torch.float32).to(x.device)+torch.exp(x))

In [6]:
softplus(vector)

array([0.87326705, 0.52524555], dtype=float32)

In [7]:
def random_vec(dim: int):
  tensor = torch.randn(dim)
  vector = tensor.detach().numpy()
  return tensor, vector

In [8]:
tensors, vectors = [], []
for _ in range(1000):
  tensor, vector = random_vec(2)
  tensors.append(tensor)
  vectors.append(vector)

In [9]:
start = time.time()
for i in range(1000):
  softplus(vectors[i])
end = time.time()
print(end-start)
print()
start = time.time()
for i in range(1000):
  softplus(tensors[i])
end = time.time()
print(end-start)

0.00832819938659668

0.07359433174133301


# 木構造のサンプル

In [10]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# 木構造1
"""
  a
 /|\
a b c
|\
e b 
"""
root1 = Node("a")
root1.addkid(Node("a"))
root1.addkid(Node("b"))
root1.addkid(Node("c"))
root1.children[0].addkid(Node("e"))
root1.children[0].addkid(Node("b"))


# 木構造2
"""
  a
 /|\
a b d
|\
e b
"""
root2 = Node("a")
root2.addkid(Node("a"))
root2.addkid(Node("b"))
root2.addkid(Node("d"))
root2.children[0].addkid(Node("e"))
root2.children[0].addkid(Node("b"))


# PQ-Gram プロファイルの作成
p1 = Profile(root1, p=2, q=3)
p2 = Profile(root2, p=2, q=3)


J = [pqgram for pqgram in p1]
for pqgram in p2:
   if pqgram not in J:
      J.append(pqgram)


## Tensorによる距離計算

In [11]:
tensor1 = scripts.pq_gram.pqgram_to_tensor(p1, J).to(device)
tensor2 = scripts.pq_gram.pqgram_to_tensor(p2, J).to(device)


print(scripts.dist.pqgram_distance_tensor(tensor1, tensor2))


8.0


## np.ndarrayによる距離計算

In [12]:
dimension = len(J)

v1 = np.zeros(dimension, dtype=int)
v2 = np.zeros(dimension, dtype=int)

for pqgram in p1:
   if pqgram in J:
      for i, subtree in enumerate(J):
         if pqgram == subtree:
            v1[i] += 1

for pqgram in p2:
   if pqgram in J:
      for i, subtree in enumerate(J):
         if pqgram == subtree:
            v2[i] += 1

tensor1 = torch.from_numpy(v1.astype(np.float32)).to(device)
tensor2 = torch.from_numpy(v2.astype(np.float32)).to(device)

tensor_min = torch.minimum(tensor1, tensor2)

min12 = np.minimum(v1, v2)

tensor_diff = tensor1 + tensor2 - 2*tensor_min


print(((torch.ones(dimension, dtype=torch.float32).to(device=device)@tensor_diff)).to("cpu").detach().numpy())


8.0


In [13]:
Counter(p1)

Counter({('a', 'b', '*', '*', '*'): 2,
         ('*', 'a', '*', '*', 'a'): 1,
         ('*', 'a', '*', 'a', 'b'): 1,
         ('*', 'a', 'a', 'b', 'c'): 1,
         ('*', 'a', 'b', 'c', '*'): 1,
         ('*', 'a', 'c', '*', '*'): 1,
         ('a', 'a', '*', '*', 'e'): 1,
         ('a', 'a', '*', 'e', 'b'): 1,
         ('a', 'a', 'b', '*', '*'): 1,
         ('a', 'a', 'e', 'b', '*'): 1,
         ('a', 'c', '*', '*', '*'): 1,
         ('a', 'e', '*', '*', '*'): 1})

In [14]:
def pqgram_to_tensor(pqgrams: Profile, J: list) -> torch.Tensor:
    dim = len(J)
    vec = torch.zeros(dim, dtype=torch.float32)
    pqgrams_set = list(set(pqgrams))
    counts = Counter(pqgrams)
    for pqgram in pqgrams_set:
      count = counts[pqgram]
      idx = J.index(pqgram)
      vec[idx] = count
    return vec

tensor1_ = pqgram_to_tensor(p1, J)
tensor2_ = pqgram_to_tensor(p2, J)
print(scripts.dist.pqgram_distance_tensor(tensor1_, tensor2_))

8.0


In [15]:
import scripts.pq_gram
import scripts.trees


def preprocessing(CORPUS_FILE: list, type_of_labels: list):
    
    CORPUS_LIST = []
    for corpus in CORPUS_FILE:
        CORPUS_LIST.append(corpus.split(".")[0].split("/")[-1])

    CORPUS_i_LENGTH = []

    CoNLL = []
    labels = []
    for i in range(len(CORPUS_FILE)):
        tmp_conll = pyconll.load_from_file(CORPUS_FILE[i])
        CoNLL += tmp_conll
        for _ in range(len(tmp_conll)):
            labels.append(type_of_labels[i])
        if i != 0:
            CORPUS_i_LENGTH.append(len(tmp_conll)+CORPUS_i_LENGTH[i-1])
        else:
            CORPUS_i_LENGTH.append(len(tmp_conll))
    
    num_trees = CORPUS_i_LENGTH[-1]
    pqtrees = [scripts.trees.conllTree_to_pqTree_upos(conll.to_tree()) for conll in CoNLL]
    
    pqIndex = [Profile(tree, p=2, q=2) for tree in pqtrees]

    J = set(pqIndex[0])
    for pq_set in pqIndex[1:]:
        J = J.union(pq_set)
    J = list(J)

    tensors = [scripts.pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in pqIndex]
    
    indexes = torch.Tensor(range(num_trees))

    # 訓練データとテストデータに分割
    train_tensors, test_tensors, train_labels, test_labels, train_indexes, test_indexes = train_test_split(tensors, labels, indexes, test_size=0.4, random_state=42)
    valid_tensors, test_tensors, valid_labels, test_labels, valid_indexes, test_indexes = train_test_split(tensors, labels, indexes, test_size=0.5, random_state=42)

    # データの保存
    torch.save(train_tensors, "data/train_tensors_ja_en_pud.pt")
    torch.save(valid_tensors, "data/valid_tensors_ja_en_pud.pt")
    torch.save(test_tensors, "data/test_tensors_ja_en_pud.pt")

    torch.save(train_labels, "data/train_labels_ja_en_pud.pt")
    torch.save(valid_labels, "data/valid_labels_ja_en_pud.pt")
    torch.save(test_labels, "data/test_labels_ja_en_pud.pt")

    torch.save(train_indexes, "data/train_indexes_ja_en_pud.pt")
    torch.save(valid_indexes, "data/valid_indexes_ja_en_pud.pt")
    torch.save(test_indexes, "data/test_indexes_ja_en_pud.pt")
    



preprocessing(
  ["PUD/Japanese-PUDLUW.conllu", 
    "PUD/English-PUD.conllu"],
    ["ja", "en"]
)

In [16]:
tensors[0]

tensor([-0.4504,  1.4049])

In [17]:
tensors[1]

tensor([ 0.6797, -1.4112])

In [18]:
from scripts import w_pq
dim = tensors[0].size()[0]
ones = torch.ones(dim, dtype=torch.float32)
w_pq.weighted_pqgram_distance(ones, tensors[0], tensors[1])

tensor(5.1823)

In [19]:
print(dim)

2


In [20]:
array1 = tensors[0].detach().numpy()
array2 = tensors[1].detach().numpy()
print(tensors[0], tensors[1])
print(array1, array2)

tensor([-0.4504,  1.4049]) tensor([ 0.6797, -1.4112])
[-0.4504161  1.4048688] [ 0.6796851 -1.4111773]


In [21]:
min12 = np.minimum(array1, array2)
diff = array1+array2-2*min12

def softplus_np(x):
  return np.log(1+np.exp(x))

aw = softplus_np(np.ones(dim))

aw @ diff

5.182324248698053

In [22]:
def weighted_pqgram_distance(weights, tensor1: torch.Tensor, tensor2: torch.Tensor):
    device = tensor1.device
    min12 = torch.minimum(tensor1, tensor2).to(device)
    diff = tensor1+tensor2-2*min12
    aw = scripts.func.softplus(weights).to(device)
    return torch.dot(diff, aw)

In [23]:
class WeightedPqgramDistance(nn.Module):
  
  def __init__(self, dimension):
    super(WeightedPqgramDistance, self).__init__()
    self.weights = nn.Parameter(torch.ones(dimension))
  
  def forward(self, tensor1, tensor2):
    dist = weighted_pqgram_distance(self.weights, tensor1, tensor2)
    return dist
  

class MetricLearingLoss(nn.Module):
  def __init__(self, margin1, margin2, beta):
    super(MetricLearingLoss, self).__init__()
    self.margin1 = margin1
    self.margin2 = margin2
    self.beta = beta
  
  def forward(self, dist_func, positive_pairs, negative_pairs):
    loss = 0.0

    for (tensor1, tensor2) in positive_pairs:
      dist = dist_func(tensor1, tensor2)
      if dist > self.margin1:
        loss += dist - self.margin1
    
    for (tensor1, tensor2) in negative_pairs:
      dist = dist_func(tensor1, tensor2)
      if dist < self.margin2:
        loss += self.margin2 - dist
    
    reg_term = torch.norm(dist_func.weights)**2
    loss += self.beta * reg_term

    return loss


In [24]:
def distance_matrix(tensors: list, weights):
  num_tensor = len(tensors)
  dist_mat = torch.zeros(num_tensor, num_tensor, dtype=torch.float32)
  for i in tqdm(range(num_tensor), desc="[distance matrix]"):
    for j in range(i+1, num_tensor):
      dist = weighted_pqgram_distance(weights, tensors[i], tensors[j])
      dist_mat[i][j] = dist
      dist_mat[i][j] = dist
  return dist_mat


def create_pairs_lmnn(data, labels, weights, k):
  distances = distance_matrix(data, weights)

  positive_pairs = []
  negative_pairs = []

  for i in range(len(data)):

    distances_asc_arg = torch.argsort(distances[i])[1:]

    targets = []
    impostors = []
    label_i = labels[i]
    j = 0
    while len(targets)<=k:
      idx = distances_asc_arg[j]
      if labels[idx] == label_i:
        targets.append(idx)
      else:
        impostors.append(idx)
      j += 1
    
    positive_pairs.extend([(data[i], data[j]) for j in targets])
    negative_pairs.extend([(data[i], data[j]) for j in impostors])

    return positive_pairs, negative_pairs

: 

In [25]:
train_tensors = torch.load("data/train_tensors_ja_en_pud.pt")
train_labels = torch.load("data/train_labels_ja_en_pud.pt")

margin1 = margin2 = 5.0
beta = 1e-4
dim = train_tensors[0].size()[0]

positive, negative = create_pairs_lmnn(train_tensors, train_labels, weights=torch.ones(dim), k=1)

dist_func = WeightedPqgramDistance(dim)
criterion = MetricLearingLoss(margin1, margin2, beta)

optimizer = optim.Adam(dist_func.parameters(), lr=0.01)

num_epoch = 600
loss_list = []

for epoch in tqdm(range(num_epoch), desc="[learning]"):
  optimizer.zero_grad()
  loss = criterion(dist_func, positive, negative)
  loss.backward()
  optimizer.step()
  loss_list.append(loss.detach().numpy())

  if epoch == 0:
        print(f'\nEpoch {epoch+1}, Loss: {loss.item()}')
        best_epoch = epoch+1
        best_loss = loss.item()
      
  if epoch in [49, 99, 149, 199, 249, 299, 349, 399, 449, 499, 549]:
    positive, negative = create_pairs_lmnn(train_tensors, train_labels, dist_func.weights, k=3)
    print(f'\nEpoch {epoch+1}, Loss: {loss.item()}')
  
  if loss.item() < best_loss:
    best_epoch = epoch+1
    best_loss = loss.item()
    torch.save(dist_func.state_dict(), 'models/weighted_distance_model_pud_en_ja.pth')

print(f'\nEpoch:{best_epoch},\nLoss:{best_loss}')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(range(num_epoch),loss_list)
plt.show()


[distance matrix]: 100%|██████████| 1200/1200 [01:00<00:00, 19.71it/s] 
  from .autonotebook import tqdm as notebook_tqdm
[learning]:   7%|▋         | 42/600 [00:00<00:02, 236.85it/s]


Epoch 1, Loss: 66.49757385253906


[distance matrix]: 100%|██████████| 1200/1200 [01:48<00:00, 11.07it/s] 


In [6]:
import torch

train_tensors_path = "data/train_tensors_en_corpora_EWT_ESL.pt"
train_tensors = torch.load(train_tensors_path)
dimension = train_tensors[0].size()[0]

In [7]:
dimension

8363

In [8]:
len('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

26

In [1]:
range(10)

range(0, 10)

In [8]:
l = reversed(list(range(10)))
for i in l:
    print(i)

9
8
7
6
5
4
3
2
1
0


In [2]:
tensor_list = []
for _ in range(4):
    tensor_list.append(torch.rand(4))
tensor_list

[tensor([0.4995, 0.6807, 0.8019, 0.0879]),
 tensor([0.9876, 0.3137, 0.9156, 0.9397]),
 tensor([0.9568, 0.4890, 0.7837, 0.4539]),
 tensor([0.0738, 0.6614, 0.8929, 0.0463])]

In [4]:
tensor_key = torch.rand(4)
tensor_key

tensor([0.0994, 0.9053, 0.0328, 0.8189])

In [10]:
distance_list = []
for tensor_i in tensor_list:
    distance_list.append(float(torch.dot(tensor_key, tensor_i).detach().numpy()))
distance_list = np.array(distance_list)
distance_list

array([0.76416141, 1.18173742, 0.93515301, 0.6732254 ])

In [12]:
np.argmin(distance_list)

3

In [8]:
t1 = torch.rand(2)
t2 = torch.rand(2)
print(t1, t2)

tensor([0.0336, 0.1114]) tensor([0.5541, 0.5390])


In [9]:
t1 > t2

tensor([False, False])

In [10]:
t1 = torch.rand(4).cuda()
t1

tensor([0.6080, 0.0500, 0.9096, 0.3938], device='cuda:0')

In [12]:
3*t1

tensor([1.8240, 0.1499, 2.7288, 1.1815], device='cuda:0')

In [8]:

for i in range(1,1048,1):
    k = (402*i)%1049
    if k==1:
        print(i)
        break

441


In [10]:
p=23
g=2
r=6
m=3
x=4

In [12]:
y = (g**x)%p
y

16

In [14]:
u = (g**r)%p
u

18

In [15]:
v = (m*(y**r))%p 
v

12

In [18]:
w = (u**x)%p 
w

4

In [19]:
for i in range(1,p,1):
    if (w*i)%p == 1:
        w_inv = i
w_inv

6

In [14]:
tree = ET.parse("tmp/test.xml")
tree = tree.getroot()
print(tree)

<Element 'dblp' at 0x7f3414516ca0>


In [17]:
from scripts.trees import xml_to_pqTree

In [21]:
tree_gram = xml_to_pqTree(tree)

In [24]:
from collections import deque
from graphviz import Graph


def tree_visualize(root, FILENAME="tmp_tree"):
    """
    木構造の幅優先探索を利用して木を描画する
    """
    queue = deque([root])
    graph = Graph()
    graph.attr("node", shape="circle")
    node_id = 1
    graph.node(str(node_id), root.label)
    node_id += 1
    parent_id = 0
    while queue:
        node = queue.popleft()
        print(node.label)  # ノードのラベルを出力
        parent_id += 1

        for child in node.children:
            queue.append(child)
            graph.node(str(node_id), child.label)
            graph.edge(str(parent_id), str(node_id))
            node_id += 1
    graph.render()


tree_visualize(tree_gram)

dblp
mastersthesis
author
title
year
school


ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [6]:
from scripts import create_strings

trees = create_strings.generate_binaries(3, 2, 5)
print(trees)

None


In [3]:
from clearnlp.converter import SubprocessConverter

converter = SubprocessConverter()
trees = ["(ROOT (S (NP (DT The) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat))))))"]
converted_trees = converter.convert_trees(trees)


In [5]:
for word in converted_trees:
    print(word)

1	The	the	DT	_	2	det	_	_	_
2	cat	cat	NN	_	3	dep	_	_	_
3	sat	sit	VBD	_	0	root	_	_	_
4	on	on	IN	_	3	prep	_	_	_
5	the	the	DT	_	6	det	_	_	_
6	mat	mat	NN	_	4	pobj	_	_	_


In [10]:
import torch
from torch.nn.functional import normalize

In [23]:
t = torch.rand((4,1)) 
t

tensor([[0.0816],
        [0.0095],
        [0.7857],
        [0.4315]])

In [25]:
min_val = t.min(dim=0).values
min_val

tensor([0.0095])

In [27]:
max_val = t.max(dim=0).values
max_val

tensor([0.7857])

In [29]:
t_normalized = (t-min_val)/(max_val-min_val)
t_normalized

tensor([[0.0929],
        [0.0000],
        [1.0000],
        [0.5436]])

In [30]:
mean = t.mean(dim=0)
std = t.std(dim=0)

(t-mean)/std

tensor([[-0.6876],
        [-0.8895],
        [ 1.2847],
        [ 0.2925]])