### ライブラリのインポート

In [2]:
import torch
import numpy as np

import ot

from scripts import w_pq_batch as w_pq
from scripts import trees, pq_gram, func

from pqgrams.PQGram import Profile
import pyconll

from tqdm.notebook import tqdm_notebook as tqdm
import random

##### EWT,Atis間のWasserstein距離

In [3]:
train_tensors_path = "data/train_tensors_en_corpora_En_EWT_Atis_unlabel_50.pt"
train_labels_path = "data/train_labels_en_corpora_En_EWT_Atis_unlabel_50.pt"
train_indexes_path = "data/train_indexes_en_corpora_En_EWT_Atis_unlabel_50.pt"

valid_tensors_path = "data/valid_tensors_en_corpora_En_EWT_Atis_unlabel_50.pt"
valid_labels_path = "data/valid_labels_en_corpora_En_EWT_Atis_unlabel_50.pt"
valid_indexes_path = "data/valid_indexes_en_corpora_En_EWT_Atis_unlabel_50.pt"

test_tensors_path = "data/test_tensors_en_corpora_En_EWT_Atis_unlabel_50.pt"
test_labels_path = "data/test_labels_en_corpora_En_EWT_Atis_unlabel_50.pt"
test_indexes_path = "data/test_indexes_en_corpora_En_EWT_Atis_unlabel_50.pt"

model_path = "models/model_en_corpora_EWT_Atis_unlabel_50.pth"

CoNLLU_EWT_PATH = "corpora/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_EWT_PATH)
EWT_tree_count = len(CoNLLU)


In [8]:
train_tensors = torch.load(train_tensors_path)

In [10]:
distance_function = w_pq.WeightedPqgramDistance(train_tensors[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights
weights

Parameter containing:
tensor([-2.0425, -2.2598, -1.7006, -2.2871, -2.0425,  0.3741, -1.7006, -1.7006],
       requires_grad=True)

In [7]:
a = [0.3, 0.7]  # ソース分布（サイズ2）
b = [0.4, 0.4, 0.2]  # ターゲット分布（サイズ3）
M = [[0.0, 1.0, 2.0],  # 2x3 のコスト行列
     [1.0, 0.5, 0.5]]

T = ot.emd2(a, b, M)
print("OTマトリックス:", T)

OTマトリックス: 0.39999999999999997


In [3]:
CoNLLU_GPT_PATH = "corpora/English-chatGPT.conllu"

In [4]:
CoNLLU += pyconll.load_from_file(CoNLLU_GPT_PATH)


PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index, desc="[convert tensor]")]


[convert tensor]: 100%|██████████| 30015/30015 [00:00<00:00, 41490.56it/s]


In [5]:
len(J)

8

In [6]:
def wasserstein_distance_weighted(data1, data2, weights):
    # 重みを適用
    weights = func.softplus(weights)
    weighted_data1 = [t*weights for t in data1]  # 各サンプルに重みを適用
    weighted_data2 = [t*weights for t in data2]

    distances = []
    for dim in range(weighted_data1[0].size(0)):  # 8次元でループ
        # dim次元の要素を全て取得して連結
        x_dim = torch.cat([t[dim].unsqueeze(0) for t in weighted_data1])
        y_dim = torch.cat([t[dim].unsqueeze(0) for t in weighted_data2])

        # 次元ごとに要素をソート
        x_sorted = torch.sort(x_dim)[0]
        y_sorted = torch.sort(y_dim)[0]

        # 累積分布関数 (CDF) を計算
        cdf_x = torch.cumsum(torch.ones_like(x_sorted) / len(x_sorted), dim=0)
        cdf_y = torch.cumsum(torch.ones_like(y_sorted) / len(y_sorted), dim=0)

        # 各次元のWasserstein距離を計算
        distance = torch.mean(torch.abs(cdf_x - cdf_y))
        distances.append(distance)

    print(distances)
    # 各次元の距離の平均を返す
    return torch.mean(torch.tensor(distances)).item()


In [None]:
distance_function = w_pq.WeightedPqgramDistance(tensors[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights


WeightedPqgramDistance()

In [8]:
weights = distance_function.weights

In [9]:
print(min(EWT_tree_count, len(tensors)-EWT_tree_count))

13394


In [10]:
tensors_EWT = tensors[:EWT_tree_count]
tensors_GPT = tensors[EWT_tree_count:]
sample_size = min(EWT_tree_count, len(tensors)-EWT_tree_count)
if EWT_tree_count<sample_size:
    tensors_GPT = random.sample(tensors_GPT, k=sample_size)
else :
    tensors_EWT = random.sample(tensors_EWT, k=sample_size)

In [11]:
Tensors_EWT = torch.zeros((EWT_tree_count,8))
for i, tensor in enumerate(tensors_EWT):
    Tensors_EWT[i] = tensor

Tensors_GPT = torch.zeros((len(tensors_GPT),8))
for i, tensor in enumerate(tensors_GPT):
    Tensors_GPT[i] = tensor


In [12]:
def distance_matrix_chunked(tensors: np.ndarray, weights: np.ndarray, chunk_size: int):
    """
    データをチャンクに分割して距離行列を計算する関数。
    tensors: 入力データ [N, dim] の配列
    weights: 重み [dim] の配列
    chunk_size: 一度に処理するデータのチャンクサイズ
    """

    num_samples = tensors.shape[0]
    dist_mat = np.zeros((num_samples, num_samples))  # 距離行列の初期化
    
    # チャンクごとに計算
    for i in tqdm(range(0, num_samples, chunk_size)):
        end_i = min(i + chunk_size, num_samples)
        tensor_chunk_i = tensors[i:end_i, np.newaxis]  # [chunk_size, 1, dim]

        for j in range(0, num_samples, chunk_size):
            end_j = min(j + chunk_size, num_samples)
            tensor_chunk_j = tensors[j:end_j, np.newaxis]  # [1, chunk_size, dim]
            
            # 差の計算
            diff = np.abs(tensor_chunk_i - tensor_chunk_j)  # [chunk_size, chunk_size, dim]
            aw = np.log1p(np.exp(weights))  # weightsのSoftplus関数を近似
            aw = aw[np.newaxis, np.newaxis, :]  # [1, 1, dim]
            weighted_diff = diff * aw  # アダマール積
            dist_chunk = weighted_diff.sum(axis=2)  # 距離の計算
            
            # 距離行列に結果を格納
            dist_mat[i:end_i, j:end_j] = dist_chunk

    return dist_mat


In [13]:
array_EWT = Tensors_EWT.numpy()
array_GPT = Tensors_GPT.numpy()

weights_np = weights.detach().numpy()

array_size = len(array_EWT) + len(array_GPT)

array_all = np.zeros((array_size, 8))

print(len(array_EWT))

for i in range(len(array_EWT)):
    array_all[i] = array_EWT[i]

for i in range(len(array_GPT)):
    array_all[i+len(array_GPT)] = array_GPT[i]

16621


In [14]:
def distance_matrix_chunked(tensors: torch.Tensor, weights: torch.Tensor, chunk_size: int):
    """
    データをチャンクに分割して距離行列を計算する関数。
    tensors: 入力データ [N, dim] のテンソル
    weights: 重み [dim] のテンソル
    chunk_size: 一度に処理するデータのチャンクサイズ
    """
    device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

    num_samples = tensors.shape[0]
    dist_mat = torch.zeros((num_samples, num_samples), device=device)  # 距離行列の初期化
    
    # チャンクごとに計算
    for i in range(0, num_samples, chunk_size):
        end_i = min(i + chunk_size, num_samples)
        tensor_chunk_i = tensors[i:end_i].unsqueeze(1)  # [chunk_size, 1, dim]

        for j in range(0, num_samples, chunk_size):
            end_j = min(j + chunk_size, num_samples)
            tensor_chunk_j = tensors[j:end_j].unsqueeze(0)  # [1, chunk_size, dim]
            
            # 差の計算
            diff = torch.abs(tensor_chunk_i - tensor_chunk_j).to(device)  # [chunk_size, chunk_size, dim]
            aw = func.softplus(weights).to(device).unsqueeze(0).unsqueeze(0)  # [1, 1, dim]
            weighted_diff = diff * aw  # アダマール積
            dist_chunk = weighted_diff.sum(dim=2)  # 距離の計算
            
            # 距離行列に結果を格納
            dist_mat[i:end_i, j:end_j] = dist_chunk
        del dist_chunk, end_j, aw, weighted_diff
        torch.cuda.empty_cache()
    
    del tensors, end_i
    torch.cuda.empty_cache()

    return dist_mat

In [15]:
distance_matrix=distance_matrix_chunked(torch.from_numpy(array_all), weights, len(tensors)//512)
torch.cuda.empty_cache()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 3 has a total capacity of 23.65 GiB of which 3.19 MiB is free. Including non-PyTorch memory, this process has 23.64 GiB memory in use. Of the allocated memory 21.73 GiB is allocated by PyTorch, and 1.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [16]:
distance_matrix = np.zeros((array_size,array_size))

for i in tqdm(range(array_size)):
    for j in range(array_size):
        dist = weighted
        distance_matrix[i][j] = 

In [15]:
distance_matrix_chunked(array_all, weights_np, len(array_all)//128)

  0%|          | 0/129 [00:00<?, ?it/s]


ValueError: operands could not be broadcast together with shapes (234,1,8) (63,1,8) 

In [27]:
def compute_cost_matrix(data, distance_func):
    num_points = data.shape[0]
    cost_matrix = np.zeros((num_points, num_points))

    for i in range(num_points):
        for j in range(num_points):
            if i != j:
                cost_matrix[i, j] = distance_func(data[i], data[j])
    
    return cost_matrix

In [None]:
cost_matrix = compute_cost_matrix(data_a, custom_distance)

### Wasserstein距離の計測

#### English-EWT, chatGPT

In [None]:
CoNLLU_EWT_PATH = "corpora/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_EWT_PATH)
EWT_tree_count = len(CoNLLU)

CoNLLU_GPT_PATH = "corpora/English-chatGPT.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_GPT_PATH)
GPT_tree_count = len(CoNLLU) - EWT_tree_count


PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_EWT = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:EWT_tree_count], desc="[convert tensor]")]
tensors_GPT = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[EWT_tree_count:], desc="[convert tensor]")]


model_path = "models/model_en_corpora_EWT_Atis_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_EWT[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights


In [3]:
a = []
for _ in range(EWT_tree_count):
    a.append(1/EWT_tree_count)
b = []
for _ in range(GPT_tree_count):
    b.append(1/GPT_tree_count)

In [4]:
cost_matrix = torch.zeros((EWT_tree_count, GPT_tree_count))

tensors_EWT = torch.stack([t.to("cuda") for t in tensors_EWT])
tensors_GPT = torch.stack([t.to("cuda") for t in tensors_GPT])

for i in tqdm(range(EWT_tree_count)):
    t_ewt = tensors_EWT[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_GPT, t_ewt.repeat(tensors_GPT.size(0), 1))

  0%|          | 0/16621 [00:00<?, ?it/s]

In [7]:
ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000)

1.5900653316080489

### EWT - EWT

In [2]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/English/English-EWT.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count


PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_en_corpora_EWT_EWT_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

0.0


### EWT-ESL

In [2]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/English/English-ESL.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_en_corpora_En_EWT_ESL_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

16621 5124


[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/5124 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

5.99661802692641


### EWT - Atis

In [4]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/English/English-Atis.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_en_corpora_EWT_Atis_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

16621 5432


[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/5432 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

5.267264440750935


### EWT - Ja-BCCWJ

In [7]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/Japanese/Japanese-BCCWJ.conllu"
CoNLLU += random.sample(pyconll.load_from_file(CoNLLU_target_PATH), k=30000)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_en_corpora_EWT_Ja-BCCWJ_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

16621 16621


[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

2.9210620399488905


### En-EWT -- Fr-GSD

In [10]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/French/French-GSD.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_En-EWT_Fr-GSD_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

16621 14450


[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/14450 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

3.909186519127199


  check_result(result_code)


### EWT -- Korean-Kaist

In [11]:
CoNLLU_source_PATH = "corpora/English/English-EWT.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/Korean/Korean-Kaist.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_En-EWT_Ko-Kaist_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])

for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1))

print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

16621 23010


[convert tensor]:   0%|          | 0/16621 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/23010 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/16621 [00:00<?, ?it/s]

2.889577513517188


  check_result(result_code)


### Fr-GSD -- Ja-BCCWJ

In [4]:
CoNLLU_source_PATH = "corpora/Japanese/Japanese-BCCWJ.conllu"
CoNLLU = random.sample(pyconll.load_from_file(CoNLLU_source_PATH), k=30000)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/French/French-GSD.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_Fr-GSD_Ja-BCCWJ_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])


for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1)).to("cpu")


print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

30000 14450


[convert tensor]:   0%|          | 0/30000 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/14450 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/30000 [00:00<?, ?it/s]

1.4892249959879988


  check_result(result_code)


### Fr-GSD -- Korean-kaist

In [3]:
CoNLLU_source_PATH = "corpora/Korean/Korean-Kaist.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/French/French-GSD.conllu"
CoNLLU += pyconll.load_from_file(CoNLLU_target_PATH)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_Fr-GSD_Ko-Kaist_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])


for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1)).to("cpu")


print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

23010 14450


[convert tensor]:   0%|          | 0/23010 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/14450 [00:00<?, ?it/s]

[cost matrix]:   0%|          | 0/23010 [00:00<?, ?it/s]

6.481494471447322


  check_result(result_code)


### Ja-BCCWJ -- Ko-Kaist

In [5]:
CoNLLU_source_PATH = "corpora/Korean/Korean-Kaist.conllu"
CoNLLU = pyconll.load_from_file(CoNLLU_source_PATH)
source_tree_count = len(CoNLLU)

CoNLLU_target_PATH = "corpora/Japanese/Japanese-BCCWJ.conllu"
CoNLLU += random.sample(pyconll.load_from_file(CoNLLU_target_PATH), k=30000)
target_tree_count = len(CoNLLU) - source_tree_count

print(source_tree_count, target_tree_count)

PQ_Trees = [trees.conllTree_to_pqTree_unlabeled(conll.to_tree()) for conll in CoNLLU]
PQ_Index = [Profile(tree, p=2, q=2) for tree in PQ_Trees]

J = set(PQ_Index[0])
for pq_set  in PQ_Index[1:]:
    J = J.union(pq_set)
J = list(J)

tensors_source = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[:source_tree_count], desc="[convert tensor]")]
tensors_target = [pq_gram.pqgram_to_tensor(pqgram, J) for pqgram in tqdm(PQ_Index[source_tree_count:], desc="[convert tensor]")]


model_path = "models/model_Ko-Kaist_Ja-BCCWJ_unlabel_50.pth"
distance_function = w_pq.WeightedPqgramDistance(tensors_source[0].size(), [], [])
distance_function.load_state_dict(torch.load(model_path))
distance_function.eval()
weights = distance_function.weights

a = []
for _ in range(source_tree_count):
    a.append(1/source_tree_count)
b = []
for _ in range(target_tree_count):
    b.append(1/target_tree_count)

cost_matrix = torch.zeros((source_tree_count, target_tree_count))

tensors_source = torch.stack([t.to("cuda:3") for t in tensors_source])
tensors_target = torch.stack([t.to("cuda:3") for t in tensors_target])


for i in tqdm(range(source_tree_count), desc="[cost matrix]"):
    t_source = tensors_source[i].unsqueeze(0)
    cost_matrix[i] = w_pq.weighted_pqgram_distance_batch(weights, tensors_target, t_source.repeat(tensors_target.size(0), 1)).to("cpu")


print(ot.emd2(a, b, cost_matrix.detach().numpy(), numItermax=1000000))

23010 30000


[convert tensor]:   0%|          | 0/23010 [00:00<?, ?it/s]

[convert tensor]:   0%|          | 0/30000 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'models/model_Ko-Kaist_Ja-BCCWJ_unlabel_50.pth'