In [1]:
import numpy as np
from tqdm.notebook import tqdm_notebook as tqdm
from time import time

import pyconll
from scripts.TreeKernel import tree, tree_kernels

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [8]:

def to_prolog(tree: "pyconll.tree.tree.Tree") -> str:
    """Serialize the shape of a tree as a Prolog-style term.

    Node contents are discarded: every node is written as the anonymous
    label ``_``. A leaf becomes ``_`` and a node with children becomes
    ``_(child, child, ...)`` with the children serialized recursively in
    iteration order.

    Args:
        tree: A tree node that yields its child nodes when iterated
            (pyconll's ``Tree`` does this). Note that inside this function
            the parameter shadows the imported ``tree`` module.

    Returns:
        The Prolog-style string describing the structure rooted at ``tree``.
    """
    # Iterate the node itself rather than reading its private `_children`
    # list, so the function depends only on the tree's public protocol.
    children_repr = ', '.join(to_prolog(child) for child in tree)
    # Every child contributes at least "_", so an empty join means a leaf.
    return f'_({children_repr})' if children_repr else '_'


def calc_kernel_matrix(data1, data2, kernel: "tree_kernels.KernelST", symmetric: bool = False):
    """Evaluate ``kernel`` on pairs of items and collect the values in a matrix.

    Args:
        data1: Indexable, sized collection providing the row items.
        data2: Indexable, sized collection providing the column items.
        kernel: Object exposing ``kernel(a, b)`` that returns a number.
        symmetric: Set to True only when ``data1`` and ``data2`` hold the
            same items in the same order and ``kernel(a, b) == kernel(b, a)``.
            Then only the pairs with ``j >= i`` are evaluated and each value
            is mirrored across the diagonal, which roughly halves the number
            of kernel evaluations. The default (False) evaluates every pair.

    Returns:
        A ``(len(data1), len(data2))`` float array whose ``[i, j]`` entry is
        ``kernel.kernel(data1[i], data2[j])``.

    Raises:
        ValueError: If ``symmetric`` is True but the two collections differ
            in length.
    """
    n1, n2 = len(data1), len(data2)
    matrix = np.zeros((n1, n2))

    if symmetric:
        if n1 != n2:
            raise ValueError(
                f"symmetric=True requires inputs of equal length, got {n1} and {n2}"
            )
        for i in range(n1):
            for j in range(i, n2):
                value = kernel.kernel(data1[i], data2[j])
                matrix[i, j] = value
                matrix[j, i] = value  # mirror instead of recomputing
        return matrix

    for i in range(n1):
        for j in range(n2):
            matrix[i, j] = kernel.kernel(data1[i], data2[j])
    return matrix


In [3]:
# Seed handed to train_test_split below so the train/test split is reproducible.
random_state = 50

In [4]:
# Load both corpus files into one pyconll collection, recording how many
# sentences each file contributes so per-sentence labels can be derived.
CoNLL = pyconll.load_from_file("corpora/English/English-EWT.conllu")
count_en = len(CoNLL)
CoNLL += pyconll.load_from_file("corpora/English/English-Atis.conllu")
# NOTE(review): the `_ja` suffix suggests a Japanese corpus, but the file
# loaded above is English-Atis -- confirm whether the name or the path is wrong.
count_ja = len(CoNLL) - count_en

# Label 0 for sentences from the first file, 1 for sentences from the second.
labels = [0] * count_en + [1] * count_ja

# One dependency tree per sentence, in corpus order.
trees = [sentence.to_tree() for sentence in CoNLL]

# Re-encode each tree for the tree-kernel library by going through the
# Prolog-style string produced by to_prolog.
data = [
    tree.Tree(tree.TreeNode.fromPrologString(to_prolog(parsed)))
    for parsed in trees
]


In [6]:
# Hold out 20% of the trees (with their labels) for testing; the fixed seed
# keeps the split reproducible.
split = train_test_split(data, labels, test_size=0.20, random_state=random_state)
train_data, test_data, train_labels, test_labels = split

# Create the kernel used to compare subtrees.
lambda_value = 0.5
kernel = tree_kernels.KernelST(lambda_value)


In [9]:
# Take the first 100 training trees once and compare that subset with itself.
train_subset = train_data[:100]
train_kernel_matrix = calc_kernel_matrix(train_subset, train_subset, kernel)

In [10]:
# Show the kernel matrix computed for the training subset.
train_kernel_matrix

array([[ 84.91412359, 100.8125    ,  97.25      , ...,  22.25      ,
         16.25      ,  27.75      ],
       [100.8125    , 125.46319962, 123.375     , ...,  31.375     ,
         24.875     ,  40.875     ],
       [ 97.25      , 123.375     , 122.67604268, ...,  32.625     ,
         26.375     ,  42.875     ],
       ...,
       [ 22.25      ,  31.375     ,  32.625     , ...,  10.63330078,
          9.375     ,  14.875     ],
       [ 16.25      ,  24.875     ,  26.375     , ...,   9.375     ,
          8.88476562,  13.875     ],
       [ 27.75      ,  40.875     ,  42.875     , ...,  14.875     ,
         13.875     ,  22.37530518]])

In [11]:
# Transpose of an independent copy, kept to check the kernel matrix for symmetry.
matrix_T = np.transpose(train_kernel_matrix.copy())

In [12]:
# Show the transposed copy for visual comparison with the original matrix.
matrix_T

array([[ 84.91412359, 100.8125    ,  97.25      , ...,  22.25      ,
         16.25      ,  27.75      ],
       [100.8125    , 125.46319962, 123.375     , ...,  31.375     ,
         24.875     ,  40.875     ],
       [ 97.25      , 123.375     , 122.67604268, ...,  32.625     ,
         26.375     ,  42.875     ],
       ...,
       [ 22.25      ,  31.375     ,  32.625     , ...,  10.63330078,
          9.375     ,  14.875     ],
       [ 16.25      ,  24.875     ,  26.375     , ...,   9.375     ,
          8.88476562,  13.875     ],
       [ 27.75      ,  40.875     ,  42.875     , ...,  14.875     ,
         13.875     ,  22.37530518]])

In [13]:
# Element-wise comparison of the kernel matrix with its transpose.
train_kernel_matrix == matrix_T

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [14]:
# Collapse the comparison to one boolean: True only if the kernel matrix
# equals its transpose exactly, i.e. it is symmetric.
np.array_equal(train_kernel_matrix, matrix_T)

True

In [20]:
# Fill only the upper triangle (j >= i) for the first n training trees;
# entries below the diagonal are left at 0.
n = 5
matrix = np.zeros((n, n))
for i in range(n):
    left = train_data[i]
    for j in range(i, n):
        matrix[i, j] = kernel.kernel(left, train_data[j])

In [21]:
# Show the partially filled matrix: only entries with j >= i were computed.
matrix

array([[ 84.91412359, 100.8125    ,  97.25      ,  44.        ,
         26.5       ],
       [  0.        , 125.46319962, 123.375     ,  53.        ,
         30.5       ],
       [  0.        ,   0.        , 122.67604268,  51.5       ,
         29.        ],
       [  0.        ,   0.        ,   0.        ,  23.50793457,
         13.5       ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          8.515625  ]])

In [22]:
# Mirror the upper triangle into the lower one. Adding the transpose counts
# the diagonal twice, so it is subtracted once.
matrix + matrix.T - np.diag(np.diag(matrix))

array([[ 84.91412359, 100.8125    ,  97.25      ,  44.        ,
         26.5       ],
       [100.8125    , 125.46319962, 123.375     ,  53.        ,
         30.5       ],
       [ 97.25      , 123.375     , 122.67604268,  51.5       ,
         29.        ],
       [ 44.        ,  53.        ,  51.5       ,  23.50793457,
         13.5       ],
       [ 26.5       ,  30.5       ,  29.        ,  13.5       ,
          8.515625  ]])